All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH -mm] add extra free kbytes tunable
@ 2011-09-01 14:52 ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-01 14:52 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: linux-kernel, linux-mm, lwoodman, Seiji Aguchi, akpm, hughd, hannes

Add a userspace visible knob to tell the VM to keep an extra amount
of memory free, by increasing the gap between each zone's min and
low watermarks.

This is useful for realtime applications that call system
calls and have a bound on the number of allocations that happen
in any short time period.  In this application, extra_free_kbytes
would be left at an amount equal to or larger than the
maximum number of allocations that happen in any burst.

It may also be useful to reduce the memory use of virtual
machines (temporarily?), in a way that does not cause memory
fragmentation like ballooning does.

Signed-off-by: Rik van Riel<riel@redhat.com>

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..01a9acd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..47d185c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5123,6 +5135,7 @@ static void setup_per_zone_lowmem_reserve(void)
 void setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5134,11 +5147,14 @@ void setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->present_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->present_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->present_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5162,11 +5178,13 @@ void setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					low + (min >> 1);
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5264,7 +5282,7 @@ module_init(init_per_zone_wmark_min)
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ *	or extra_free_kbytes changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)


^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH -mm] add extra free kbytes tunable
@ 2011-09-01 14:52 ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-01 14:52 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: linux-kernel, linux-mm, lwoodman, Seiji Aguchi, akpm, hughd, hannes

Add a userspace visible knob to tell the VM to keep an extra amount
of memory free, by increasing the gap between each zone's min and
low watermarks.

This is useful for realtime applications that call system
calls and have a bound on the number of allocations that happen
in any short time period.  In this application, extra_free_kbytes
would be left at an amount equal to or larger than the
maximum number of allocations that happen in any burst.

It may also be useful to reduce the memory use of virtual
machines (temporarily?), in a way that does not cause memory
fragmentation like ballooning does.

Signed-off-by: Rik van Riel<riel@redhat.com>

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..01a9acd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..47d185c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5123,6 +5135,7 @@ static void setup_per_zone_lowmem_reserve(void)
 void setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5134,11 +5147,14 @@ void setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->present_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->present_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->present_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5162,11 +5178,13 @@ void setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					low + (min >> 1);
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5264,7 +5282,7 @@ module_init(init_per_zone_wmark_min)
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ *	or extra_free_kbytes changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH -mm] add extra free kbytes tunable
  2011-09-01 14:52 ` Rik van Riel
@ 2011-09-01 17:06   ` Randy Dunlap
  -1 siblings, 0 replies; 100+ messages in thread
From: Randy Dunlap @ 2011-09-01 17:06 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	akpm, hughd, hannes

On Thu, 1 Sep 2011 10:52:08 -0400 Rik van Riel wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> Signed-off-by: Rik van Riel<riel@redhat.com>

Hi Rik,

Please add to Documentation/sysctl/vm.txt ..

Thanks.


> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 11d65b5..01a9acd 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -96,6 +96,7 @@ extern char core_pattern[];
>  extern unsigned int core_pipe_limit;
>  extern int pid_max;
>  extern int min_free_kbytes;
> +extern int extra_free_kbytes;
>  extern int pid_max_min, pid_max_max;
>  extern int sysctl_drop_caches;
>  extern int percpu_pagelist_fraction;
> @@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  	{
> +		.procname	= "extra_free_kbytes",
> +		.data		= &extra_free_kbytes,
> +		.maxlen		= sizeof(extra_free_kbytes),
> +		.mode		= 0644,
> +		.proc_handler	= min_free_kbytes_sysctl_handler,
> +		.extra1		= &zero,
> +	},
> +	{
>  		.procname	= "percpu_pagelist_fraction",
>  		.data		= &percpu_pagelist_fraction,
>  		.maxlen		= sizeof(percpu_pagelist_fraction),
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6e8ecb6..47d185c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
>  	 "Movable",
>  };
>  
> +/*
> + * Try to keep at least this much lowmem free.  Do not allow normal
> + * allocations below this point, only high priority ones. Automatically
> + * tuned according to the amount of memory in the system.
> + */
>  int min_free_kbytes = 1024;
>  
> +/*
> + * Extra memory for the system to try freeing. Used to temporarily
> + * free memory, to make space for new workloads. Anyone can allocate
> + * down to the min watermarks controlled by min_free_kbytes above.
> + */
> +int extra_free_kbytes = 0;


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -mm] add extra free kbytes tunable
@ 2011-09-01 17:06   ` Randy Dunlap
  0 siblings, 0 replies; 100+ messages in thread
From: Randy Dunlap @ 2011-09-01 17:06 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	akpm, hughd, hannes

On Thu, 1 Sep 2011 10:52:08 -0400 Rik van Riel wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> Signed-off-by: Rik van Riel<riel@redhat.com>

Hi Rik,

Please add to Documentation/sysctl/vm.txt ..

Thanks.


> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 11d65b5..01a9acd 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -96,6 +96,7 @@ extern char core_pattern[];
>  extern unsigned int core_pipe_limit;
>  extern int pid_max;
>  extern int min_free_kbytes;
> +extern int extra_free_kbytes;
>  extern int pid_max_min, pid_max_max;
>  extern int sysctl_drop_caches;
>  extern int percpu_pagelist_fraction;
> @@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  	{
> +		.procname	= "extra_free_kbytes",
> +		.data		= &extra_free_kbytes,
> +		.maxlen		= sizeof(extra_free_kbytes),
> +		.mode		= 0644,
> +		.proc_handler	= min_free_kbytes_sysctl_handler,
> +		.extra1		= &zero,
> +	},
> +	{
>  		.procname	= "percpu_pagelist_fraction",
>  		.data		= &percpu_pagelist_fraction,
>  		.maxlen		= sizeof(percpu_pagelist_fraction),
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6e8ecb6..47d185c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
>  	 "Movable",
>  };
>  
> +/*
> + * Try to keep at least this much lowmem free.  Do not allow normal
> + * allocations below this point, only high priority ones. Automatically
> + * tuned according to the amount of memory in the system.
> + */
>  int min_free_kbytes = 1024;
>  
> +/*
> + * Extra memory for the system to try freeing. Used to temporarily
> + * free memory, to make space for new workloads. Anyone can allocate
> + * down to the min watermarks controlled by min_free_kbytes above.
> + */
> +int extra_free_kbytes = 0;


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 17:06   ` Randy Dunlap
@ 2011-09-01 19:26     ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-01 19:26 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	akpm, hughd, hannes

Add a userspace visible knob to tell the VM to keep an extra amount
of memory free, by increasing the gap between each zone's min and
low watermarks.

This is useful for realtime applications that call system
calls and have a bound on the number of allocations that happen
in any short time period.  In this application, extra_free_kbytes
would be left at an amount equal to or larger than the
maximum number of allocations that happen in any burst.

It may also be useful to reduce the memory use of virtual
machines (temporarily?), in a way that does not cause memory
fragmentation like ballooning does.

Signed-off-by: Rik van Riel<riel@redhat.com>

---
 Documentation/sysctl/vm.txt |   16 ++++++++++++++++
 kernel/sysctl.c             |    9 +++++++++
 mm/page_alloc.c             |   32 +++++++++++++++++++++++++-------
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 96f0ee8..9c11d97 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
@@ -168,6 +169,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
 hugepages_treat_as_movable
 
 This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..01a9acd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..47d185c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5123,6 +5135,7 @@ static void setup_per_zone_lowmem_reserve(void)
 void setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5134,11 +5147,14 @@ void setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->present_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->present_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->present_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5162,11 +5178,13 @@ void setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					low + (min >> 1);
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5264,7 +5282,7 @@ module_init(init_per_zone_wmark_min)
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ *	or extra_free_kbytes changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-01 19:26     ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-01 19:26 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	akpm, hughd, hannes

Add a userspace visible knob to tell the VM to keep an extra amount
of memory free, by increasing the gap between each zone's min and
low watermarks.

This is useful for realtime applications that call system
calls and have a bound on the number of allocations that happen
in any short time period.  In this application, extra_free_kbytes
would be left at an amount equal to or larger than the
maximum number of allocations that happen in any burst.

It may also be useful to reduce the memory use of virtual
machines (temporarily?), in a way that does not cause memory
fragmentation like ballooning does.

Signed-off-by: Rik van Riel<riel@redhat.com>

---
 Documentation/sysctl/vm.txt |   16 ++++++++++++++++
 kernel/sysctl.c             |    9 +++++++++
 mm/page_alloc.c             |   32 +++++++++++++++++++++++++-------
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 96f0ee8..9c11d97 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
@@ -168,6 +169,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
 hugepages_treat_as_movable
 
 This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b5..01a9acd 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
 extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..47d185c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -175,8 +175,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing. Used to temporarily
+ * free memory, to make space for new workloads. Anyone can allocate
+ * down to the min watermarks controlled by min_free_kbytes above.
+ */
+int extra_free_kbytes = 0;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5123,6 +5135,7 @@ static void setup_per_zone_lowmem_reserve(void)
 void setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5134,11 +5147,14 @@ void setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->present_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->present_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->present_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5162,11 +5178,13 @@ void setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					low + (min >> 1);
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5264,7 +5282,7 @@ module_init(init_per_zone_wmark_min)
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ *	or extra_free_kbytes changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 19:26     ` Rik van Riel
@ 2011-09-01 21:58       ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 21:58 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:26:50 -0400
Rik van Riel <riel@redhat.com> wrote:

> Add a userspace visible knob

argh.  Fear and hostility at new knobs which need to be maintained for
ever, even if the underlying implementation changes.

Unfortunately, this one makes sense.

> to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than than the
> maximum number of allocations that happen in any burst.

_is_ it useful?  Proof?

Who is requesting this?  Have they tested it?  Results?

> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.

Maybe.  You need to alter the setting, then somehow persuade all the
targeted kswapd's to start running, then somehow determine that they've
done their thing, then unalter the /proc setting.  Not the best API
we've ever designed ;)

> ...
>  
> +extra_free_kbytes
> +
> +This parameter tells the VM to keep extra free memory between the threshold
> +where background reclaim (kswapd) kicks in, and the threshold where direct
> +reclaim (by allocating processes) kicks in.
> +
> +This is useful for workloads that require low latency memory allocations
> +and have a bounded burstiness in memory allocations, for example a
> +realtime application that receives and transmits network traffic
> +(causing in-kernel memory allocations) with a maximum total message burst
> +size of 200MB may need 200MB of extra free memory to avoid direct reclaim
> +related latencies.
> +
> +==============================================================

It's upsetting that the names min_free_kbytes and extra_free_kbytes
don't map onto the kernel variables (WMARK_MIN, WMARK_LOW, WMARK_HIGH)
and also that they just aren't very communicative.

Oh well, doesn't matter much.

> ...
> +/*
> + * Extra memory for the system to try freeing. Used to temporarily
> + * free memory, to make space for new workloads. Anyone can allocate
> + * down to the min watermarks controlled by min_free_kbytes above.
> + */

The comment isn't really complete, is it?  There are valid use cases
where an alteration here isn't temporary.



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-01 21:58       ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 21:58 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:26:50 -0400
Rik van Riel <riel@redhat.com> wrote:

> Add a userspace visible knob

argh.  Fear and hostility at new knobs which need to be maintained for
ever, even if the underlying implementation changes.

Unfortunately, this one makes sense.

> to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.

_is_ it useful?  Proof?

Who is requesting this?  Have they tested it?  Results?

> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.

Maybe.  You need to alter the setting, then somehow persuade all the
targeted kswapd's to start running, then somehow determine that they've
done their thing, then unalter the /proc setting.  Not the best API
we've ever designed ;)

> ...
>  
> +extra_free_kbytes
> +
> +This parameter tells the VM to keep extra free memory between the threshold
> +where background reclaim (kswapd) kicks in, and the threshold where direct
> +reclaim (by allocating processes) kicks in.
> +
> +This is useful for workloads that require low latency memory allocations
> +and have a bounded burstiness in memory allocations, for example a
> +realtime application that receives and transmits network traffic
> +(causing in-kernel memory allocations) with a maximum total message burst
> +size of 200MB may need 200MB of extra free memory to avoid direct reclaim
> +related latencies.
> +
> +==============================================================

It's upsetting that the names min_free_kbytes and extra_free_kbytes
don't map onto the kernel variables (WMARK_MIN, WMARK_LOW, WMARK_HIGH)
and also that they just aren't very communicative.

Oh well, doesn't matter much.

> ...
> +/*
> + * Extra memory for the system to try freeing. Used to temporarily
> + * free memory, to make space for new workloads. Anyone can allocate
> + * down to the min watermarks controlled by min_free_kbytes above.
> + */

The comment isn't really complete, is it?  There are valid use cases
where an alteration here isn't temporary.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 21:58       ` Andrew Morton
@ 2011-09-01 22:08         ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-09-01 22:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011, Andrew Morton wrote:

> > Add a userspace visible knob
> 
> argh.  Fear and hostility at new knobs which need to be maintained for
> ever, even if the underlying implementation changes.
> 

Do we really need to maintain tunables that lose their purpose either 
because the implementation changes or is patched to fix the issue that the 
tunable was intended to address without requiring it?

Are there really userspace tools written to not be able to handle -ENOENT 
when one of these gets removed?

> > It may also be useful to reduce the memory use of virtual
> > machines (temporarily?), in a way that does not cause memory
> > fragmentation like ballooning does.
> 
> Maybe.  You need to alter the setting, then somehow persuade all the
> targeted kswapd's to start running, then somehow determine that they've
> done their thing, then unalter the /proc setting.  Not the best API
> we've ever designed ;)
> 

And, unfortunately, this could have negative effects if using cpusets 
and/or mempolicies since this is global across all zones such that jobs 
that do not require such "extra" memory would be unfairly penalized with 
work incurred by additional reclaim they don't need.  And if the job is 
constrained to a memory cgroup, there's no guarantee it will reclaim back 
to these altered watermarks.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-01 22:08         ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-09-01 22:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011, Andrew Morton wrote:

> > Add a userspace visible knob
> 
> argh.  Fear and hostility at new knobs which need to be maintained for
> ever, even if the underlying implementation changes.
> 

Do we really need to maintain tunables that lose their purpose either 
because the implementation changes or is patched to fix the issue that the 
tunable was intended to address without requiring it?

Are there really userspace tools written to not be able to handle -ENOENT 
when one of these gets removed?

> > It may also be useful to reduce the memory use of virtual
> > machines (temporarily?), in a way that does not cause memory
> > fragmentation like ballooning does.
> 
> Maybe.  You need to alter the setting, then somehow persuade all the
> targeted kswapd's to start running, then somehow determine that they've
> done their thing, then unalter the /proc setting.  Not the best API
> we've ever designed ;)
> 

And, unfortunately, this could have negative effects if using cpusets 
and/or mempolicies since this is global across all zones such that jobs 
that do not require such "extra" memory would be unfairly penalized with 
work incurred by additional reclaim they don't need.  And if the job is 
constrained to a memory cgroup, there's no guarantee it will reclaim back 
to these altered watermarks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 19:26     ` Rik van Riel
@ 2011-09-01 22:09       ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 22:09 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:26:50 -0400
Rik van Riel <riel@redhat.com> wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -96,6 +96,7 @@ extern char core_pattern[];
>  extern unsigned int core_pipe_limit;
>  extern int pid_max;
>  extern int min_free_kbytes;
> +extern int extra_free_kbytes;

No externs in C, please.  Feel free to fix min_free_kbytes while you're
there ;)

swap.h is a common place to declare these things.  mmzone.h would make
sense too.


>  extern int pid_max_min, pid_max_max;
>  extern int sysctl_drop_caches;
>  extern int percpu_pagelist_fraction;
> @@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  	{
> +		.procname	= "extra_free_kbytes",
> +		.data		= &extra_free_kbytes,
> +		.maxlen		= sizeof(extra_free_kbytes),
> +		.mode		= 0644,
> +		.proc_handler	= min_free_kbytes_sysctl_handler,

Lazy.  The function should be renamed to accurately reflect its role.

> +		.extra1		= &zero,
> +	},
> +	{
>  		.procname	= "percpu_pagelist_fraction",
>  		.data		= &percpu_pagelist_fraction,
>  		.maxlen		= sizeof(percpu_pagelist_fraction),
>
> ...
>
> +int extra_free_kbytes = 0;

I'm inclined to agree with checkpatch here - it's just noise.



^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-01 22:09       ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 22:09 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:26:50 -0400
Rik van Riel <riel@redhat.com> wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -96,6 +96,7 @@ extern char core_pattern[];
>  extern unsigned int core_pipe_limit;
>  extern int pid_max;
>  extern int min_free_kbytes;
> +extern int extra_free_kbytes;

No externs in C, please.  Feel free to fix min_free_kbytes while you're
there ;)

swap.h is a common place to declare these things.  mmzone.h would make
sense too.


>  extern int pid_max_min, pid_max_max;
>  extern int sysctl_drop_caches;
>  extern int percpu_pagelist_fraction;
> @@ -1189,6 +1190,14 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  	{
> +		.procname	= "extra_free_kbytes",
> +		.data		= &extra_free_kbytes,
> +		.maxlen		= sizeof(extra_free_kbytes),
> +		.mode		= 0644,
> +		.proc_handler	= min_free_kbytes_sysctl_handler,

Lazy.  The function should be renamed to accurately reflect its role.

> +		.extra1		= &zero,
> +	},
> +	{
>  		.procname	= "percpu_pagelist_fraction",
>  		.data		= &percpu_pagelist_fraction,
>  		.maxlen		= sizeof(percpu_pagelist_fraction),
>
> ...
>
> +int extra_free_kbytes = 0;

I'm inclined to agree with checkpatch here - it's just noise.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 22:08         ` David Rientjes
@ 2011-09-01 22:16           ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 22:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:08:00 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Thu, 1 Sep 2011, Andrew Morton wrote:
> 
> > > Add a userspace visible knob
> > 
> > argh.  Fear and hostility at new knobs which need to be maintained for
> > ever, even if the underlying implementation changes.
> > 
> 
> Do we really need to maintain tunables that lose their purpose either 
> because the implementation changes or is patched to fix the issue that the 
> tunable was intended to address without requiring it?
> 
> Are there really userspace tools written to not be able to handle -ENOENT 
> when one of these gets removed?

I don't know, and neither does anyone else.  So we need to be cautious.
Like putting a warning printk in there and waiting several years.

And it's not just a matter of handling ENOENT.  The user modified this
tunable for a *reason*.  They were expecting some behaviour change in
the kernel.  If we remove the tunable, we take that behaviour change
away from them.  So by adding this tunable, we constrain future
implementations by requiring those implementations to automatically do
whatever the user was previously doing manually.  And we don't reliably
know *why* each user altered that tunable.  It's a horrid mess.




^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-01 22:16           ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-09-01 22:16 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Thu, 1 Sep 2011 15:08:00 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Thu, 1 Sep 2011, Andrew Morton wrote:
> 
> > > Add a userspace visible knob
> > 
> > argh.  Fear and hostility at new knobs which need to be maintained for
> > ever, even if the underlying implementation changes.
> > 
> 
> Do we really need to maintain tunables that lose their purpose either 
> because the implementation changes or is patched to fix the issue that the 
> tunable was intended to address without requiring it?
> 
> Are there really userspace tools written to not be able to handle -ENOENT 
> when one of these gets removed?

I don't know, and neither does anyone else.  So we need to be cautious.
Like putting a warning printk in there and waiting several years.

And it's not just a matter of handling ENOENT.  The user modified this
tunable for a *reason*.  They were expecting some behaviour change in
the kernel.  If we remove the tunable, we take that behaviour change
away from them.  So by adding this tunable, we constrain future
implementations by requiring those implementations to automatically do
whatever the user was previously doing manually.  And we don't reliably
know *why* each user altered that tunable.  It's a horrid mess.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* [PATCH -mm] fixes & cleanups for "add extra free kbytes tunable"
  2011-09-01 22:09       ` Andrew Morton
@ 2011-09-02 16:26         ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-02 16:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

All the fixes suggested by Andrew Morton.   Not much of a changelog
since the patch should probably be folded into
mm-add-extra-free-kbytes-tunable.patch

Thank you for pointing these out, Andrew.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mmzone.h |    2 +-
 include/linux/swap.h   |    2 ++
 kernel/sysctl.c        |    6 ++----
 mm/page_alloc.c        |   13 +++++++------
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index be1ac8d..7013bab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -772,7 +772,7 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 14d6249..0679ed5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -207,6 +207,8 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 01a9acd..a3a015c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,8 +95,6 @@ extern int suid_dumpable;
 extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
-extern int min_free_kbytes;
-extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1186,7 +1184,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
 	{
@@ -1194,7 +1192,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &extra_free_kbytes,
 		.maxlen		= sizeof(extra_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
 	{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47d185c..14fc9e9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -183,11 +183,12 @@ static char * const zone_names[MAX_NR_ZONES] = {
 int min_free_kbytes = 1024;
 
 /*
- * Extra memory for the system to try freeing. Used to temporarily
- * free memory, to make space for new workloads. Anyone can allocate
- * down to the min watermarks controlled by min_free_kbytes above.
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks.  Useful for workloads that require low latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
  */
-int extra_free_kbytes = 0;
+int extra_free_kbytes;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -5280,11 +5281,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
  *	or extra_free_kbytes changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);


^ permalink raw reply related	[flat|nested] 100+ messages in thread

* [PATCH -mm] fixes & cleanups for "add extra free kbytes tunable"
@ 2011-09-02 16:26         ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-09-02 16:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

All the fixes suggested by Andrew Morton.   Not much of a changelog
since the patch should probably be folded into
mm-add-extra-free-kbytes-tunable.patch

Thank you for pointing these out, Andrew.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 include/linux/mmzone.h |    2 +-
 include/linux/swap.h   |    2 ++
 kernel/sysctl.c        |    6 ++----
 mm/page_alloc.c        |   13 +++++++------
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index be1ac8d..7013bab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -772,7 +772,7 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 14d6249..0679ed5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -207,6 +207,8 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 01a9acd..a3a015c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,8 +95,6 @@ extern int suid_dumpable;
 extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 extern int pid_max;
-extern int min_free_kbytes;
-extern int extra_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
@@ -1186,7 +1184,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
 	{
@@ -1194,7 +1192,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &extra_free_kbytes,
 		.maxlen		= sizeof(extra_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
 	{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47d185c..14fc9e9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -183,11 +183,12 @@ static char * const zone_names[MAX_NR_ZONES] = {
 int min_free_kbytes = 1024;
 
 /*
- * Extra memory for the system to try freeing. Used to temporarily
- * free memory, to make space for new workloads. Anyone can allocate
- * down to the min watermarks controlled by min_free_kbytes above.
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks.  Useful for workloads that require low latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
  */
-int extra_free_kbytes = 0;
+int extra_free_kbytes;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -5280,11 +5281,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
  *	that we can call two helper functions whenever min_free_kbytes
  *	or extra_free_kbytes changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int free_kbytes_sysctl_handler(ctl_table *table, int write, 
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 21:58       ` Andrew Morton
@ 2011-09-02 16:31         ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-09-02 16:31 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On 09/01/2011 05:58 PM, Andrew Morton wrote:
> On Thu, 1 Sep 2011 15:26:50 -0400
> Rik van Riel <riel@redhat.com> wrote:
> 
>> Add a userspace visible knob
> 
> argh.  Fear and hostility at new knobs which need to be maintained for 
> ever, even if the underlying implementation changes.
> 
> Unfortunately, this one makes sense.
> 
>> to tell the VM to keep an extra amount of memory free, by increasing 
>> the gap between each zone's min and low watermarks.
>>
>> This is useful for realtime applications that call system calls and 
>> have a bound on the number of allocations that happen in any short 
>> time period.  In this application, extra_free_kbytes would be left at 
>> an amount equal to or larger than the maximum number of 
>> allocations that happen in any burst.
> 
> _is_ it useful?  Proof?
> 
> Who is requesting this?  Have they tested it?  Results?

This is interesting for me.

Some of our customers have realtime applications and they are concerned
about the fact that Linux uses free memory as pagecache. It means that
when their applications allocate memory, the Linux kernel first tries to
reclaim memory and then allocates it. This may increase memory allocation
latency.

In many cases this is not a big issue because Linux has kswapd for
background reclaim and it is fast enough not to enter direct reclaim
path if there are a lot of clean cache. But under some situations -
e.g. Application allocates a lot of memory which is larger than delta
between watermark_low and watermark_min in a short time and kswapd
can't reclaim fast enough due to dirty page reclaim, direct reclaim
is executed and causes big latency.

We can avoid the issue above by using preallocation and mlock.
But that can't cover kmalloc calls made inside system calls. So I'd like
to use this patch together with mlock to keep memory allocation latency
as low as possible. It may not be a perfect solution, but it is important
for enterprise customers to be able to configure the amount of free
memory at their own risk.

Anyway, now I'm testing this patch and will report a test result later.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-02 16:31         ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-09-02 16:31 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, hughd, hannes

On 09/01/2011 05:58 PM, Andrew Morton wrote:
> On Thu, 1 Sep 2011 15:26:50 -0400
> Rik van Riel <riel@redhat.com> wrote:
> 
>> Add a userspace visible knob
> 
> argh.  Fear and hostility at new knobs which need to be maintained for 
> ever, even if the underlying implementation changes.
> 
> Unfortunately, this one makes sense.
> 
>> to tell the VM to keep an extra amount of memory free, by increasing 
>> the gap between each zone's min and low watermarks.
>>
>> This is useful for realtime applications that call system calls and 
>> have a bound on the number of allocations that happen in any short 
>> time period.  In this application, extra_free_kbytes would be left at 
>> an amount equal to or larger than the maximum number of 
>> allocations that happen in any burst.
> 
> _is_ it useful?  Proof?
> 
> Who is requesting this?  Have they tested it?  Results?

This is interesting for me.

Some of our customers have realtime applications and they are concerned
about the fact that Linux uses free memory as pagecache. It means that
when their applications allocate memory, the Linux kernel first tries to
reclaim memory and then allocates it. This may increase memory allocation
latency.

In many cases this is not a big issue because Linux has kswapd for
background reclaim and it is fast enough not to enter direct reclaim
path if there are a lot of clean cache. But under some situations -
e.g. Application allocates a lot of memory which is larger than delta
between watermark_low and watermark_min in a short time and kswapd
can't reclaim fast enough due to dirty page reclaim, direct reclaim
is executed and causes big latency.

We can avoid the issue above by using preallocation and mlock.
But that can't cover kmalloc calls made inside system calls. So I'd like
to use this patch together with mlock to keep memory allocation latency
as low as possible. It may not be a perfect solution, but it is important
for enterprise customers to be able to configure the amount of free
memory at their own risk.

Anyway, now I'm testing this patch and will report a test result later.

Thanks,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
       [not found]       ` <E1FA588BC672D846BDBB452FCA1E308C2389B4@USINDEVS02.corp.hds.com>
@ 2011-09-15  3:33           ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-09-15  3:33 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel
  Cc: Randy Dunlap, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	hughd, hannes

On 09/02/2011 12:31 PM, Satoru Moriya wrote:
> On 09/01/2011 05:58 PM, Andrew Morton wrote:
>> On Thu, 1 Sep 2011 15:26:50 -0400
>> Rik van Riel <riel@redhat.com> wrote:
> Anyway, now I'm testing this patch and will report a test result later.

Sorry for late reply. Here is my test result.

I ran some sample workloads and measured memory allocation latency
(latency of __alloc_pages_nodemask()).
The test setup is as follows:

 - CPU: 1 socket, 4 core
 - Memory: 4GB

 - Background load:
   $ dd if=/dev/zero of=/tmp/tmp1
   $ dd if=/dev/zero of=/tmp/tmp2
   $ dd if=/dev/zero of=/tmp/tmp3

 - Main load:
   $ mapped-file-stream 1 $((1024 * 1024 * 640))  --(*)

 (*) This is made by Johannes Weiner
     https://lkml.org/lkml/2010/8/30/226

     It allocates/accesses 640 MB of memory in a single burst.

The result is as follows:

                               |         |  extra   |
                               | default |  kbytes  |
--------------------------------------------------------------
min_free_kbytes                |    8113 |   8113   |
extra_free_kbytes              |       0 | 640*1024 | (KB)
--------------------------------------------------------------
worst latency                  | 517.762 |  20.775  | (usec)
--------------------------------------------------------------
vmstat result                  |         |          |
 nr_vmscan_write               |       0 |      0   |
 pgsteal_dma                   |       0 |      0   |
 pgsteal_dma32                 |  143667 | 144882   |
 pgsteal_normal                |   31486 |  27001   |
 pgsteal_movable               |       0 |      0   |
 pgscan_kswapd_dma             |       0 |      0   |
 pgscan_kswapd_dma32           |  138617 | 156351   |
 pgscan_kswapd_normal          |   30593 |  27955   |
 pgscan_kswapd_movable         |       0 |      0   |
 pgscan_direct_dma             |       0 |      0   |
 pgscan_direct_dma32           |    5050 |      0   |
 pgscan_direct_normal          |     896 |      0   |
 pgscan_direct_movable         |       0 |      0   |
 kswapd_steal                  |  169207 | 171883   |
 kswapd_inodesteal             |       0 |      0   |
 kswapd_low_wmark_hit_quickly  |      43 |     45   |
 kswapd_high_wmark_hit_quickly |       1 |      0   |
 allocstall                    |      32 |      0   |


As you can see, in the default case there were 32 direct reclaims (allocstall)
and the worst latency was 517.762 usecs. This value may be larger if
a process sleeps or issues I/O in the direct reclaim path. OTOH,
in the other case, where I set extra_free_kbytes, there were no direct
reclaims and the worst latency was 20.775 usecs.

In this test case, we can avoid direct reclaim and keep a latency low.

Tested-by: Satoru Moriya <satoru.moriya@hds.com>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-15  3:33           ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-09-15  3:33 UTC (permalink / raw)
  To: Andrew Morton, Rik van Riel
  Cc: Randy Dunlap, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	hughd, hannes

On 09/02/2011 12:31 PM, Satoru Moriya wrote:
> On 09/01/2011 05:58 PM, Andrew Morton wrote:
>> On Thu, 1 Sep 2011 15:26:50 -0400
>> Rik van Riel <riel@redhat.com> wrote:
> Anyway, now I'm testing this patch and will report a test result later.

Sorry for late reply. Here is my test result.

I ran some sample workloads and measured memory allocation latency
(latency of __alloc_pages_nodemask()).
The test setup is as follows:

 - CPU: 1 socket, 4 core
 - Memory: 4GB

 - Background load:
   $ dd if=/dev/zero of=/tmp/tmp1
   $ dd if=/dev/zero of=/tmp/tmp2
   $ dd if=/dev/zero of=/tmp/tmp3

 - Main load:
   $ mapped-file-stream 1 $((1024 * 1024 * 640))  --(*)

 (*) This is made by Johannes Weiner
     https://lkml.org/lkml/2010/8/30/226

     It allocates/accesses 640 MB of memory in a burst.

The result is as follows:

                               |         |  extra   |
                               | default |  kbytes  |
--------------------------------------------------------------
min_free_kbytes                |    8113 |   8113   |
extra_free_kbytes              |       0 | 640*1024 | (KB)
--------------------------------------------------------------
worst latency                  | 517.762 |  20.775  | (usec)
--------------------------------------------------------------
vmstat result                  |         |          |
 nr_vmscan_write               |       0 |      0   |
 pgsteal_dma                   |       0 |      0   |
 pgsteal_dma32                 |  143667 | 144882   |
 pgsteal_normal                |   31486 |  27001   |
 pgsteal_movable               |       0 |      0   |
 pgscan_kswapd_dma             |       0 |      0   |
 pgscan_kswapd_dma32           |  138617 | 156351   |
 pgscan_kswapd_normal          |   30593 |  27955   |
 pgscan_kswapd_movable         |       0 |      0   |
 pgscan_direct_dma             |       0 |      0   |
 pgscan_direct_dma32           |    5050 |      0   |
 pgscan_direct_normal          |     896 |      0   |
 pgscan_direct_movable         |       0 |      0   |
 kswapd_steal                  |  169207 | 171883   |
 kswapd_inodesteal             |       0 |      0   |
 kswapd_low_wmark_hit_quickly  |      43 |     45   |
 kswapd_high_wmark_hit_quickly |       1 |      0   |
 allocstall                    |      32 |      0   |


As you can see, in the default case there were 32 direct reclaims (allocstall)
and the worst latency was 517.762 usecs. This value may be larger if
a process sleeps or issues I/O in the direct reclaim path. OTOH,
in the other case where I add extra free kbytes, there was no direct
reclaim and the worst latency was 20.775 usecs.

In this test case, we can avoid direct reclaim and keep latency low.

Tested-by: Satoru Moriya <satoru.moriya@hds.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 19:26     ` Rik van Riel
@ 2011-09-30 21:43       ` Johannes Weiner
  -1 siblings, 0 replies; 100+ messages in thread
From: Johannes Weiner @ 2011-09-30 21:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On Thu, Sep 01, 2011 at 03:26:50PM -0400, Rik van Riel wrote:
> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> Signed-off-by: Rik van Riel<riel@redhat.com>

Acked-by: Johannes Weiner <jweiner@redhat.com>

Btw, I wonder if there should be a waking of the kswapds in
setup_per_zone_wmarks() in general to make sure the new watermarks are
met.  But that applies to min_free_kbytes as well, so not a
requirement for this patch.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-09-30 21:43       ` Johannes Weiner
  0 siblings, 0 replies; 100+ messages in thread
From: Johannes Weiner @ 2011-09-30 21:43 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On Thu, Sep 01, 2011 at 03:26:50PM -0400, Rik van Riel wrote:
> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 
> Signed-off-by: Rik van Riel<riel@redhat.com>

Acked-by: Johannes Weiner <jweiner@redhat.com>

Btw, I wonder if there should be a waking of the kswapds in
setup_per_zone_wmarks() in general to make sure the new watermarks are
met.  But that applies to min_free_kbytes as well, so not a
requirement for this patch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-01 19:26     ` Rik van Riel
@ 2011-10-08  3:08       ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-08  3:08 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On Thu, 1 Sep 2011, Rik van Riel wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 

I know this was merged into -mm, but I still have to disagree with it 
because I think it adds yet another userspace knob that will never be 
obsoleted, will be misinterpreted, and is tied very closely to the 
implementation of page reclaim, both synchronous and asynchronous.  I also 
think that it will cause regressions on other cpu intensive workloads 
that don't require this extra freed memory because it works as a global 
heuristic and is not tied to any specific application.

I think it would be far better to reclaim beyond the high watermark 
if the types of workloads that need this tunable can be somehow detected 
(the worst case scenario is being a prctl() that does synchronous reclaim 
above the watermark so admins can identify these workloads), or be able to 
mark allocations within the kernel as potentially coming in large bursts 
where allocation is problematic.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-08  3:08       ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-08  3:08 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On Thu, 1 Sep 2011, Rik van Riel wrote:

> Add a userspace visible knob to tell the VM to keep an extra amount
> of memory free, by increasing the gap between each zone's min and
> low watermarks.
> 
> This is useful for realtime applications that call system
> calls and have a bound on the number of allocations that happen
> in any short time period.  In this application, extra_free_kbytes
> would be left at an amount equal to or larger than the
> maximum number of allocations that happen in any burst.
> 
> It may also be useful to reduce the memory use of virtual
> machines (temporarily?), in a way that does not cause memory
> fragmentation like ballooning does.
> 

I know this was merged into -mm, but I still have to disagree with it 
because I think it adds yet another userspace knob that will never be 
obsoleted, will be misinterpreted, and is tied very closely to the 
implementation of page reclaim, both synchronous and asynchronous.  I also 
think that it will cause regressions on other cpu intensive workloads 
that don't require this extra freed memory because it works as a global 
heuristic and is not tied to any specific application.

I think it would be far better to reclaim beyond the high watermark 
if the types of workloads that need this tunable can be somehow detected 
(the worst case scenario is being a prctl() that does synchronous reclaim 
above the watermark so admins can identify these workloads), or be able to 
mark allocations within the kernel as potentially coming in large bursts 
where allocation is problematic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-08  3:08       ` David Rientjes
@ 2011-10-10 22:37         ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-10 22:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Fri, 7 Oct 2011 20:08:19 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Thu, 1 Sep 2011, Rik van Riel wrote:
> 
> > Add a userspace visible knob to tell the VM to keep an extra amount
> > of memory free, by increasing the gap between each zone's min and
> > low watermarks.
> > 
> > This is useful for realtime applications that call system
> > calls and have a bound on the number of allocations that happen
> > in any short time period.  In this application, extra_free_kbytes
> > would be left at an amount equal to or larger than the
> > maximum number of allocations that happen in any burst.
> > 
> > It may also be useful to reduce the memory use of virtual
> > machines (temporarily?), in a way that does not cause memory
> > fragmentation like ballooning does.
> > 
> 
> I know this was merged into -mm, but I still have to disagree with it 
> because I think it adds yet another userspace knob that will never be 
> obsoleted, will be misinterpreted, and is tied very closely to the 
> implementation of page reclaim, both synchronous and asynchronous.

Yup.  We should strenuously avoid merging it, for these reasons.

>  I also 
> think that it will cause regressions on other cpu intensive workloads 
> that don't require this extra freed memory because it works as a global 
> heuristic and is not tied to any specific application.
> 
> I think it would be far better to reclaim beyond the high watermark 
> if the types of workloads that need this tunable can be somehow detected 
> (the worst case scenario is being a prctl() that does synchronous reclaim 
> above the watermark so admins can identify these workloads), or be able to 
> mark allocations within the kernel as potentially coming in large bursts 
> where allocation is problematic.

The page allocator already tries harder if the caller has
rt_task(current).  Why is this inadequate?  Can we extend this idea
further to fix whatever-the-problem-is?

Does there exist anything like a test case which demonstrates the need
for this feature?


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-10 22:37         ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-10 22:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Fri, 7 Oct 2011 20:08:19 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Thu, 1 Sep 2011, Rik van Riel wrote:
> 
> > Add a userspace visible knob to tell the VM to keep an extra amount
> > of memory free, by increasing the gap between each zone's min and
> > low watermarks.
> > 
> > This is useful for realtime applications that call system
> > calls and have a bound on the number of allocations that happen
> > in any short time period.  In this application, extra_free_kbytes
> > would be left at an amount equal to or larger than the
> > maximum number of allocations that happen in any burst.
> > 
> > It may also be useful to reduce the memory use of virtual
> > machines (temporarily?), in a way that does not cause memory
> > fragmentation like ballooning does.
> > 
> 
> I know this was merged into -mm, but I still have to disagree with it 
> because I think it adds yet another userspace knob that will never be 
> obsoleted, will be misinterpreted, and is tied very closely to the 
> implementation of page reclaim, both synchronous and asynchronous.

Yup.  We should strenuously avoid merging it, for these reasons.

>  I also 
> think that it will cause regressions on other cpu intensive workloads 
> that don't require this extra freed memory because it works as a global 
> heuristic and is not tied to any specific application.
> 
> I think it would be far better to reclaim beyond the high watermark 
> if the types of workloads that need this tunable can be somehow detected 
> (the worst case scenario is being a prctl() that does synchronous reclaim 
> above the watermark so admins can identify these workloads), or be able to 
> mark allocations within the kernel as potentially coming in large bursts 
> where allocation is problematic.

The page allocator already tries harder if the caller has
rt_task(current).  Why is this inadequate?  Can we extend this idea
further to fix whatever-the-problem-is?

Does there exist anything like a test case which demonstrates the need
for this feature?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-08  3:08       ` David Rientjes
@ 2011-10-11 19:20         ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 19:20 UTC (permalink / raw)
  To: David Rientjes, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On 10/07/2011 11:08 PM, David Rientjes wrote:
> On Thu, 1 Sep 2011, Rik van Riel wrote:
>
> I also
> think that it will cause regressions on other cpu intensive workloads 
> that don't require this extra freed memory because it works as a 
> global heuristic and is not tied to any specific application.

It's yes and no. It may cause regressions on the workloads due to
less amount of available memory. But it may improve the workloads'
performance because they can avoid direct reclaim due to extra
free memory.

Of course if one doesn't need extra free memory, one can turn it
off. I think we can add this feature to cgroup if we want to set
it for any specific process or process group. (Before that we
need to implement min_free_kbytes for cgroup and the implementation
of extra free kbytes strongly depends on it.)

> I think it would be far better to reclaim beyond the high 
> watermark if the types of workloads that need this tunable can be 
> somehow detected (the worst case scenario is being a prctl() that does 
> synchronous reclaim above the watermark so admins can identify these 
> workloads), or be able to mark allocations within the kernel as 
> potentially coming in large bursts where allocation is problematic.

It may work. But isn't it difficult and/or complex to decide
how much memory we should reclaim beyond high watermark
automatically?

I believe that extra free kbytes is far simpler than them.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 19:20         ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 19:20 UTC (permalink / raw)
  To: David Rientjes, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, akpm, hughd, hannes

On 10/07/2011 11:08 PM, David Rientjes wrote:
> On Thu, 1 Sep 2011, Rik van Riel wrote:
>
> I also
> think that it will cause regressions on other cpu intensive workloads 
> that don't require this extra freed memory because it works as a 
> global heuristic and is not tied to any specific application.

It's yes and no. It may cause regressions on the workloads due to
less amount of available memory. But it may improve the workloads'
performance because they can avoid direct reclaim due to extra
free memory.

Of course if one doesn't need extra free memory, one can turn it
off. I think we can add this feature to cgroup if we want to set
it for any specific process or process group. (Before that we
need to implement min_free_kbytes for cgroup and the implementation
of extra free kbytes strongly depends on it.)

> I think it would be far better to reclaim beyond the high 
> watermark if the types of workloads that need this tunable can be 
> somehow detected (the worst case scenario is being a prctl() that does 
> synchronous reclaim above the watermark so admins can identify these 
> workloads), or be able to mark allocations within the kernel as 
> potentially coming in large bursts where allocation is problematic.

It may work. But isn't it difficult and/or complex to decide
how much memory we should reclaim beyond high watermark
automatically?

I believe that extra free kbytes is far simpler than them.

Regards,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-10 22:37         ` Andrew Morton
@ 2011-10-11 19:32           ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 19:32 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/10/2011 06:37 PM, Andrew Morton wrote:
> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> <rientjes@google.com> wrote:
> 
>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>
> The page allocator already tries harder if the caller has 
> rt_task(current).  Why is this inadequate?  Can we extend this idea 
> further to fix whatever-the-problem-is?

Actually page allocator decreases min watermark to 3/4 * min watermark
for rt-task. But in our case some applications create a lot of
processes and if all of them are rt-task, the amount of watermark
bonus(1/4 * min watermark) is not enough.

If we can tune the amount of bonus, it may be fine. But that is
almost the same as extra free kbytes.

> Does there exist anything like a test case which demonstrates the need 
> for this feature?

Unfortunately I don't have a real test case but just simple one.
And in my simple test case, I can avoid direct reclaim if we set
workload as rt-task.

The simple test case I used is following:
http://marc.info/?l=linux-mm&m=131605773321672&w=2

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 19:32           ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 19:32 UTC (permalink / raw)
  To: Andrew Morton, David Rientjes
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/10/2011 06:37 PM, Andrew Morton wrote:
> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> <rientjes@google.com> wrote:
> 
>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>
> The page allocator already tries harder if the caller has 
> rt_task(current).  Why is this inadequate?  Can we extend this idea 
> further to fix whatever-the-problem-is?

Actually page allocator decreases min watermark to 3/4 * min watermark
for rt-task. But in our case some applications create a lot of
processes and if all of them are rt-task, the amount of watermark
bonus(1/4 * min watermark) is not enough.

If we can tune the amount of bonus, it may be fine. But that is
almost the same as extra free kbytes.

> Does there exist anything like a test case which demonstrates the need 
> for this feature?

Unfortunately I don't have a real test case but just simple one.
And in my simple test case, I can avoid direct reclaim if we set
workload as rt-task.

The simple test case I used is following:
http://marc.info/?l=linux-mm&m=131605773321672&w=2

Regards,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 19:32           ` Satoru Moriya
@ 2011-10-11 19:54             ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-11 19:54 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Tue, 11 Oct 2011 15:32:11 -0400
Satoru Moriya <satoru.moriya@hds.com> wrote:

> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> > On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> > <rientjes@google.com> wrote:
> > 
> >> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >
> > The page allocator already tries harder if the caller has 
> > rt_task(current).  Why is this inadequate?  Can we extend this idea 
> > further to fix whatever-the-problem-is?
> 
> Actually page allocator decreases min watermark to 3/4 * min watermark
> for rt-task. But in our case some applications create a lot of
> processes and if all of them are rt-task, the amount of watermark
> bonus(1/4 * min watermark) is not enough.
> 
> If we can tune the amount of bonus, it may be fine. But that is
> almost all same as extra free kbytes.

This situation is detectable at runtime.  If realtime tasks are being
stalled in the page allocator then start to increase the free-page
reserves.  A little control system.


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 19:54             ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-11 19:54 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Tue, 11 Oct 2011 15:32:11 -0400
Satoru Moriya <satoru.moriya@hds.com> wrote:

> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> > On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> > <rientjes@google.com> wrote:
> > 
> >> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >
> > The page allocator already tries harder if the caller has 
> > rt_task(current).  Why is this inadequate?  Can we extend this idea 
> > further to fix whatever-the-problem-is?
> 
> Actually page allocator decreases min watermark to 3/4 * min watermark
> for rt-task. But in our case some applications create a lot of
> processes and if all of them are rt-task, the amount of watermark
> bonus(1/4 * min watermark) is not enough.
> 
> If we can tune the amount of bonus, it may be fine. But that is
> almost all same as extra free kbytes.

This situation is detectable at runtime.  If realtime tasks are being
stalled in the page allocator then start to increase the free-page
reserves.  A little control system.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 19:54             ` Andrew Morton
@ 2011-10-11 20:23               ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 20:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 03:55 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 15:32:11 -0400
> Satoru Moriya <satoru.moriya@hds.com> wrote:
> 
>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
>>> <rientjes@google.com> wrote:
>>>
>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>>
>> Actually page allocator decreases min watermark to 3/4 * min 
>> watermark for rt-task. But in our case some applications create a lot 
>> of processes and if all of them are rt-task, the amount of watermark
>> bonus(1/4 * min watermark) is not enough.
>>
>> If we can tune the amount of bonus, it may be fine. But that is 
>> almost all same as extra free kbytes.
> 
> This situation is detectable at runtime.  If realtime tasks are being 
> stalled in the page allocator then start to increase the free-page 
> reserves.  A little control system.

Detecting at runtime is too late for some latency critical systems.
On such systems, we must avoid a stall before it happens.

Also, if we increase the free-page reserves a.k.a min_free_kbytes,
the possibility of direct reclaim on other workloads increases.
I think it's a bad side effect.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 20:23               ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-11 20:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 03:55 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 15:32:11 -0400
> Satoru Moriya <satoru.moriya@hds.com> wrote:
> 
>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
>>> <rientjes@google.com> wrote:
>>>
>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>>
>> Actually page allocator decreases min watermark to 3/4 * min 
>> watermark for rt-task. But in our case some applications create a lot 
>> of processes and if all of them are rt-task, the amount of watermark
>> bonus(1/4 * min watermark) is not enough.
>>
>> If we can tune the amount of bonus, it may be fine. But that is 
>> almost all same as extra free kbytes.
> 
> This situation is detectable at runtime.  If realtime tasks are being 
> stalled in the page allocator then start to increase the free-page 
> reserves.  A little control system.

Detecting at runtime is too late for some latency critical systems.
On such systems, we must avoid a stall before it happens.

Also, if we increase the free-page reserves a.k.a min_free_kbytes,
the possibility of direct reclaim on other workloads increases.
I think it's a bad side effect.

Thanks,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 20:23               ` Satoru Moriya
@ 2011-10-11 20:54                 ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-11 20:54 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Tue, 11 Oct 2011 16:23:22 -0400
Satoru Moriya <satoru.moriya@hds.com> wrote:

> On 10/11/2011 03:55 PM, Andrew Morton wrote:
> > On Tue, 11 Oct 2011 15:32:11 -0400
> > Satoru Moriya <satoru.moriya@hds.com> wrote:
> > 
> >> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> >>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> >>> <rientjes@google.com> wrote:
> >>>
> >>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >>
> >> Actually page allocator decreases min watermark to 3/4 * min 
> >> watermark for rt-task. But in our case some applications create a lot 
> >> of processes and if all of them are rt-task, the amount of watermark
> >> bonus(1/4 * min watermark) is not enough.
> >>
> >> If we can tune the amount of bonus, it may be fine. But that is 
> >> almost all same as extra free kbytes.
> > 
> > This situation is detectable at runtime.  If realtime tasks are being 
> > stalled in the page allocator then start to increase the free-page 
> > reserves.  A little control system.
> 
> Detecting at runtime is too late for some latency critical systems.
> At that system, we must avoid a stall before it happens.

It's pretty darn obvious that the kernel can easily see the situation
developing before it happens.  By comparing a few integers.

Look, please don't go bending over backwards like this to defend a bad
patch.  It's a bad patch!  It would be better not to have to merge it. 
Let's do something better.

> Also, if we increase the free-page reserves a.k.a min_free_kbytes,
> the possibility of direct reclaim on other workloads increases.
> I think it's a bad side effect.

extra_free_kbytes has the same side-effect.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 20:54                 ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-11 20:54 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Tue, 11 Oct 2011 16:23:22 -0400
Satoru Moriya <satoru.moriya@hds.com> wrote:

> On 10/11/2011 03:55 PM, Andrew Morton wrote:
> > On Tue, 11 Oct 2011 15:32:11 -0400
> > Satoru Moriya <satoru.moriya@hds.com> wrote:
> > 
> >> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> >>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes 
> >>> <rientjes@google.com> wrote:
> >>>
> >>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >>
> >> Actually page allocator decreases min watermark to 3/4 * min 
> >> watermark for rt-task. But in our case some applications create a lot 
> >> of processes and if all of them are rt-task, the amount of watermark
> >> bonus(1/4 * min watermark) is not enough.
> >>
> >> If we can tune the amount of bonus, it may be fine. But that is 
> >> almost all same as extra free kbytes.
> > 
> > This situation is detectable at runtime.  If realtime tasks are being 
> > stalled in the page allocator then start to increase the free-page 
> > reserves.  A little control system.
> 
> Detecting at runtime is too late for some latency critical systems.
> At that system, we must avoid a stall before it happens.

It's pretty darn obvious that the kernel can easily see the situation
developing before it happens.  By comparing a few integers.

Look, please don't go bending over backwards like this to defend a bad
patch.  It's a bad patch!  It would be better not to have to merge it. 
Let's do something better.

> Also, if we increase the free-page reserves a.k.a min_free_kbytes,
> the possibility of direct reclaim on other workloads increases.
> I think it's a bad side effect.

extra_free_kbytes has the same side-effect.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 19:20         ` Satoru Moriya
@ 2011-10-11 21:04           ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-11 21:04 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Tue, 11 Oct 2011, Satoru Moriya wrote:

> > I also
> > think that it will cause regressions on other cpu intensive workloads 
> > that don't require this extra freed memory because it works as a 
> > global heuristic and is not tied to any specific application.
> 
> It's yes and no. It may cause regressions on the workloads due to
> less amount of available memory. But it may improve the workloads'
> performance because they can avoid direct reclaim due to extra
> free memory.
> 

There's only a memory-availability regression if background reclaim is 
actually triggered in the first place, i.e. extra_free_kbytes doesn't 
affect the watermarks themselves when reclaim is started but rather causes 
it to, when set, reclaim more memory than otherwise.

That's not really what I was referring to; I was referring to cpu 
intensive workloads that now incur a regression because kswapd is now 
doing more work (potentially a significant amount of work since 
extra_free_kbytes is unbounded) on shared machines.  These applications 
may not be allocating memory at all and now they incur a performance 
penalty because kswapd is taking away one of their cores.

In other words, I think it's a fine solution if you're running a single 
application with very bursty memory allocations so you need to reclaim 
more memory when low, but that solution is troublesome if it comes at 
the penalty of other applications and that's a direct consequence of it 
being a global tunable.  I'd much rather identify memory allocations in 
the kernel that are causing the pain here and mitigate it by (i) attempting to 
sanely rate limit those allocations, (ii) preallocate at least a partial 
amount of those allocations ahead of time so as to avoid significant reclaim 
all at once, or (iii) annotate memory allocations with such potential so 
that the page allocator can add this reclaim bonus itself only in these 
conditions.

> Of course if one doesn't need extra free memory, one can turn it
> off. I think we can add this feature to cgroup if we want to set
> it for any specific process or process group. (Before that we
> need to implement min_free_kbytes for cgroup and the implementation
> of extra free kbytes strongly depends on it.)
> 

That would allow you to only reclaim additional memory when certain 
applications trigger it, but it's not actually a solution since another 
task can hit a zone's low watermark and kick kswapd and then the bursty 
memory allocations happen immediately following that and doesn't actually 
do anything because kswapd was already running.  So I disagree, as I did 
when per-cgroup watermark tunables were proposed, that watermarks should 
be changed for a subset of applications unless you guarantee memory 
isolation such that that subset of applications has exclusive access to 
the memory zones being tuned.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 21:04           ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-11 21:04 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Tue, 11 Oct 2011, Satoru Moriya wrote:

> > I also
> > think that it will cause regressions on other cpu intensive workloads 
> > that don't require this extra freed memory because it works as a 
> > global heuristic and is not tied to any specific application.
> 
> It's yes and no. It may cause regressions on the workloads due to
> less amount of available memory. But it may improve the workloads'
> performance because they can avoid direct reclaim due to extra
> free memory.
> 

There's only a memory-availability regression if background reclaim is 
actually triggered in the first place, i.e. extra_free_kbytes doesn't 
affect the watermarks themselves when reclaim is started but rather causes 
it to, when set, reclaim more memory than otherwise.

That's not really what I was referring to; I was referring to cpu 
intensive workloads that now incur a regression because kswapd is now 
doing more work (potentially a significant amount of work since 
extra_free_kbytes is unbounded) on shared machines.  These applications 
may not be allocating memory at all and now they incur a performance 
penalty because kswapd is taking away one of their cores.

In other words, I think it's a fine solution if you're running a single 
application with very bursty memory allocations so you need to reclaim 
more memory when low, but that solution is troublesome if it comes at 
the penalty of other applications and that's a direct consequence of it 
being a global tunable.  I'd much rather identify memory allocations in 
the kernel that are causing the pain here and mitigate it by (i) attempting to 
sanely rate limit those allocations, (ii) preallocate at least a partial 
amount of those allocations ahead of time so as to avoid significant reclaim 
all at once, or (iii) annotate memory allocations with such potential so 
that the page allocator can add this reclaim bonus itself only in these 
conditions.

> Of course if one doesn't need extra free memory, one can turn it
> off. I think we can add this feature to cgroup if we want to set
> it for any specific process or process group. (Before that we
> need to implement min_free_kbytes for cgroup and the implementation
> of extra free kbytes strongly depends on it.)
> 

That would allow you to only reclaim additional memory when certain 
applications trigger it, but it's not actually a solution since another 
task can hit a zone's low watermark and kick kswapd and then the bursty 
memory allocations happen immediately following that and doesn't actually 
do anything because kswapd was already running.  So I disagree, as I did 
when per-cgroup watermark tunables were proposed, that watermarks should 
be changed for a subset of applications unless you guarantee memory 
isolation such that that subset of applications has exclusive access to 
the memory zones being tuned.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 19:32           ` Satoru Moriya
@ 2011-10-11 23:22             ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-11 23:22 UTC (permalink / raw)
  To: Satoru Moriya, Con Kolivas
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On Tue, 11 Oct 2011, Satoru Moriya wrote:

> Actually page allocator decreases min watermark to 3/4 * min watermark
> for rt-task. But in our case some applications create a lot of
> processes and if all of them are rt-task, the amount of watermark
> bonus(1/4 * min watermark) is not enough.
> 

Right, if you can exhaust (1/4 * min_wmark) of memory quickly enough, 
you'll still have latency issues.

> If we can tune the amount of bonus, it may be fine. But that is
> almost all same as extra free kbytes.
> 

I don't know if your test case is the only thing that Rik is looking at, 
but if so, then that statement makes me believe that this patch is 
definitely in the wrong direction, so NACK on it until additional 
information is presented.  The reasoning is simple: if tuning the bonus 
given to rt-tasks in the page allocator itself would fix the issue, then 
we can certainly add logic specifically for rt-tasks that can reclaim more 
aggressively without needing any tunable from userspace (and _certainly_ 
not a global tunable that affects every application!).

> > Does there exist anything like a test case which demonstrates the need 
> > for this feature?
> 
> Unfortunately I don't have a real test case but just simple one.
> And in my simple test case, I can avoid direct reclaim if we set
> workload as rt-task.
> 
> The simple test case I used is following:
> http://marc.info/?l=linux-mm&m=131605773321672&w=2
> 

I tried proposing one of Con's patches from his BFS scheduler ("mm: adjust 
kswapd nice level for high priority page") about 1 1/2 years ago that I 
recall and believe may significantly help your test case.  The thread is 
at http://marc.info/?t=126743860700002.  (There's a lot of interesting 
things in Con's patchset that can be pulled into the VM, this isn't the 
only one.)

The patch wasn't merged back then because we wanted a test case that was 
specifically fixed by this issue, and it may be that we have just found 
one.  If you could try it out without any extra_free_kbytes, I think we 
may be able to help your situation.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-11 23:22             ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-11 23:22 UTC (permalink / raw)
  To: Satoru Moriya, Con Kolivas
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On Tue, 11 Oct 2011, Satoru Moriya wrote:

> Actually page allocator decreases min watermark to 3/4 * min watermark
> for rt-task. But in our case some applications create a lot of
> processes and if all of them are rt-task, the amount of watermark
> bonus(1/4 * min watermark) is not enough.
> 

Right, if you can exhaust (1/4 * min_wmark) of memory quickly enough, 
you'll still have latency issues.

> If we can tune the amount of bonus, it may be fine. But that is
> almost all same as extra free kbytes.
> 

I don't know if your test case is the only thing that Rik is looking at, 
but if so, then that statement makes me believe that this patch is 
definitely in the wrong direction, so NACK on it until additional 
information is presented.  The reasoning is simple: if tuning the bonus 
given to rt-tasks in the page allocator itself would fix the issue, then 
we can certainly add logic specifically for rt-tasks that can reclaim more 
aggressively without needing any tunable from userspace (and _certainly_ 
not a global tunable that affects every application!).

> > Does there exist anything like a test case which demonstrates the need 
> > for this feature?
> 
> Unfortunately I don't have a real test case but just simple one.
> And in my simple test case, I can avoid direct reclaim if we set
> workload as rt-task.
> 
> The simple test case I used is following:
> http://marc.info/?l=linux-mm&m=131605773321672&w=2
> 

I tried proposing one of Con's patches from his BFS scheduler ("mm: adjust 
kswapd nice level for high priority page") about 1 1/2 years ago that I 
recall and believe may significantly help your test case.  The thread is 
at http://marc.info/?t=126743860700002.  (There's a lot of interesting 
things in Con's patchset that can be pulled into the VM, this isn't the 
only one.)

The patch wasn't merged back then because we wanted a test case that was 
specifically fixed by this issue, and it may be that we have just found 
one.  If you could try it out without any extra_free_kbytes, I think we 
may be able to help your situation.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 20:54                 ` Andrew Morton
@ 2011-10-12 13:09                   ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 13:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 04:54 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 16:23:22 -0400
> Satoru Moriya<satoru.moriya@hds.com>  wrote:
>
>> On 10/11/2011 03:55 PM, Andrew Morton wrote:
>>> On Tue, 11 Oct 2011 15:32:11 -0400
>>> Satoru Moriya<satoru.moriya@hds.com>  wrote:
>>>
>>>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
>>>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes
>>>>> <rientjes@google.com>  wrote:
>>>>>
>>>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>>>>
>>>> Actually page allocator decreases min watermark to 3/4 * min
>>>> watermark for rt-task. But in our case some applications create a lot
>>>> of processes and if all of them are rt-task, the amount of watermark
>>>> bonus(1/4 * min watermark) is not enough.
>>>>
>>>> If we can tune the amount of bonus, it may be fine. But that is
>>>> almost all same as extra free kbytes.
>>>
>>> This situation is detectable at runtime.  If realtime tasks are being
>>> stalled in the page allocator then start to increase the free-page
>>> reserves.  A little control system.
>>
>> Detecting at runtime is too late for some latency critical systems.
>> At that system, we must avoid a stall before it happens.
>
> It's pretty darn obvious that the kernel can easily see the situation
> developing before it happens.  By comparing a few integers.

The problem is that we may be dealing with bursts, not steady
states of allocations.  Without knowing the size of a burst,
we have no idea when we should wake up kswapd to get enough
memory freed ahead of the application's allocations.

> Look, please don't go bending over backwards like this to defend a bad
> patch.  It's a bad patch!  It would be better not to have to merge it.
> Let's do something better.

I would love it if we could come up with something better,
and have thought about it a lot.

However, so far we do not seem to have an alternative yet :(

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 13:09                   ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 13:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 04:54 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 16:23:22 -0400
> Satoru Moriya<satoru.moriya@hds.com>  wrote:
>
>> On 10/11/2011 03:55 PM, Andrew Morton wrote:
>>> On Tue, 11 Oct 2011 15:32:11 -0400
>>> Satoru Moriya<satoru.moriya@hds.com>  wrote:
>>>
>>>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
>>>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes
>>>>> <rientjes@google.com>  wrote:
>>>>>
>>>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
>>>>
>>>> Actually page allocator decreases min watermark to 3/4 * min
>>>> watermark for rt-task. But in our case some applications create a lot
>>>> of processes and if all of them are rt-task, the amount of watermark
>>>> bonus(1/4 * min watermark) is not enough.
>>>>
>>>> If we can tune the amount of bonus, it may be fine. But that is
>>>> almost all same as extra free kbytes.
>>>
>>> This situation is detectable at runtime.  If realtime tasks are being
>>> stalled in the page allocator then start to increase the free-page
>>> reserves.  A little control system.
>>
>> Detecting at runtime is too late for some latency critical systems.
>> At that system, we must avoid a stall before it happens.
>
> It's pretty darn obvious that the kernel can easily see the situation
> developing before it happens.  By comparing a few integers.

The problem is that we may be dealing with bursts, not steady
states of allocations.  Without knowing the size of a burst,
we have no idea when we should wake up kswapd to get enough
memory freed ahead of the application's allocations.

> Look, please don't go bending over backwards like this to defend a bad
> patch.  It's a bad patch!  It would be better not to have to merge it.
> Let's do something better.

I would love it if we could come up with something better,
and have thought about it a lot.

However, so far we do not seem to have an alternative yet :(

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 21:04           ` David Rientjes
@ 2011-10-12 13:13             ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 13:13 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On 10/11/2011 05:04 PM, David Rientjes wrote:

> In other words, I think it's a fine solution if you're running a single
> application with very bursty memory allocations so you need to reclaim
> more memory when low, but that solution is troublesome if it comes at
> the penalty of other applications and that's a direct consequence of it
> being a global tunable.  I'd much rather identify memory allocations in
> the kernel that are causing the pain here and mitigate it by (i) attempting to
> sanely rate limit those allocations,

Rate limiting just increases the problem from what it was
before the patch was introduced, because the entire purpose
is to reduce allocation latencies by tasks with low latency
requirements.

> (ii) preallocate at least a partial
> amount of those allocations ahead of time so as to avoid significant reclaim
> all at once,

Unless I'm mistaken, isn't this functionally equivalent to
increasing the size of the free memory pool?

> or (iii) annotate memory allocations with such potential so
> that the page allocator can add this reclaim bonus itself only in these
> conditions.

I am not sure what you are proposing here.

How would this scheme work?

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 13:13             ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 13:13 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On 10/11/2011 05:04 PM, David Rientjes wrote:

> In other words, I think it's a fine solution if you're running a single
> application with very bursty memory allocations so you need to reclaim
> more memory when low, but that solution is troublesome if it comes at
> the penalty of other applications and that's a direct consequence of it
> being a global tunable.  I'd much rather identify memory allocations in
> the kernel that are causing the pain here and mitigate it by (i) attempting to
> sanely rate limit those allocations,

Rate limiting just increases the problem from what it was
before the patch was introduced, because the entire purpose
is to reduce allocation latencies by tasks with low latency
requirements.

> (ii) preallocate at least a partial
> amount of those allocations ahead of time so as to avoid significant reclaim
> all at once,

Unless I'm mistaken, isn't this functionally equivalent to
increasing the size of the free memory pool?

> or (iii) annotate memory allocations with such potential so
> that the page allocator can add this reclaim bonus itself only in these
> conditions.

I am not sure what you are proposing here.

How would this scheme work?

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 13:09                   ` Rik van Riel
@ 2011-10-12 19:20                     ` Andrew Morton
  -1 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-12 19:20 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011 09:09:17 -0400
Rik van Riel <riel@redhat.com> wrote:

> On 10/11/2011 04:54 PM, Andrew Morton wrote:
> > On Tue, 11 Oct 2011 16:23:22 -0400
> > Satoru Moriya<satoru.moriya@hds.com>  wrote:
> >
> >> On 10/11/2011 03:55 PM, Andrew Morton wrote:
> >>> On Tue, 11 Oct 2011 15:32:11 -0400
> >>> Satoru Moriya<satoru.moriya@hds.com>  wrote:
> >>>
> >>>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> >>>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes
> >>>>> <rientjes@google.com>  wrote:
> >>>>>
> >>>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >>>>
> >>>> Actually page allocator decreases min watermark to 3/4 * min
> >>>> watermark for rt-task. But in our case some applications create a lot
> >>>> of processes and if all of them are rt-task, the amount of watermark
> >>>> bonus(1/4 * min watermark) is not enough.
> >>>>
> >>>> If we can tune the amount of bonus, it may be fine. But that is
> >>>> almost all same as extra free kbytes.
> >>>
> >>> This situation is detectable at runtime.  If realtime tasks are being
> >>> stalled in the page allocator then start to increase the free-page
> >>> reserves.  A little control system.
> >>
> >> Detecting at runtime is too late for some latency critical systems.
> >> At that system, we must avoid a stall before it happens.
> >
> > It's pretty darn obvious that the kernel can easily see the situation
> > developing before it happens.  By comparing a few integers.
> 
> The problem is that we may be dealing with bursts, not steady
> states of allocations.  Without knowing the size of a burst,
> we have no idea when we should wake up kswapd to get enough
> memory freed ahead of the application's allocations.

That problem remains with this patch - it just takes a larger burst.

Unless the admin somehow manages to configure the tunable large enough
to cover the largest burst, and there aren't other applications
allocating memory during that burst, and the time between bursts is
sufficient for kswapd to be able to sufficiently replenish free-page
reserves.  All of which sounds rather unlikely.

> > Look, please don't go bending over backwards like this to defend a bad
> > patch.  It's a bad patch!  It would be better not to have to merge it.
> > Let's do something better.
> 
> I would love it if we could come up with something better,
> and have thought about it a lot.
> 
> However, so far we do not seem to have an alternative yet :(

Do we actually have a real-world application which is hurting from
this?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 19:20                     ` Andrew Morton
  0 siblings, 0 replies; 100+ messages in thread
From: Andrew Morton @ 2011-10-12 19:20 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011 09:09:17 -0400
Rik van Riel <riel@redhat.com> wrote:

> On 10/11/2011 04:54 PM, Andrew Morton wrote:
> > On Tue, 11 Oct 2011 16:23:22 -0400
> > Satoru Moriya<satoru.moriya@hds.com>  wrote:
> >
> >> On 10/11/2011 03:55 PM, Andrew Morton wrote:
> >>> On Tue, 11 Oct 2011 15:32:11 -0400
> >>> Satoru Moriya<satoru.moriya@hds.com>  wrote:
> >>>
> >>>> On 10/10/2011 06:37 PM, Andrew Morton wrote:
> >>>>> On Fri, 7 Oct 2011 20:08:19 -0700 (PDT) David Rientjes
> >>>>> <rientjes@google.com>  wrote:
> >>>>>
> >>>>>> On Thu, 1 Sep 2011, Rik van Riel wrote:
> >>>>
> >>>> Actually page allocator decreases min watermark to 3/4 * min
> >>>> watermark for rt-task. But in our case some applications create a lot
> >>>> of processes and if all of them are rt-task, the amount of watermark
> >>>> bonus(1/4 * min watermark) is not enough.
> >>>>
> >>>> If we can tune the amount of bonus, it may be fine. But that is
> >>>> almost all same as extra free kbytes.
> >>>
> >>> This situation is detectable at runtime.  If realtime tasks are being
> >>> stalled in the page allocator then start to increase the free-page
> >>> reserves.  A little control system.
> >>
> >> Detecting at runtime is too late for some latency critical systems.
> >> At that system, we must avoid a stall before it happens.
> >
> > It's pretty darn obvious that the kernel can easily see the situation
> > developing before it happens.  By comparing a few integers.
> 
> The problem is that we may be dealing with bursts, not steady
> states of allocations.  Without knowing the size of a burst,
> we have no idea when we should wake up kswapd to get enough
> memory freed ahead of the application's allocations.

That problem remains with this patch - it just takes a larger burst.

Unless the admin somehow manages to configure the tunable large enough
to cover the largest burst, and there aren't other applications
allocating memory during that burst, and the time between bursts is
sufficient for kswapd to be able to sufficiently replenish free-page
reserves.  All of which sounds rather unlikely.

> > Look, please don't go bending over backwards like this to defend a bad
> > patch.  It's a bad patch!  It would be better not to have to merge it.
> > Let's do something better.
> 
> I would love it if we could come up with something better,
> and have thought about it a lot.
> 
> However, so far we do not seem to have an alternative yet :(

Do we actually have a real-world application which is hurting from
this?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 19:20                     ` Andrew Morton
@ 2011-10-12 19:58                       ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 19:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 03:20 PM, Andrew Morton wrote:
> On Wed, 12 Oct 2011 09:09:17 -0400
> Rik van Riel<riel@redhat.com>  wrote:

>> The problem is that we may be dealing with bursts, not steady
>> states of allocations.  Without knowing the size of a burst,
>> we have no idea when we should wake up kswapd to get enough
>> memory freed ahead of the application's allocations.
>
> That problem remains with this patch - it just takes a larger burst.
>
> Unless the admin somehow manages to configure the tunable large enough
> to cover the largest burst, and there aren't other applications
> allocating memory during that burst, and the time between bursts is
> sufficient for kswapd to be able to sufficiently replenish free-page
> reserves.  All of which sounds rather unlikely.

It depends on the system. For a setup which is packed to
the brim with workloads, this patch is not likely to help.
On the other hand, on a system that is packed to the brim
with workloads, you are unlikely to get low latencies anyway.

For situations where people really care about low latencies,
I imagine having dedicated hardware for a workload is not at
all unusual, and the patch works for that.

>>> Look, please don't go bending over backwards like this to defend a bad
>>> patch.  It's a bad patch!  It would be better not to have to merge it.
>>> Let's do something better.
>>
>> I would love it if we could come up with something better,
>> and have thought about it a lot.
>>
>> However, so far we do not seem to have an alternative yet :(
>
> Do we actually have a real-world application which is hurting from
> this?

Satoru-san?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 19:58                       ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-12 19:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Satoru Moriya, David Rientjes, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 03:20 PM, Andrew Morton wrote:
> On Wed, 12 Oct 2011 09:09:17 -0400
> Rik van Riel<riel@redhat.com>  wrote:

>> The problem is that we may be dealing with bursts, not steady
>> states of allocations.  Without knowing the size of a burst,
>> we have no idea when we should wake up kswapd to get enough
>> memory freed ahead of the application's allocations.
>
> That problem remains with this patch - it just takes a larger burst.
>
> Unless the admin somehow manages to configure the tunable large enough
> to cover the largest burst, and there aren't other applications
> allocating memory during that burst, and the time between bursts is
> sufficient for kswapd to be able to sufficiently replenish free-page
> reserves.  All of which sounds rather unlikely.

It depends on the system. For a setup which is packed to
the brim with workloads, this patch is not likely to help.
On the other hand, on a system that is packed to the brim
with workloads, you are unlikely to get low latencies anyway.

For situations where people really care about low latencies,
I imagine having dedicated hardware for a workload is not at
all unusual, and the patch works for that.

>>> Look, please don't go bending over backwards like this to defend a bad
>>> patch.  It's a bad patch!  It would be better not to have to merge it.
>>> Let's do something better.
>>
>> I would love it if we could come up with something better,
>> and have thought about it a lot.
>>
>> However, so far we do not seem to have an alternative yet :(
>
> Do we actually have a real-world application which is hurting from
> this?

Satoru-san?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 13:13             ` Rik van Riel
@ 2011-10-12 20:21               ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 20:21 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Wed, 12 Oct 2011, Rik van Riel wrote:

> How would this scheme work?
> 

I suggested a patch from BFS that would raise kswapd to the same priority 
of the task that triggered it (not completely up to rt, but the highest 
possible in that case) and I'm waiting to hear if that helps for Satoru's 
test case before looking at alternatives.  We could also extend the patch 
to raise the priority of an already running kswapd if a higher priority 
task calls into the page allocator's slowpath.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 20:21               ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 20:21 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Wed, 12 Oct 2011, Rik van Riel wrote:

> How would this scheme work?
> 

I suggested a patch from BFS that would raise kswapd to the same priority 
of the task that triggered it (not completely up to rt, but the highest 
possible in that case) and I'm waiting to hear if that helps for Satoru's 
test case before looking at alternatives.  We could also extend the patch 
to raise the priority of an already running kswapd if a higher priority 
task calls into the page allocator's slowpath.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 19:58                       ` Rik van Riel
@ 2011-10-12 20:26                         ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 20:26 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andrew Morton, Satoru Moriya, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011, Rik van Riel wrote:

> > > The problem is that we may be dealing with bursts, not steady
> > > states of allocations.  Without knowing the size of a burst,
> > > we have no idea when we should wake up kswapd to get enough
> > > memory freed ahead of the application's allocations.
> > 

Raising the priority of kswapd to be the highest possible when triggered 
by rt-tasks should help to reclaim memory faster.  If that doesn't work 
fully with Con's patch on Satoru's testcase then we'll want to extend it 
to raise the priority for a running kswapd when a higher priority thread 
calls into the page allocator slowpath.  If that also doesn't mitigate the 
problem entirely, then we'll need to suggest raising min_free_kbytes so 
these threads have a larger pool of exclusive access to memory when the 
burst first happens.

> > That problem remains with this patch - it just takes a larger burst.
> > 
> > Unless the admin somehow manages to configure the tunable large enough
> > to cover the largest burst, and there aren't other applications
> > allocating memory during that burst, and the time between bursts is
> > sufficient for kswapd to be able to sufficiently replenish free-page
> > reserves.  All of which sounds rather unlikely.
> 
> It depends on the system. For a setup which is packed to
> the brim with workloads, this patch is not likely to help.
> On the other hand, on a system that is packed to the brim
> with workloads, you are unlikely to get low latencies anyway.
> 
> For situations where people really care about low latencies,
> I imagine having dedicated hardware for a workload is not at
> all unusual, and the patch works for that.
> 

If it's dedicated hardware, then you should be able to just raise 
min_free_kbytes so that rt-tasks get exclusive access to a larger amount 
of memory.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 20:26                         ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 20:26 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andrew Morton, Satoru Moriya, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011, Rik van Riel wrote:

> > > The problem is that we may be dealing with bursts, not steady
> > > states of allocations.  Without knowing the size of a burst,
> > > we have no idea when we should wake up kswapd to get enough
> > > memory freed ahead of the application's allocations.
> > 

Raising the priority of kswapd to be the highest possible when triggered 
by rt-tasks should help to reclaim memory faster.  If that doesn't work 
fully with Con's patch on Satoru's testcase then we'll want to extend it 
to raise the priority for a running kswapd when a higher priority thread 
calls into the page allocator slowpath.  If that also doesn't mitigate the 
problem entirely, then we'll need to suggest raising min_free_kbytes so 
these threads have a larger pool of exclusive access to memory when the 
burst first happens.

> > That problem remains with this patch - it just takes a larger burst.
> > 
> > Unless the admin somehow manages to configure the tunable large enough
> > to cover the largest burst, and there aren't other applications
> > allocating memory during that burst, and the time between bursts is
> > sufficient for kswapd to be able to sufficiently replenish free-page
> > reserves.  All of which sounds rather unlikely.
> 
> It depends on the system. For a setup which is packed to
> the brim with workloads, this patch is not likely to help.
> On the other hand, on a system that is packed to the brim
> with workloads, you are unlikely to get low latencies anyway.
> 
> For situations where people really care about low latencies,
> I imagine having dedicated hardware for a workload is not at
> all unusual, and the patch works for that.
> 

If it's dedicated hardware, then you should be able to just raise 
min_free_kbytes so that rt-tasks get exclusive access to a larger amount 
of memory.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 20:54                 ` Andrew Morton
@ 2011-10-12 21:08                   ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-12 21:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 04:54 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 16:23:22 -0400
> Satoru Moriya <satoru.moriya@hds.com> wrote:
>
>> Also, if we increase the free-page reserves a.k.a min_free_kbytes, 
>> the possibility of direct reclaim on other workloads increases.
>> I think it's a bad side effect.
> 
> extra_free_kbytes has the same side-effect.

I don't think so. If we make low watermark bigger to increase
free-page reserves by extra_free_kbytes, the possibility of
direct reclaim on other workload does not increase directly
because min watermark is not changed. 
Of course, as David pointed out, other workloads may incur
a regression because kswapd uses more cpu time.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 21:08                   ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-12 21:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/11/2011 04:54 PM, Andrew Morton wrote:
> On Tue, 11 Oct 2011 16:23:22 -0400
> Satoru Moriya <satoru.moriya@hds.com> wrote:
>
>> Also, if we increase the free-page reserves a.k.a min_free_kbytes, 
>> the possibility of direct reclaim on other workloads increases.
>> I think it's a bad side effect.
> 
> extra_free_kbytes has the same side-effect.

I don't think so. If we make low watermark bigger to increase
free-page reserves by extra_free_kbytes, the possibility of
direct reclaim on other workload does not increase directly
because min watermark is not changed. 
Of course, as David pointed out, other workloads may incur
a regression because kswapd uses more cpu time.

Thanks,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 21:08                   ` Satoru Moriya
@ 2011-10-12 22:41                     ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 22:41 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011, Satoru Moriya wrote:

> >> Also, if we increase the free-page reserves a.k.a min_free_kbytes, 
> >> the possibility of direct reclaim on other workloads increases.
> >> I think it's a bad side effect.
> > 
> > extra_free_kbytes has the same side-effect.
> 
> I don't think so. If we make low watermark bigger to increase
> free-page reserves by extra_free_kbytes, the possibility of
> direct reclaim on other workload does not increase directly
> because min watermark is not changed. 

I think the point was that extra_free_kbytes needs to be tuned to cover at 
least the amount of memory of the largest allocation burst or it doesn't 
help to prevent latencies for rt threads and, depending on how the 
implementation of the VM evolves, that value may change significantly over 
time from kernel release to kernel release.

For example, if we were to merge Con's patch so kswapd operates at a much 
higher priority for rt threads later on for another issue, it may 
significantly reduce the need for extra_free_kbytes to be set as high as 
it is.  Everybody who is setting this in init scripts, though, will 
continue to set the value because they have no reason to believe it should 
be changed.  Then, we have users who start to use the tunable after Con's 
patch has been merged and now we have widely different settings for the 
same tunable and it can never be obsoleted because everybody is using it 
but for different historic reasons.

This is why I nack'd the patch originally: it will never be removed, it 
is widely misunderstood, and is tied directly to the implementation of 
reclaim which will change over time.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 22:41                     ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-12 22:41 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Wed, 12 Oct 2011, Satoru Moriya wrote:

> >> Also, if we increase the free-page reserves a.k.a min_free_kbytes, 
> >> the possibility of direct reclaim on other workloads increases.
> >> I think it's a bad side effect.
> > 
> > extra_free_kbytes has the same side-effect.
> 
> I don't think so. If we make low watermark bigger to increase
> free-page reserves by extra_free_kbytes, the possibility of
> direct reclaim on other workload does not increase directly
> because min watermark is not changed. 

I think the point was that extra_free_kbytes needs to be tuned to cover at 
least the amount of memory of the largest allocation burst or it doesn't 
help to prevent latencies for rt threads and, depending on how the 
implementation of the VM evolves, that value may change significantly over 
time from kernel release to kernel release.

For example, if we were to merge Con's patch so kswapd operates at a much 
higher priority for rt threads later on for another issue, it may 
significantly reduce the need for extra_free_kbytes to be set as high as 
it is.  Everybody who is setting this in init scripts, though, will 
continue to set the value because they have no reason to believe it should 
be changed.  Then, we have users who start to use the tunable after Con's 
patch has been merged and now we have widely different settings for the 
same tunable and it can never be obsoleted because everybody is using it 
but for different historic reasons.

This is why I nack'd the patch originally: it will never be removed, it 
is widely misunderstood, and is tied directly to the implementation of 
reclaim which will change over time.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 22:41                     ` David Rientjes
@ 2011-10-12 23:52                       ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-12 23:52 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 06:41 PM, David Rientjes wrote:
> On Wed, 12 Oct 2011, Satoru Moriya wrote:
> 
> I think the point was that extra_free_kbytes needs to be tuned to 
> cover at least the amount of memory of the largest allocation burst

Right. In enterprise area, we strictly test the system we build
again and again before we release it. In that situation, we can
set extra_free_kbytes appropriately based on system's requirements
and/or specifications etc.

> or it doesn't help to prevent latencies for rt threads and,

I think it also helps rt threads prevent latency as well as
normal threads because we can start kswapd earlier and avoid
direct reclaim.

> For example, if we were to merge Con's patch so kswapd operates at a 
> much higher priority for rt threads later on for another issue, it may 
> significantly reduce the need for extra_free_kbytes to be set as high 
> as it is.  Everybody who is setting this in init scripts, though, will 
> continue to set the value because they have no reason to believe it 
> should be changed.  Then, we have users who start to use the tunable 
> after Con's patch has been merged and now we have widely different 
> settings for the same tunable and it can never be obsoleted because 
> everybody is using it but for different historic reasons.
> 
> This is why I nack'd the patch originally: it will never be removed, 
> it is widely misunderstood, and is tied directly to the implementation 
> of reclaim which will change over time.

I understand your concern. But in some areas such as banking,
stock exchange, train/power/plant control systems etc. this kind
of tunable is welcomed because they can tune their systems at
their own risk.

Also those systems have been used for a long time without significant
updating. If we update it or build a new system, we configure all
tunables from scratch.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-12 23:52                       ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-12 23:52 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 06:41 PM, David Rientjes wrote:
> On Wed, 12 Oct 2011, Satoru Moriya wrote:
> 
> I think the point was that extra_free_kbytes needs to be tuned to 
> cover at least the amount of memory of the largest allocation burst

Right. In enterprise area, we strictly test the system we build
again and again before we release it. In that situation, we can
set extra_free_kbytes appropriately based on system's requirements
and/or specifications etc.

> or it doesn't help to prevent latencies for rt threads and,

I think it also helps rt threads prevent latency as well as
normal threads because we can start kswapd earlier and avoid
direct reclaim.

> For example, if we were to merge Con's patch so kswapd operates at a 
> much higher priority for rt threads later on for another issue, it may 
> significantly reduce the need for extra_free_kbytes to be set as high 
> as it is.  Everybody who is setting this in init scripts, though, will 
> continue to set the value because they have no reason to believe it 
> should be changed.  Then, we have users who start to use the tunable 
> after Con's patch has been merged and now we have widely different 
> settings for the same tunable and it can never be obsoleted because 
> everybody is using it but for different historic reasons.
> 
> This is why I nack'd the patch originally: it will never be removed, 
> it is widely misunderstood, and is tied directly to the implementation 
> of reclaim which will change over time.

I understand your concern. But in some areas such as banking,
stock exchange, train/power/plant control systems etc. this kind
of tunable is welcomed because they can tune their systems at
their own risk.

Also those systems have been used for a long time without significant
updating. If we update it or build a new system, we configure all
tunables from scratch.

Thanks,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 23:52                       ` Satoru Moriya
@ 2011-10-13  0:01                         ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13  0:01 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On Wed, 12 Oct 2011, Satoru Moriya wrote:

> > I think the point was that extra_free_kbytes needs to be tuned to 
> > cover at least the amount of memory of the largest allocation burst
> 
> Right. In enterprise area, we strictly test the system we build
> again and again before we release it. In that situation, we can
> set extra_free_kbytes appropriately based on system's requirements
> and/or specifications etc.
> 

You would also need to guarantee that min_free_kbytes isn't subsequently 
changed because that would change the value that extra_free_kbytes would 
need to preserve the same exclusive access to memory that the rt threads 
would have without increasing it.

> I understand your concern. But in some areas such as banking,
> stock exchange, train/power/plant control systems etc. this kind
> of tunable is welcomed because they can tune their systems at
> their own risk.
> 

You haven't tried the patch that increases the priority of kswapd when 
such a latency sensitive thread triggers background reclaim?

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  0:01                         ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13  0:01 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On Wed, 12 Oct 2011, Satoru Moriya wrote:

> > I think the point was that extra_free_kbytes needs to be tuned to 
> > cover at least the amount of memory of the largest allocation burst
> 
> Right. In enterprise area, we strictly test the system we build
> again and again before we release it. In that situation, we can
> set extra_free_kbytes appropriately based on system's requirements
> and/or specifications etc.
> 

You would also need to guarantee that min_free_kbytes isn't subsequently 
changed because that would change the value that extra_free_kbytes would 
need to preserve the same exclusive access to memory that the rt threads 
would have without increasing it.

> I understand your concern. But in some areas such as banking,
> stock exchange, train/power/plant control systems etc. this kind
> of tunable is welcomed because they can tune their systems at
> their own risk.
> 

You haven't tried the patch that increases the priority of kswapd when 
such a latency sensitive thread triggers background reclaim?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 20:21               ` David Rientjes
@ 2011-10-13  4:13                 ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-13  4:13 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On 10/12/2011 04:21 PM, David Rientjes wrote:
> On Wed, 12 Oct 2011, Rik van Riel wrote:
>
>> How would this scheme work?
>>
>
> I suggested a patch from BFS that would raise kswapd to the same priority
> of the task that triggered it (not completely up to rt, but the highest
> possible in that case) and I'm waiting to hear if that helps for Satoru's
> test case before looking at alternatives.  We could also extend the patch
> to raise the priority of an already running kswapd if a higher priority
> task calls into the page allocator's slowpath.

This has the distinct benefit of making kswapd most active right
at the same time the application is most active, which returns
us to your first objection to the extra free kbytes patch (apps
will suffer from kswapd cpu use).

Furthermore, I am not sure that giving kswapd more CPU time is
going to help, because kswapd could be stuck on some lock, held
by a lower priority (or sleeping) context.

I agree that the BFS patch would be worth a try, and would be
very pleasantly surprised if it worked, but I am not very
optimistic about it...

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  4:13                 ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-13  4:13 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On 10/12/2011 04:21 PM, David Rientjes wrote:
> On Wed, 12 Oct 2011, Rik van Riel wrote:
>
>> How would this scheme work?
>>
>
> I suggested a patch from BFS that would raise kswapd to the same priority
> of the task that triggered it (not completely up to rt, but the highest
> possible in that case) and I'm waiting to hear if that helps for Satoru's
> test case before looking at alternatives.  We could also extend the patch
> to raise the priority of an already running kswapd if a higher priority
> task calls into the page allocator's slowpath.

This has the distinct benefit of making kswapd most active right
at the same time the application is most active, which returns
us to your first objection to the extra free kbytes patch (apps
will suffer from kswapd cpu use).

Furthermore, I am not sure that giving kswapd more CPU time is
going to help, because kswapd could be stuck on some lock, held
by a lower priority (or sleeping) context.

I agree that the BFS patch would be worth a try, and would be
very pleasantly surprised if it worked, but I am not very
optimistic about it...

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  4:13                 ` Rik van Riel
@ 2011-10-13  5:22                   ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13  5:22 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Thu, 13 Oct 2011, Rik van Riel wrote:

> > I suggested a patch from BFS that would raise kswapd to the same priority
> > of the task that triggered it (not completely up to rt, but the highest
> > possible in that case) and I'm waiting to hear if that helps for Satoru's
> > test case before looking at alternatives.  We could also extend the patch
> > to raise the priority of an already running kswapd if a higher priority
> > task calls into the page allocator's slowpath.
> 
> This has the distinct benefit of making kswapd most active right
> at the same time the application is most active, which returns
> us to your first objection to the extra free kbytes patch (apps
> will suffer from kswapd cpu use).
> 

Not necessarily, it only raises the priority of kswapd to be the same as 
the application, although it'll never raise it to be realtime, that kicks 
it in the page allocator's slowpath.  If the application has a nice level 
of 0, it's a no-op.  That's very different from extra_free_kbytes which 
causes kswapd to do extra work regardless of the priority of the 
application that is allocating memory.  Raising the priority of kswapd for 
rt threads makes sense if they are going to deplete all memory, it makes 
no sense to allow a rt thread to allocate tons of memory and not even give 
kswapd a chance to compete.

> Furthermore, I am not sure that giving kswapd more CPU time is
> going to help, because kswapd could be stuck on some lock, held
> by a lower priority (or sleeping) context.
> 
> I agree that the BFS patch would be worth a try, and would be
> very pleasantly surprised if it worked, but I am not very
> optimistic about it...
> 

It may require a combination of Con's patch, increasing the priority of 
kswapd if a higher priority task kicks it in the page allocator, and an 
extra bonus on top of the high watermark if it was triggered by a 
rt-thread -- similar to ALLOC_HARDER but instead reclaiming to 
(high * 1.25).

If we're going to go with extra_free_kbytes, then I'd like to see the test 
case posted with a mathematical formula to show me what I should tune it 
to be depending on my machine's memory capacity and amount of free RAM 
when started (and I can use mem= to test it for various capacities).  For 
this to be merged, there should be a clear expression that shows what the 
ideal setting of the tunable should be rather than asking for trial-and-
error to see what works and what doesn't.  If such an expression doesn't 
exist, then it's clear that the necessary setting will vary significantly 
as the implementation changes from kernel to kernel.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  5:22                   ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13  5:22 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, Andrew Morton, Hugh Dickins,
	hannes

On Thu, 13 Oct 2011, Rik van Riel wrote:

> > I suggested a patch from BFS that would raise kswapd to the same priority
> > of the task that triggered it (not completely up to rt, but the highest
> > possible in that case) and I'm waiting to hear if that helps for Satoru's
> > test case before looking at alternatives.  We could also extend the patch
> > to raise the priority of an already running kswapd if a higher priority
> > task calls into the page allocator's slowpath.
> 
> This has the distinct benefit of making kswapd most active right
> at the same time the application is most active, which returns
> us to your first objection to the extra free kbytes patch (apps
> will suffer from kswapd cpu use).
> 

Not necessarily, it only raises the priority of kswapd to be the same as 
the application, although it'll never raise it to be realtime, that kicks 
it in the page allocator's slowpath.  If the application has a nice level 
of 0, it's a no-op.  That's very different from extra_free_kbytes which 
causes kswapd to do extra work regardless of the priority of the 
application that is allocating memory.  Raising the priority of kswapd for 
rt threads makes sense if they are going to deplete all memory, it makes 
no sense to allow a rt thread to allocate tons of memory and not even give 
kswapd a chance to compete.

> Furthermore, I am not sure that giving kswapd more CPU time is
> going to help, because kswapd could be stuck on some lock, held
> by a lower priority (or sleeping) context.
> 
> I agree that the BFS patch would be worth a try, and would be
> very pleasantly surprised if it worked, but I am not very
> optimistic about it...
> 

It may require a combination of Con's patch, increasing the priority of 
kswapd if a higher priority task kicks it in the page allocator, and an 
extra bonus on top of the high watermark if it was triggered by a 
rt-thread -- similar to ALLOC_HARDER but instead reclaiming to 
(high * 1.25).

If we're going to go with extra_free_kbytes, then I'd like to see the test 
case posted with a mathematical formula to show me what I should tune it 
to be depending on my machine's memory capacity and amount of free RAM 
when started (and I can use mem= to test it for various capacities).  For 
this to be merged, there should be a clear expression that shows what the 
ideal setting of the tunable should be rather than asking for trial-and-
error to see what works and what doesn't.  If such an expression doesn't 
exist, then it's clear that the necessary setting will vary significantly 
as the implementation changes from kernel to kernel.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  0:01                         ` David Rientjes
@ 2011-10-13  5:35                           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 100+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-10-13  5:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Wed, 12 Oct 2011 17:01:21 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Wed, 12 Oct 2011, Satoru Moriya wrote:

> > I understand what you concern. But in some area such as banking,
> > stock exchange, train/power/plant control sysemts etc this kind
> > of tunable is welcomed because they can tune their systems at
> > their own risk.
> > 
> 
> You haven't tried the patch that increases the priority of kswapd when 
> such a latency sensitive thread triggers background reclaim?

I don't read full story but....how about adding a new syscall like

==
sys_mem_shrink(int nid, int nr_scan_pages, int flags)

This system call scans the LRU of the specified nodes and frees pages from it.
It scans nr_scan_pages pages in the LRU and returns the number of successfully
freed pages.
==

Then, running this program in SCHED_IDLE, a user can make free pages while
the system is idle. If running in the highest priority, a user can keep
as many free pages as he wants. If a user runs this under a memcg, the user
can free pages in that memcg.

Maybe many guys don't want to export memory-shrink facility to userland ;)
This is just an idea.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  5:35                           ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 100+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-10-13  5:35 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Wed, 12 Oct 2011 17:01:21 -0700 (PDT)
David Rientjes <rientjes@google.com> wrote:

> On Wed, 12 Oct 2011, Satoru Moriya wrote:

> > I understand what you concern. But in some area such as banking,
> > stock exchange, train/power/plant control sysemts etc this kind
> > of tunable is welcomed because they can tune their systems at
> > their own risk.
> > 
> 
> You haven't tried the patch that increases the priority of kswapd when 
> such a latency sensitive thread triggers background reclaim?

I don't read full story but....how about adding a new syscall like

==
sys_mem_shrink(int nid, int nr_scan_pages, int flags)

This system call scans the LRU of the specified nodes and frees pages from it.
It scans nr_scan_pages pages in the LRU and returns the number of successfully
freed pages.
==

Then, running this program in SCHED_IDLE, a user can make free pages while
the system is idle. If running in the highest priority, a user can keep
as many free pages as he wants. If a user runs this under a memcg, the user
can free pages in that memcg.

Maybe many guys don't want to export memory-shrink facility to userland ;)
This is just an idea.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-09-02 16:31         ` Satoru Moriya
@ 2011-10-13  7:33           ` Minchan Kim
  -1 siblings, 0 replies; 100+ messages in thread
From: Minchan Kim @ 2011-10-13  7:33 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes,
	KAMEZAWA Hiroyuki, David Rientjes

On Fri, Sep 02, 2011 at 12:31:14PM -0400, Satoru Moriya wrote:
> On 09/01/2011 05:58 PM, Andrew Morton wrote:
> > On Thu, 1 Sep 2011 15:26:50 -0400
> > Rik van Riel <riel@redhat.com> wrote:
> > 
> >> Add a userspace visible knob
> > 
> > argh.  Fear and hostility at new knobs which need to be maintained for 
> > ever, even if the underlying implementation changes.
> > 
> > Unfortunately, this one makes sense.
> > 
> >> to tell the VM to keep an extra amount of memory free, by increasing 
> >> the gap between each zone's min and low watermarks.
> >>
> >> This is useful for realtime applications that call system calls and 
> >> have a bound on the number of allocations that happen in any short 
> >> time period.  In this application, extra_free_kbytes would be left at 
> >> an amount equal to or larger than the maximum number of 
> >> allocations that happen in any burst.
> > 
> > _is_ it useful?  Proof?
> > 
> > Who is requesting this?  Have they tested it?  Results?
> 
> This is interesting for me.
> 
> Some of our customers have realtime applications and they are concerned 
> the fact that Linux uses free memory as pagecache. It means that
> when their application allocate memory, Linux kernel tries to reclaim
> memory at first and then allocate it. This may make memory allocation
> latency bigger.
> 
> In many cases this is not a big issue because Linux has kswapd for
> background reclaim and it is fast enough not to enter direct reclaim
> path if there are a lot of clean cache. But under some situations -
> e.g. Application allocates a lot of memory which is larger than delta
> between watermark_low and watermark_min in a short time and kswapd
> can't reclaim fast enough due to dirty page reclaim, direct reclaim
> is executed and causes big latency.
> 
> We can avoid the issue above by using preallocation and mlock.
> But it can't cover kmalloc used in systemcall. So I'd like to use
> this patch with mlock to avoid memory allocation latency issue as
> low as possible. It may not be a perfect solution but it is important
> for customers in enterprise area to configure the amount of free
> memory at their own risk.

I agree needs for such feature but don't like such primitive interface
exporting to user.

As Satoru said, we can reserve free pages for user through preallocation and mlocking.
The thing is free pages for kernel itself.
Most desirable thing is we have to avoid syscall in critical realtime section.
But if we can't avoid, my crazy idea is to use memcg for kernel pages.
Of course, we should implement it and not simple stuff but AFAIK, memcg people
always consider it and finally will do it. :)
Recently, Glauber try "Basic kernel memory functionality" but I don't have reviewed
it yet. I am not sure we can reuse it, anyway. Kame?

My simple idea is as follows,

We can assign a basic reserved page pool and/or a page pool of user-determined size
for each task registered at memcg-slab.
The application has to notify memcg of the start of the RT section before it enters
the RT section, so memcg can fill up the page pool if it is short. In this case, the
application can get stuck, but that's okay as it hasn't entered the RT section yet.
The application has to notify memcg of the end of the RT section, too, so that memcg
can try to fill up the reserved page pool in case of shortage.

The reason we need such notification is that a high-priority kswapd, a new knob, and
the other approaches can never meet the application's deadline requirement in some
situations (e.g., there are many dirty pages in the LRU, or anon pages fill up memory
in the non-swap case, and so on), so the application might end up stuck at some point.
That point must be outside the RT section of the task.

For implementation, we might need a new watermark setting for each memcg and/or
something like kswapd priority promotion to hurry reclaim.
Anyway, those are just implementation details, and we could enhance or extend them
through various techniques as time goes by.

Personally, I think it could be a valuable feature.

-- 
Kinds regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  7:33           ` Minchan Kim
  0 siblings, 0 replies; 100+ messages in thread
From: Minchan Kim @ 2011-10-13  7:33 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes,
	KAMEZAWA Hiroyuki, David Rientjes

On Fri, Sep 02, 2011 at 12:31:14PM -0400, Satoru Moriya wrote:
> On 09/01/2011 05:58 PM, Andrew Morton wrote:
> > On Thu, 1 Sep 2011 15:26:50 -0400
> > Rik van Riel <riel@redhat.com> wrote:
> > 
> >> Add a userspace visible knob
> > 
> > argh.  Fear and hostility at new knobs which need to be maintained for 
> > ever, even if the underlying implementation changes.
> > 
> > Unfortunately, this one makes sense.
> > 
> >> to tell the VM to keep an extra amount of memory free, by increasing 
> >> the gap between each zone's min and low watermarks.
> >>
> >> This is useful for realtime applications that call system calls and 
> >> have a bound on the number of allocations that happen in any short 
> >> time period.  In this application, extra_free_kbytes would be left at 
> >> an amount equal to or larger than the maximum number of 
> >> allocations that happen in any burst.
> > 
> > _is_ it useful?  Proof?
> > 
> > Who is requesting this?  Have they tested it?  Results?
> 
> This is interesting for me.
> 
> Some of our customers have realtime applications and they are concerned 
> the fact that Linux uses free memory as pagecache. It means that
> when their application allocate memory, Linux kernel tries to reclaim
> memory at first and then allocate it. This may make memory allocation
> latency bigger.
> 
> In many cases this is not a big issue because Linux has kswapd for
> background reclaim and it is fast enough not to enter direct reclaim
> path if there are a lot of clean cache. But under some situations -
> e.g. Application allocates a lot of memory which is larger than delta
> between watermark_low and watermark_min in a short time and kswapd
> can't reclaim fast enough due to dirty page reclaim, direct reclaim
> is executed and causes big latency.
> 
> We can avoid the issue above by using preallocation and mlock.
> But it can't cover kmalloc used in systemcall. So I'd like to use
> this patch with mlock to avoid memory allocation latency issue as
> low as possible. It may not be a perfect solution but it is important
> for customers in enterprise area to configure the amount of free
> memory at their own risk.

I agree needs for such feature but don't like such primitive interface
exporting to user.

As Satoru said, we can reserve free pages for user through preallocation and mlocking.
The thing is free pages for kernel itself.
Most desirable thing is we have to avoid syscall in critical realtime section.
But if we can't avoid, my crazy idea is to use memcg for kernel pages.
Of course, we should implement it and not simple stuff but AFAIK, memcg people
always consider it and finally will do it. :)
Recently, Glauber try "Basic kernel memory functionality" but I don't have reviewed
it yet. I am not sure we can reuse it, anyway. Kame?

My simple idea is as follows,

We can assign a basic reserved page pool and/or a page pool of user-determined size
for each task registered at memcg-slab.
The application has to notify memcg of the start of the RT section before it enters
the RT section, so memcg can fill up the page pool if it is short. In this case, the
application can get stuck, but that's okay as it hasn't entered the RT section yet.
The application has to notify memcg of the end of the RT section, too, so that memcg
can try to fill up the reserved page pool in case of shortage.

The reason we need such notification is that a high-priority kswapd, a new knob, and
the other approaches can never meet the application's deadline requirement in some
situations (e.g., there are many dirty pages in the LRU, or anon pages fill up memory
in the non-swap case, and so on), so the application might end up stuck at some point.
That point must be outside the RT section of the task.

For implementation, we might need a new watermark setting for each memcg and/or
something like kswapd priority promotion to hurry reclaim.
Anyway, those are just implementation details, and we could enhance or extend them
through various techniques as time goes by.

Personally, I think it could be a valuable feature.

-- 
Kinds regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  7:33           ` Minchan Kim
@ 2011-10-13  8:09             ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 100+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-10-13  8:09 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	hughd, hannes, David Rientjes

On Thu, 13 Oct 2011 16:33:21 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Sep 02, 2011 at 12:31:14PM -0400, Satoru Moriya wrote:
> > On 09/01/2011 05:58 PM, Andrew Morton wrote:
> > > On Thu, 1 Sep 2011 15:26:50 -0400
> > > Rik van Riel <riel@redhat.com> wrote:
> > > 
> > >> Add a userspace visible knob
> > > 
> > > argh.  Fear and hostility at new knobs which need to be maintained for 
> > > ever, even if the underlying implementation changes.
> > > 
> > > Unfortunately, this one makes sense.
> > > 
> > >> to tell the VM to keep an extra amount of memory free, by increasing 
> > >> the gap between each zone's min and low watermarks.
> > >>
> > >> This is useful for realtime applications that call system calls and 
> > >> have a bound on the number of allocations that happen in any short 
> > >> time period.  In this application, extra_free_kbytes would be left at 
> > >> an amount equal to or larger than the maximum number of 
> > >> allocations that happen in any burst.
> > > 
> > > _is_ it useful?  Proof?
> > > 
> > > Who is requesting this?  Have they tested it?  Results?
> > 
> > This is interesting for me.
> > 
> > Some of our customers have realtime applications and they are concerned 
> > the fact that Linux uses free memory as pagecache. It means that
> > when their application allocate memory, Linux kernel tries to reclaim
> > memory at first and then allocate it. This may make memory allocation
> > latency bigger.
> > 
> > In many cases this is not a big issue because Linux has kswapd for
> > background reclaim and it is fast enough not to enter direct reclaim
> > path if there are a lot of clean cache. But under some situations -
> > e.g. Application allocates a lot of memory which is larger than delta
> > between watermark_low and watermark_min in a short time and kswapd
> > can't reclaim fast enough due to dirty page reclaim, direct reclaim
> > is executed and causes big latency.
> > 
> > We can avoid the issue above by using preallocation and mlock.
> > But it can't cover kmalloc used in systemcall. So I'd like to use
> > this patch with mlock to avoid memory allocation latency issue as
> > low as possible. It may not be a perfect solution but it is important
> > for customers in enterprise area to configure the amount of free
> > memory at their own risk.
> 
> I agree needs for such feature but don't like such primitive interface
> exporting to user.
> 
> As Satoru said, we can reserve free pages for user through preallocation and mlocking.
> The thing is free pages for kernel itself.
> Most desirable thing is we have to avoid syscall in critical realtime section.
> But if we can't avoid, my crazy idea is to use memcg for kernel pages.
> Of course, we should implement it and not simple stuff but AFAIK, memcg people
> always consider it and finally will do it. :)
> Recently, Glauber try "Basic kernel memory functionality" but I don't have reviewed
> it yet. I am not sure we can reuse it, anyway. Kame?
> 

I reviewed it and it seems good. It adds kmem.limit_in_bytes then we're ready
to go forward to kernel memory cgroup.
But it adds only interfaces now.

I think  Greg Thelen <gthelen@google.com> has some idea.


> My simple idea is as follows,
> 
> We can assign basic revered page pool and/or size of user-determined pages pool
> for each task registred at memcg-slab.

Hmm, memcg-mempool ?


> The application have to notify start of RT section to memcg before it goes to
> RT section. So, memcg could fill up page pool if it is short. In this case,
> application can stuck but it's okay as it doesn't go to RT section yet.
> The applicatoin have to notify end of RT section to memcg, too so that memcg
> could try to fill up reserved page pool in case of shortage.
> 

That 'notification' doesn't sound good to me. If the application dies or moves to
another group without notification, memcg will be unstable.
It should be the task's state rather than memcg's state.


> Why we need such notification is kswapd high prioiry, new knob and others never
> can meet application's deadline requirement in some situations(ex,
> there are so many dirty pages in LRU or fill up anon pages in non-swap case and so on)
> so that application might end up stuck at some point. The somepoint must be out of RT
> section of the task.
> 
> For implemenation, we might need new watermark setting for each memcg or/and
> kswapd prioirity promotion like thing for hurry reclaiming.
> Anyway, they are just implementaions and we could enhance/add further more through
> various techniques as time goes by.
> 
> Personally, I think it could a valuable featue.
> 

Hmm. To avoid latency at allocation, all we can do is pre-allocate before the memory is
required. But the problem is that applications cannot forecast when the 'burst' allocation
happens, so we always need to keep a memory pool prepared.

I think we need 2 implementations.

1. free-page mempool for a memcg.
2. a background reclaim thread for a memcg. This is triggered by mempool.
   The priority of this thread should be controllable in some way.

If we take care of memcg's limit, watermark should trigger background reclaim.

?
But the memory reclaim routine should never be in sleep...


Thanks,
-Kame





^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13  8:09             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 100+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-10-13  8:09 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	hughd, hannes, David Rientjes

On Thu, 13 Oct 2011 16:33:21 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Sep 02, 2011 at 12:31:14PM -0400, Satoru Moriya wrote:
> > On 09/01/2011 05:58 PM, Andrew Morton wrote:
> > > On Thu, 1 Sep 2011 15:26:50 -0400
> > > Rik van Riel <riel@redhat.com> wrote:
> > > 
> > >> Add a userspace visible knob
> > > 
> > > argh.  Fear and hostility at new knobs which need to be maintained for 
> > > ever, even if the underlying implementation changes.
> > > 
> > > Unfortunately, this one makes sense.
> > > 
> > >> to tell the VM to keep an extra amount of memory free, by increasing 
> > >> the gap between each zone's min and low watermarks.
> > >>
> > >> This is useful for realtime applications that call system calls and 
> > >> have a bound on the number of allocations that happen in any short 
> > >> time period.  In this application, extra_free_kbytes would be left at 
> > >> an amount equal to or larger than the maximum number of 
> > >> allocations that happen in any burst.
> > > 
> > > _is_ it useful?  Proof?
> > > 
> > > Who is requesting this?  Have they tested it?  Results?
> > 
> > This is interesting for me.
> > 
> > Some of our customers have realtime applications and they are concerned 
> > the fact that Linux uses free memory as pagecache. It means that
> > when their application allocate memory, Linux kernel tries to reclaim
> > memory at first and then allocate it. This may make memory allocation
> > latency bigger.
> > 
> > In many cases this is not a big issue because Linux has kswapd for
> > background reclaim and it is fast enough not to enter direct reclaim
> > path if there are a lot of clean cache. But under some situations -
> > e.g. Application allocates a lot of memory which is larger than delta
> > between watermark_low and watermark_min in a short time and kswapd
> > can't reclaim fast enough due to dirty page reclaim, direct reclaim
> > is executed and causes big latency.
> > 
> > We can avoid the issue above by using preallocation and mlock.
> > But it can't cover kmalloc used in systemcall. So I'd like to use
> > this patch with mlock to avoid memory allocation latency issue as
> > low as possible. It may not be a perfect solution but it is important
> > for customers in enterprise area to configure the amount of free
> > memory at their own risk.
> 
> I agree needs for such feature but don't like such primitive interface
> exporting to user.
> 
> As Satoru said, we can reserve free pages for user through preallocation and mlocking.
> The thing is free pages for kernel itself.
> Most desirable thing is we have to avoid syscall in critical realtime section.
> But if we can't avoid, my crazy idea is to use memcg for kernel pages.
> Of course, we should implement it and not simple stuff but AFAIK, memcg people
> always consider it and finally will do it. :)
> Recently, Glauber try "Basic kernel memory functionality" but I don't have reviewed
> it yet. I am not sure we can reuse it, anyway. Kame?
> 

I reviewed it and it seems good. It adds kmem.limit_in_bytes then we're ready
to go forward to kernel memory cgroup.
But it adds only interfaces now.

I think  Greg Thelen <gthelen@google.com> has some idea.


> My simple idea is as follows,
> 
> We can assign basic revered page pool and/or size of user-determined pages pool
> for each task registred at memcg-slab.

Hmm, memcg-mempool ?


> The application have to notify start of RT section to memcg before it goes to
> RT section. So, memcg could fill up page pool if it is short. In this case,
> application can stuck but it's okay as it doesn't go to RT section yet.
> The applicatoin have to notify end of RT section to memcg, too so that memcg
> could try to fill up reserved page pool in case of shortage.
> 

That 'notification' doesn't sound good to me. If the application dies or moves to
another group without notification, memcg will be unstable.
It should be the task's state rather than memcg's state.


> Why we need such notification is kswapd high prioiry, new knob and others never
> can meet application's deadline requirement in some situations(ex,
> there are so many dirty pages in LRU or fill up anon pages in non-swap case and so on)
> so that application might end up stuck at some point. The somepoint must be out of RT
> section of the task.
> 
> For implemenation, we might need new watermark setting for each memcg or/and
> kswapd prioirity promotion like thing for hurry reclaiming.
> Anyway, they are just implementaions and we could enhance/add further more through
> various techniques as time goes by.
> 
> Personally, I think it could a valuable featue.
> 

Hmm. To avoid latency at allocation, all we can do is pre-allocate before the memory is
required. But the problem is that applications cannot forecast when the 'burst' allocation
happens, so we always need to keep a memory pool prepared.

I think we need 2 implementations.

1. free-page mempool for a memcg.
2. a background reclaim thread for a memcg. This is triggered by mempool.
   The priority of this thread should be controllable in some way.

If we take care of memcg's limit, watermark should trigger background reclaim.

?
But the memory reclaim routine should never be in sleep...


Thanks,
-Kame




--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-11 23:22             ` David Rientjes
@ 2011-10-13 16:54               ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-13 16:54 UTC (permalink / raw)
  To: David Rientjes, Con Kolivas
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/11/2011 07:22 PM, David Rientjes wrote:
> On Tue, 11 Oct 2011, Satoru Moriya wrote:
> 
> I don't know if your test case is the only thing that Rik is looking 
> at, but if so, then that statement makes me believe that this patch is 
> definitely in the wrong direction, so NACK on it until additional 
> information is presented.  The reasoning is simple: if tuning the 
> bonus given to rt-tasks in the page allocator itself would fix the 
> issue, then we can certainly add logic specifically for rt-tasks that 
> can reclaim more aggressively without needing any tunable from 
> userspace (and _certainly_ not a global tunable that affects every application!).

My test case is just a simple one (maybe too simple), and I tried
to demonstrate following issues that current kernel has with it.

1. Current kernel uses free memory as pagecache.
2. Applications may allocate memory in bursts, and when that happens
   they may hit a latency issue because there is not enough free
   memory. Also the amount of required memory is wide-ranging.
3. Some users would like to control the amount of free memory
   to avoid the situation above.
4. User can't setup the amount of free memory explicitly.
   From user's point of view, the amount of free memory is the delta
   between high watermark - min watermark because below min watermark
   user applications incur a penalty (direct reclaim). The width of
   delta depends on min_free_kbytes, actually min watermark / 2, and
   so if we want to make free memory bigger, we must make
   min_free_kbytes bigger. That is not intuitive, and it introduces
   another problem: the possibility of direct reclaim is increased.

I think my test case is too simple and so we may be able to avoid
the latency issue with my case by setting the workload rt-task,
improving how to decide rt-task bonus and/or introducing the
Con's patches below.

But my concern described above still stands, because whether a
latency issue happens or not depends on how heavily workloads
allocate memory in a short time. Of course we can say the same
thing for extra_free_kbytes, but we can change it and test
its effect easily.

>>> Does there exist anything like a test case which demonstrates the 
>>> need for this feature?
>>
>> Unfortunately I don't have a real test case but just simple one.
>> And in my simple test case, I can avoid direct reclaim if we set 
>> workload as rt-task.
>>
>> The simple test case I used is following:
>> http://marc.info/?l=linux-mm&m=131605773321672&w=2
>>
> 
> I tried proposing one of Con's patches from his BFS scheduler ("mm: 
> adjust kswapd nice level for high priority page") about 1 1/2 years 
> ago that I recall and believe may significantly help your test case.  
> The thread is at http://marc.info/?t=126743860700002.  (There's a lot 
> of interesting things in Con's patchset that can be pulled into the 
> VM, this isn't the only one.)
> 
> The patch wasn't merged back then because we wanted a test case that 
> was specifically fixed by this issue, and it may be that we have just 
> found one.  If you could try it out without any extra_free_kbytes, I 
> think we may be able to help your situation.

Thank you, David. I'll try my simple workload with it.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13 16:54               ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-13 16:54 UTC (permalink / raw)
  To: David Rientjes, Con Kolivas
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/11/2011 07:22 PM, David Rientjes wrote:
> On Tue, 11 Oct 2011, Satoru Moriya wrote:
> 
> I don't know if your test case is the only thing that Rik is looking 
> at, but if so, then that statement makes me believe that this patch is 
> definitely in the wrong direction, so NACK on it until additional 
> information is presented.  The reasoning is simple: if tuning the 
> bonus given to rt-tasks in the page allocator itself would fix the 
> issue, then we can certainly add logic specifically for rt-tasks that 
> can reclaim more aggressively without needing any tunable from 
> userspace (and _certainly_ not a global tunable that affects every application!).

My test case is just a simple one (maybe too simple), and I tried
to demonstrate the following issues that the current kernel has with it.

1. Current kernel uses free memory as pagecache.
2. Applications may allocate memory in bursts and when that happens
   they may get a latency issue because there is not enough free
   memory. Also the amount of required memory is wide-ranging.
3. Some users would like to control the amount of free memory
   to avoid the situation above.
4. User can't setup the amount of free memory explicitly.
   From user's point of view, the amount of free memory is the delta
   between high watermark - min watermark because below min watermark
   user applications incur a penalty (direct reclaim). The width of
   delta depends on min_free_kbytes, actually min watermark / 2, and
   so if we want to make free memory bigger, we must make
   min_free_kbytes bigger. It's not intuitive and it introduces
   another problem: the possibility of direct reclaim is increased.

I think my test case is too simple and so we may be able to avoid
the latency issue with my case by setting the workload rt-task,
improving how to decide rt-task bonus and/or introducing the
Con's patches below.

But my concern described above is still alive because whether
a latency issue happens or not depends on how heavily workloads
allocate memory in a short time. Of course we can say the same
thing for extra_free_kbytes, but we can change it and test
the effect easily.

>>> Does there exist anything like a test case which demonstrates the 
>>> need for this feature?
>>
>> Unfortunately I don't have a real test case but just simple one.
>> And in my simple test case, I can avoid direct reclaim if we set 
>> workload as rt-task.
>>
>> The simple test case I used is following:
>> http://marc.info/?l=linux-mm&m=131605773321672&w=2
>>
> 
> I tried proposing one of Con's patches from his BFS scheduler ("mm: 
> adjust kswapd nice level for high priority page") about 1 1/2 years 
> ago that I recall and believe may significantly help your test case.  
> The thread is at http://marc.info/?t=126743860700002.  (There's a lot 
> of interesting things in Con's patchset that can be pulled into the 
> VM, this isn't the only one.)
> 
> The patch wasn't merged back then because we wanted a test case that 
> was specifically fixed by this issue, and it may be that we have just 
> found one.  If you could try it out without any extra_free_kbytes, I 
> think we may be able to help your situation.

Thank you, David. I'll try my simple workload with it.

Regards,
Satoru

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13 16:54               ` Satoru Moriya
@ 2011-10-13 20:48                 ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 20:48 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Con Kolivas, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, Satoru Moriya wrote:

> My test case is just a simple one (maybe too simple), and I tried
> to demonstrate following issues that current kernel has with it.
> 
> 1. Current kernel uses free memory as pagecache.
> 2. Applications may allocate memory burstly and when it happens
>    they may get a latency issue because there are not enough free
>    memory. Also the amount of required memory is wide-ranging.

This is what the per-zone watermarks are intended to address and I 
understand that it's not doing a good enough job for your particular 
workloads.  I'm trying to find a solution that mitigates that for all 
threads that allocate faster than the kernel can reclaim, realtime or 
otherwise, without requiring the admin to set those watermarks himself, 
which is really what extra_free_kbytes is eventually leading to.

> 3. Some users would like to control the amount of free memory
>    to avoid the situation above.

The only possible way to do that is with min_free_kbytes right now and 
that would increase the amount of memory that realtime threads have 
exclusive access to.  Let's try not to add additional tunables so that 
admins need to find their own optimal watermarks for every kernel release.  
I see no reason why we can't add logic for rt-threads triggering reclaim 
to either reclaim faster (Con's patch) or more memory than normal (an 
ALLOC_HARDER type bonus in the reclaim path to reclaim 1.25 * high_wmark, 
for example).  We've had a rt-thread bonus in the page allocator for a 
long time, I'm not saying we don't need more elsewhere.

> 4. User can't setup the amount of free memory explicitly.
>    From user's point of view, the amount of free memory is the delta
>    between high watermark - min watermark because below min watermark
>    user applications incur a penalty (direct reclaim). The width of
>    delta depends on min_free_kbytes, actually min watermark / 2, and
>    so if we want to make free memory bigger, we must make
>    min_free_kbytes bigger. It's not a intuitive and it introduces
>    another problem that is possibility of direct reclaim is increased.
> 

So you're saying that we need to increase the space between high_wmark and 
min_wmark anytime that min_free_kbytes changes?  That certainly may be 
true and would hopefully mitigate direct reclaim becoming too intrusive 
for your workload.

We _really_ don't want to cause regressions for others, though, which 
extra_free_kbytes can easily do for cpu-intensive workloads if nothing is 
currently requiring that extra burst of memory (and occurs because 
extra_free_kbytes is a global tunable and not tied to any specific 
application [like testing for rt_task()] that we can identify when 
reclaiming).

> But my concern described above is still alive because whether
> latency issue happen or not depends on how heavily workloads
> allocate memory at a short time. Of cource we can say same
> things for extra_free_kbytes, but we can change it and test
> an effect easily.
> 

We'll never know the future and how much memory a latency-sensitive 
application will require 100ms from now.  The only thing that we can do is 
(i) identify the latency-sensitive app, (ii) reclaim more aggressively for 
them, and (iii) reclaim additional memory in preparation for another 
burst.  At some point, though, userspace needs to be responsible to not 
allocate enormous amounts of memory all at once and there's room for 
mitigation there too to preallocate ahead of what you actually need.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13 20:48                 ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 20:48 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Con Kolivas, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, Satoru Moriya wrote:

> My test case is just a simple one (maybe too simple), and I tried
> to demonstrate following issues that current kernel has with it.
> 
> 1. Current kernel uses free memory as pagecache.
> 2. Applications may allocate memory burstly and when it happens
>    they may get a latency issue because there are not enough free
>    memory. Also the amount of required memory is wide-ranging.

This is what the per-zone watermarks are intended to address and I 
understand that it's not doing a good enough job for your particular 
workloads.  I'm trying to find a solution that mitigates that for all 
threads that allocate faster than the kernel can reclaim, realtime or 
otherwise, without requiring the admin to set those watermarks himself, 
which is really what extra_free_kbytes is eventually leading to.

> 3. Some users would like to control the amount of free memory
>    to avoid the situation above.

The only possible way to do that is with min_free_kbytes right now and 
that would increase the amount of memory that realtime threads have 
exclusive access to.  Let's try not to add additional tunables so that 
admins need to find their own optimal watermarks for every kernel release.  
I see no reason why we can't add logic for rt-threads triggering reclaim 
to either reclaim faster (Con's patch) or more memory than normal (an 
ALLOC_HARDER type bonus in the reclaim path to reclaim 1.25 * high_wmark, 
for example).  We've had a rt-thread bonus in the page allocator for a 
long time, I'm not saying we don't need more elsewhere.

> 4. User can't setup the amount of free memory explicitly.
>    From user's point of view, the amount of free memory is the delta
>    between high watermark - min watermark because below min watermark
>    user applications incur a penalty (direct reclaim). The width of
>    delta depends on min_free_kbytes, actually min watermark / 2, and
>    so if we want to make free memory bigger, we must make
>    min_free_kbytes bigger. It's not a intuitive and it introduces
>    another problem that is possibility of direct reclaim is increased.
> 

So you're saying that we need to increase the space between high_wmark and 
min_wmark anytime that min_free_kbytes changes?  That certainly may be 
true and would hopefully mitigate direct reclaim becoming too intrusive 
for your workload.

We _really_ don't want to cause regressions for others, though, which 
extra_free_kbytes can easily do for cpu-intensive workloads if nothing is 
currently requiring that extra burst of memory (and occurs because 
extra_free_kbytes is a global tunable and not tied to any specific 
application [like testing for rt_task()] that we can identify when 
reclaiming).

> But my concern described above is still alive because whether
> latency issue happen or not depends on how heavily workloads
> allocate memory at a short time. Of cource we can say same
> things for extra_free_kbytes, but we can change it and test
> an effect easily.
> 

We'll never know the future and how much memory a latency-sensitive 
application will require 100ms from now.  The only thing that we can do is 
(i) identify the latency-sensitive app, (ii) reclaim more aggressively for 
them, and (iii) reclaim additional memory in preparation for another 
burst.  At some point, though, userspace needs to be responsible to not 
allocate enormous amounts of memory all at once and there's room for 
mitigation there too to preallocate ahead of what you actually need.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  5:35                           ` KAMEZAWA Hiroyuki
@ 2011-10-13 20:55                             ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 20:55 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, KAMEZAWA Hiroyuki wrote:

> sys_mem_shrink(int nid, int nr_scan_pages, int flags)
> 
> This system call scans LRU of specified nodes and free pages on LRU.
> This scan nr_scan_pages in LRU and returns the number of successfully
> freed pages.
> ==
> 
> Then, running this progam in SCHED_IDLE, a user can make free pages while
> the system is idle. If running in the highest priority, a user can keep
> free pages as he want. If a user run this under a memcg, user can free
> pages in a memcg. 
> 

Satoru was specifically talking about the VM using free memory for 
pagecache, so doing echo 1 > /proc/sys/vm/drop_caches can mitigate 
that almost immediately.  I think the key to the discussion, though, is 
that even the application doesn't know its bursty memory behavior before 
it happens and the kernel entering direct reclaim hurts latency-sensitive 
applications.

If there were a change to increase the space significantly between the 
high and min watermark when min_free_kbytes changes, that would fix the 
problem.  The problem is two-fold: that comes at a penalty for systems 
or workloads that don't need to reclaim the additional memory, and it's 
not clear how much space should exist between those watermarks.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13 20:55                             ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 20:55 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Satoru Moriya, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, KAMEZAWA Hiroyuki wrote:

> sys_mem_shrink(int nid, int nr_scan_pages, int flags)
> 
> This system call scans LRU of specified nodes and free pages on LRU.
> This scan nr_scan_pages in LRU and returns the number of successfully
> freed pages.
> ==
> 
> Then, running this progam in SCHED_IDLE, a user can make free pages while
> the system is idle. If running in the highest priority, a user can keep
> free pages as he want. If a user run this under a memcg, user can free
> pages in a memcg. 
> 

Satoru was specifically talking about the VM using free memory for 
pagecache, so doing echo 1 > /proc/sys/vm/drop_caches can mitigate 
that almost immediately.  I think the key to the discussion, though, is 
that even the application doesn't know its bursty memory behavior before 
it happens and the kernel entering direct reclaim hurts latency-sensitive 
applications.

If there were a change to increase the space significantly between the 
high and min watermark when min_free_kbytes changes, that would fix the 
problem.  The problem is two-fold: that comes at a penalty for systems 
or workloads that don't need to reclaim the additional memory, and it's 
not clear how much space should exist between those watermarks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13 20:48                 ` David Rientjes
@ 2011-10-13 21:11                   ` Rik van Riel
  -1 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-13 21:11 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Con Kolivas, Andrew Morton, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On 10/13/2011 04:48 PM, David Rientjes wrote:

> We'll never know the future and how much memory a latency-sensitive
> application will require 100ms from now.  The only thing that we can do is
> (i) identify the latency-sensitive app, (ii) reclaim more aggressively for
> them, and (iii) reclaim additional memory in preparation for another

This is why I proposed a watermark solution.

> burst.  At some point, though, userspace needs to be responsible to not
> allocate enormous amounts of memory all at once and there's room for
> mitigation there too to preallocate ahead of what you actually need.

Userspace cannot be responsible, for the simple reason that
the allocations might be done in the kernel.

Think about an mlocked realtime program handling network
packets. Memory is allocated when packets come in, and when
the program calls sys_send(), which causes packets to get
sent.

I don't see how we can make userspace responsible for
kernel-side allocations.

I did not propose the extra_free_kbytes patch because I
like it, or out of laziness, but because I truly have not
come up with a better solution.

So far, neither has this thread (which is unfortunate).

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13 21:11                   ` Rik van Riel
  0 siblings, 0 replies; 100+ messages in thread
From: Rik van Riel @ 2011-10-13 21:11 UTC (permalink / raw)
  To: David Rientjes
  Cc: Satoru Moriya, Con Kolivas, Andrew Morton, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On 10/13/2011 04:48 PM, David Rientjes wrote:

> We'll never know the future and how much memory a latency-sensitive
> application will require 100ms from now.  The only thing that we can do is
> (i) identify the latency-sensitive app, (ii) reclaim more aggressively for
> them, and (iii) reclaim additional memory in preparation for another

This is why I proposed a watermark solution.

> burst.  At some point, though, userspace needs to be responsible to not
> allocate enormous amounts of memory all at once and there's room for
> mitigation there too to preallocate ahead of what you actually need.

Userspace cannot be responsible, for the simple reason that
the allocations might be done in the kernel.

Think about an mlocked realtime program handling network
packets. Memory is allocated when packets come in, and when
the program calls sys_send(), which causes packets to get
sent.

I don't see how we can make userspace responsible for
kernel-side allocations.

I did not propose the extra_free_kbytes patch because I
like it, or out of laziness, but because I truly have not
come up with a better solution.

So far, neither has this thread (which is unfortunate).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13 21:11                   ` Rik van Riel
@ 2011-10-13 22:02                     ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 22:02 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Con Kolivas, Andrew Morton, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, Rik van Riel wrote:

> Userspace cannot be responsible, for the simple reason that
> the allocations might be done in the kernel.
> 
> Think about an mlocked realtime program handling network
> packets. Memory is allocated when packets come in, and when
> the program calls sys_send(), which causes packets to get
> sent.
> 
> I don't see how we can make userspace responsible for
> kernel-side allocations.
> 

All of this is what Minchan was proposing by allowing a mempool of kernel 
memory to be preallocated and managed by the memory controller.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* Re: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-13 22:02                     ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-13 22:02 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Satoru Moriya, Con Kolivas, Andrew Morton, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Thu, 13 Oct 2011, Rik van Riel wrote:

> Userspace cannot be responsible, for the simple reason that
> the allocations might be done in the kernel.
> 
> Think about an mlocked realtime program handling network
> packets. Memory is allocated when packets come in, and when
> the program calls sys_send(), which causes packets to get
> sent.
> 
> I don't see how we can make userspace responsible for
> kernel-side allocations.
> 

All of this is what Minchan was proposing by allowing a mempool of kernel 
memory to be preallocated and managed by the memory controller.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  0:01                         ` David Rientjes
@ 2011-10-14  5:06                           ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14  5:06 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/12/2011 08:01 PM, David Rientjes wrote:
>> I understand what you concern. But in some area such as banking,
>> > stock exchange, train/power/plant control sysemts etc this kind
>> > of tunable is welcomed because they can tune their systems at
>> > their own risk.
>> > 
> You haven't tried the patch that increases the priority of kswapd when 
> such a latency sensitive thread triggers background reclaim?

No, not yet. I'll try it.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-14  5:06                           ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14  5:06 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/12/2011 08:01 PM, David Rientjes wrote:
>> I understand what you concern. But in some area such as banking,
>> > stock exchange, train/power/plant control sysemts etc this kind
>> > of tunable is welcomed because they can tune their systems at
>> > their own risk.
>> > 
> You haven't tried the patch that increases the priority of kswapd when 
> such a latency sensitive thread triggers background reclaim?

No, not yet. I'll try it.

Thanks,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  5:35                           ` KAMEZAWA Hiroyuki
@ 2011-10-14  5:32                             ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14  5:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/13/2011 01:35 AM, KAMEZAWA Hiroyuki wrote:
> 
> I don't read full story but....how about adding a new syscall like
> 
> ==
> sys_mem_shrink(int nid, int nr_scan_pages, int flags)
> 
> This system call scans LRU of specified nodes and free pages on LRU.
> This scan nr_scan_pages in LRU and returns the number of successfully
> freed pages.
> ==
> 
> Then, running this progam in SCHED_IDLE, a user can make free pages while
> the system is idle. If running in the highest priority, a user can keep
> free pages as he want. If a user run this under a memcg, user can free
> pages in a memcg. 

This is interesting. We can make userspace kswapd as we like with
this syscall. But it seems harder to be accepted...

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-14  5:32                             ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14  5:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, David Rientjes
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/13/2011 01:35 AM, KAMEZAWA Hiroyuki wrote:
> 
> I don't read full story but....how about adding a new syscall like
> 
> ==
> sys_mem_shrink(int nid, int nr_scan_pages, int flags)
> 
> This system call scans LRU of specified nodes and free pages on LRU.
> This scan nr_scan_pages in LRU and returns the number of successfully
> freed pages.
> ==
> 
> Then, running this progam in SCHED_IDLE, a user can make free pages while
> the system is idle. If running in the highest priority, a user can keep
> free pages as he want. If a user run this under a memcg, user can free
> pages in a memcg. 

This is interesting. We can make userspace kswapd as we like with
this syscall. But it seems harder to be accepted...

Regards,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13 20:55                             ` David Rientjes
@ 2011-10-14 22:16                               ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14 22:16 UTC (permalink / raw)
  To: David Rientjes, KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/13/2011 04:55 PM, David Rientjes wrote:
> 
> Satoru was specifically talking about the VM using free memory for 
> pagecache,

Yes, because we can't stop the pagecache from growing and it
occupies RAM that some people want to keep free for bursty
memory requirements. Usually it works fine, but sometimes, as in
my test case, it doesn't work well.

> so doing echo echo 1 > /proc/sys/vm/drop_caches can mitigate 
> that almost immediately.  

I know it and some admins use that kind of tuning. But is it
the proper way? Should we exec a script like the above periodically?
I believe that we should use it for debugging only.

> I think the key to the discussion, though, is 
> that even the application doesn't know it's bursty memory behavior before 
> it happens and the kernel entering direct reclaim hurts latency-sensitive 
> applications.
>
> If there were a change to increase the space significantly between the 
> high and min watermark when min_free_kbytes changes, that would fix the 
> problem. 

Right. But min_free_kbytes changes both thresholds, foreground reclaim
and background reclaim. I'd like to configure them separately like
dirty_bytes and dirty_background_bytes for flexibility.

> The problem is two-fold: that comes at a penalty for systems 
> or workloads that don't need to reclaim the additional memory, and it's 
> not clear how much space should exist between those watermarks.

The required size depends on the system architecture, such as the kernel,
applications, storage etc., and so an admin who cares about the whole system
should configure it based on tests at his own risk.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-14 22:16                               ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-14 22:16 UTC (permalink / raw)
  To: David Rientjes, KAMEZAWA Hiroyuki
  Cc: Andrew Morton, Rik van Riel, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, Hugh Dickins,
	hannes

On 10/13/2011 04:55 PM, David Rientjes wrote:
> 
> Satoru was specifically talking about the VM using free memory for 
> pagecache,

Yes, because we can't stop the pagecache from growing and it
occupies RAM that some people want to keep free for bursty
memory requirements. Usually it works fine, but sometimes, as in
my test case, it doesn't work well.

> so doing echo echo 1 > /proc/sys/vm/drop_caches can mitigate 
> that almost immediately.  

I know it and some admins use that kind of tuning. But is it
the proper way? Should we exec a script like the above periodically?
I believe that we should use it for debugging only.

> I think the key to the discussion, though, is 
> that even the application doesn't know it's bursty memory behavior before 
> it happens and the kernel entering direct reclaim hurts latency-sensitive 
> applications.
>
> If there were a change to increase the space significantly between the 
> high and min watermark when min_free_kbytes changes, that would fix the 
> problem. 

Right. But min_free_kbytes changes both thresholds, foreground reclaim
and background reclaim. I'd like to configure them separately like
dirty_bytes and dirty_background_bytes for flexibility.

> The problem is two-fold: that comes at a penalty for systems 
> or workloads that don't need to reclaim the additional memory, and it's 
> not clear how much space should exist between those watermarks.

The required size depends on the system architecture, such as the kernel,
applications, storage etc., and so an admin who cares about the whole system
should configure it based on tests at his own risk.

Regards,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-14 22:16                               ` Satoru Moriya
@ 2011-10-14 22:46                                 ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-14 22:46 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Fri, 14 Oct 2011, Satoru Moriya wrote:

> > Satoru was specifically talking about the VM using free memory for 
> > pagecache,
> 
> Yes, because we can't stop increasing pagecache and it 
> occupies RAM where some people want to keep free for bursty
> memory requirement. Usually it works fine but sometimes like
> my test case doesn't work well.
> 
> > so doing echo 1 > /proc/sys/vm/drop_caches can mitigate 
> > that almost immediately.  
> 
> I know it and some admins use that kind of tuning. But is it
> proper way? Should we exec the script like above periodically?
> I believe that we should use it for debug only.
> 

Agreed, this was in response to the suggestion for adding a mem_shrink() 
syscall, which would require the same periodic calls or knowledge of the 
application prior to the bursty memory allocations.  I bring up 
drop_caches just to illustrate that it is effectively the same thing for 
the entire address space when pressured by pagecache.  So I don't think 
that syscall would actually help for your scenario.

> > If there were a change to increase the space significantly between the 
> > high and min watermark when min_free_kbytes changes, that would fix the 
> > problem. 
> 
> Right. But min_free_kbytes changes both thresholds, foreground reclaim
> and background reclaim. I'd like to configure them separately like
> dirty_bytes and dirty_background_bytes for flexibility.
> 

The point I'm trying to make is that if kswapd can be made aware that it 
was kicked by a rt_task() in the page allocator, the same criteria we use 
for ALLOC_HARDER today, or a rt_task() subsequently enters the page 
allocator slowpath while kswapd is running, then not only can we increase 
the scheduling priority of kswapd but it is also possible to reclaim above 
the high watermark for an extra bonus.  I believe we can find a sane 
middle ground that requires no userspace tunable where a _single_ realtime 
application cannot allocate memory faster than kswapd with very high 
priority and reclaiming above the high watermark, whether that's a factor 
of 1.25 or not.

> > The problem is two-fold: that comes at a penalty for systems 
> > or workloads that don't need to reclaim the additional memory, and it's 
> > not clear how much space should exist between those watermarks.
> 
> The required size depends on a system architecture such as kernel,
> applications, storage etc. and so admin who care the whole system
> should configure it based on tests by his own risk.
> 

Doing that comes at a penalty for other workloads that are running on the 
same system, which is the problem with a global tunable that doesn't 
discriminate on an allocator's priority (the min -> high watermarks for 
reclaim do well except for rt-threads, as evidenced by this thread).

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-14 22:46                                 ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-14 22:46 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: KAMEZAWA Hiroyuki, Andrew Morton, Rik van Riel, Randy Dunlap,
	Satoru Moriya, linux-kernel, linux-mm, lwoodman, Seiji Aguchi,
	Hugh Dickins, hannes

On Fri, 14 Oct 2011, Satoru Moriya wrote:

> > Satoru was specifically talking about the VM using free memory for 
> > pagecache,
> 
> Yes, because we can't stop increasing pagecache and it 
> occupies RAM where some people want to keep free for bursty
> memory requirement. Usually it works fine but sometimes like
> my test case doesn't work well.
> 
> > so doing echo 1 > /proc/sys/vm/drop_caches can mitigate 
> > that almost immediately.  
> 
> I know it and some admins use that kind of tuning. But is it
> proper way? Should we exec the script like above periodically?
> I believe that we should use it for debug only.
> 

Agreed, this was in response to the suggestion for adding a mem_shrink() 
syscall, which would require the same periodic calls or knowledge of the 
application prior to the bursty memory allocations.  I bring up 
drop_caches just to illustrate that it is effectively the same thing for 
the entire address space when pressured by pagecache.  So I don't think 
that syscall would actually help for your scenario.

> > If there were a change to increase the space significantly between the 
> > high and min watermark when min_free_kbytes changes, that would fix the 
> > problem. 
> 
> Right. But min_free_kbytes changes both thresholds, foreground reclaim
> and background reclaim. I'd like to configure them separately like
> dirty_bytes and dirty_background_bytes for flexibility.
> 

The point I'm trying to make is that if kswapd can be made aware that it 
was kicked by a rt_task() in the page allocator, the same criteria we use 
for ALLOC_HARDER today, or a rt_task() subsequently enters the page 
allocator slowpath while kswapd is running, then not only can we increase 
the scheduling priority of kswapd but it is also possible to reclaim above 
the high watermark for an extra bonus.  I believe we can find a sane 
middle ground that requires no userspace tunable where a _single_ realtime 
application cannot allocate memory faster than kswapd with very high 
priority and reclaiming above the high watermark, whether that's a factor 
of 1.25 or not.

> > The problem is two-fold: that comes at a penalty for systems 
> > or workloads that don't need to reclaim the additional memory, and it's 
> > not clear how much space should exist between those watermarks.
> 
> The required size depends on a system architecture such as kernel,
> applications, storage etc. and so admin who care the whole system
> should configure it based on tests by his own risk.
> 

Doing that comes at a penalty for other workloads that are running on the 
same system, which is the problem with a global tunable that doesn't 
discriminate on an allocator's priority (the min -> high watermarks for 
reclaim do well except for rt-threads, as evidenced by this thread).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-12 19:58                       ` Rik van Riel
@ 2011-10-21 23:48                         ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-21 23:48 UTC (permalink / raw)
  To: Rik van Riel, Andrew Morton
  Cc: David Rientjes, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 03:58 PM, Rik van Riel wrote:
> On 10/12/2011 03:20 PM, Andrew Morton wrote:
>> On Wed, 12 Oct 2011 09:09:17 -0400
>> Rik van Riel<riel@redhat.com>  wrote:
>>
>> Do we actually have a real-world application which is hurting from
>> this?
> 
> Satoru-san?

Sorry for late reply.

We do.
Basically we need this kind of feature for almost all our latency
sensitive applications to avoid latency issue in memory allocation.

Currently we run those applications on custom kernels which this
kind of patch is applied to. But it is hard for us to support every
kernel version for it. Also there are several customers who can't
accept a custom kernel and so they must use other commercial Unix.
If this feature is accepted, they will definitely use it on their
systems.

Thanks,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-21 23:48                         ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-21 23:48 UTC (permalink / raw)
  To: Rik van Riel, Andrew Morton
  Cc: David Rientjes, Randy Dunlap, Satoru Moriya, linux-kernel,
	linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/12/2011 03:58 PM, Rik van Riel wrote:
> On 10/12/2011 03:20 PM, Andrew Morton wrote:
>> On Wed, 12 Oct 2011 09:09:17 -0400
>> Rik van Riel<riel@redhat.com>  wrote:
>>
>> Do we actually have a real-world application which is hurting from
>> this?
> 
> Satoru-san?

Sorry for late reply.

We do.
Basically we need this kind of feature for almost all our latency
sensitive applications to avoid latency issue in memory allocation.

Currently we run those applications on custom kernels which this
kind of patch is applied to. But it is hard for us to support every
kernel version for it. Also there are several customers who can't
accept a custom kernel and so they must use other commercial Unix.
If this feature is accepted, they will definitely use it on their
systems.

Thanks,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-13  5:22                   ` David Rientjes
@ 2011-10-22  0:11                     ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-22  0:11 UTC (permalink / raw)
  To: David Rientjes, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, Andrew Morton, Hugh Dickins, hannes

On 10/13/2011 01:22 AM, David Rientjes wrote:
> On Thu, 13 Oct 2011, Rik van Riel wrote:
> 
>> Furthermore, I am not sure that giving kswapd more CPU time is
>> going to help, because kswapd could be stuck on some lock, held
>> by a lower priority (or sleeping) context.
>>
>> I agree that the BFS patch would be worth a try, and would be
>> very pleasantly surprised if it worked, but I am not very
>> optimistic about it...
>>
> 
> It may require a combination of Con's patch, increasing the priority of 
> kswapd if a higher priority task kicks it in the page allocator, and an 
> extra bonus on top of the high watermark if it was triggered by a 
> rt-thread -- similar to ALLOC_HARDER but instead reclaiming to 
> (high * 1.25).

I tested Con's patch. The results are as follows.

1. delayacct result

RECLAIM                     count    delay total  delay average
---------------------------------------------------------------
normal task w/o Con's patch   210       42685857        203us
rt task w/o Con's patch        32        4922368        153us
rt task w   Con's patch        29        4399320        151us


2. /proc/vmstat result
                     normal task w/o  rt task w/o  rt task w/
                         Con's patch  Con's patch  Con's patch
---------------------------------------------------------------------
nr_vmscan_write                    0        13160        14536
pgsteal_dma                        0            0            0
pgsteal_dma32                 182710       175049       169871
pgsteal_normal                 10260         9499        13077
pgsteal_movable                    0            0            0
pgscan_kswapd_dma                  0            0            0
pgscan_kswapd_dma32           127159       149096       147924
pgscan_kswapd_normal           26094        49011        33186
pgscan_kswapd_movable              0            0            0
pgscan_direct_dma                  0            0            0
pgscan_direct_dma32            55551        25923        21947
pgscan_direct_normal            7128         3624         2816
pgscan_direct_movable              0            0            0
kswapd_steal                  134481       157951       159556
kswapd_inodesteal                  0            0            0
kswapd_low_wmark_hit_quickly       0            0            6
kswapd_high_wmark_hit_quickly      0            0            0
allocstall                       324          151          128

Unfortunately, it seems that Con's patch does not improve my
testcase so much. We may need extra bonus on the high watermark if
we take the way above. But necessary bonus depends on workloads,
hardware etc., so it can't be solved with fixed bonus, I think.

> If we're going to go with extra_free_kbytes, then I'd like to see the test 
> case posted with a mathematical formula to show me what I should tune it 
> to be depending on my machine's memory capacity and amount of free RAM 
> when started (and I can use mem= to test it for various capacities).  For 
> this to be merged, there should be a clear expression that shows what the 
> ideal setting of the tunable should be rather than asking for trial-and-
> error to see what works and what doesn't.  If such an expression doesn't 
> exist, then it's clear that the necessary setting will vary significantly 
> as the implementation changes from kernel to kernel.

Hmm, trial and error is tuning itself, isn't it? When we tune a system,
we usually set some knobs, run some benchmarks/tests/etc., evaluate
the results and decide which is the appropriate value.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-22  0:11                     ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-22  0:11 UTC (permalink / raw)
  To: David Rientjes, Rik van Riel
  Cc: Randy Dunlap, Satoru Moriya, linux-kernel, linux-mm, lwoodman,
	Seiji Aguchi, Andrew Morton, Hugh Dickins, hannes

On 10/13/2011 01:22 AM, David Rientjes wrote:
> On Thu, 13 Oct 2011, Rik van Riel wrote:
> 
>> Furthermore, I am not sure that giving kswapd more CPU time is
>> going to help, because kswapd could be stuck on some lock, held
>> by a lower priority (or sleeping) context.
>>
>> I agree that the BFS patch would be worth a try, and would be
>> very pleasantly surprised if it worked, but I am not very
>> optimistic about it...
>>
> 
> It may require a combination of Con's patch, increasing the priority of 
> kswapd if a higher priority task kicks it in the page allocator, and an 
> extra bonus on top of the high watermark if it was triggered by a 
> rt-thread -- similar to ALLOC_HARDER but instead reclaiming to 
> (high * 1.25).

I tested Con's patch. The results are as follows.

1. delayacct result

RECLAIM                     count    delay total  delay average
---------------------------------------------------------------
normal task w/o Con's patch   210       42685857        203us
rt task w/o Con's patch        32        4922368        153us
rt task w   Con's patch        29        4399320        151us


2. /proc/vmstat result
                     normal task w/o  rt task w/o  rt task w/
                         Con's patch  Con's patch  Con's patch
---------------------------------------------------------------------
nr_vmscan_write                    0        13160        14536
pgsteal_dma                        0            0            0
pgsteal_dma32                 182710       175049       169871
pgsteal_normal                 10260         9499        13077
pgsteal_movable                    0            0            0
pgscan_kswapd_dma                  0            0            0
pgscan_kswapd_dma32           127159       149096       147924
pgscan_kswapd_normal           26094        49011        33186
pgscan_kswapd_movable              0            0            0
pgscan_direct_dma                  0            0            0
pgscan_direct_dma32            55551        25923        21947
pgscan_direct_normal            7128         3624         2816
pgscan_direct_movable              0            0            0
kswapd_steal                  134481       157951       159556
kswapd_inodesteal                  0            0            0
kswapd_low_wmark_hit_quickly       0            0            6
kswapd_high_wmark_hit_quickly      0            0            0
allocstall                       324          151          128

Unfortunately, it seems that Con's patch does not improve my
testcase so much. We may need extra bonus on the high watermark if
we take the way above. But necessary bonus depends on workloads,
hardware etc., so it can't be solved with fixed bonus, I think.

> If we're going to go with extra_free_kbytes, then I'd like to see the test 
> case posted with a mathematical formula to show me what I should tune it 
> to be depending on my machine's memory capacity and amount of free RAM 
> when started (and I can use mem= to test it for various capacities).  For 
> this to be merged, there should be a clear expression that shows what the 
> ideal setting of the tunable should be rather than asking for trial-and-
> error to see what works and what doesn't.  If such an expression doesn't 
> exist, then it's clear that the necessary setting will vary significantly 
> as the implementation changes from kernel to kernel.

Hmm, trial and error is tuning itself, isn't it? When we tune a system,
we usually set some knobs, run some benchmarks/tests/etc., evaluate
the results and decide which is the appropriate value.

Regards,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-21 23:48                         ` Satoru Moriya
@ 2011-10-23 21:22                           ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-23 21:22 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Fri, 21 Oct 2011, Satoru Moriya wrote:

> We do.
> Basically we need this kind of feature for almost all our latency
> sensitive applications to avoid latency issue in memory allocation.
> 

These are all realtime?

> Currently we run those applications on custom kernels which this
> kind of patch is applied to. But it is hard for us to support every
> kernel version for it. Also there are several customers who can't
> accept a custom kernel and so they must use other commercial Unix.
> If this feature is accepted, they will definitely use it on their
> systems.
> 

That's precisely the problem: its behavior is going to vary widely from 
version to version as the implementation changes for reclaim and 
compaction.  I think we can do much better with the priority of kswapd and 
reclaiming above the high watermark for threads that need a surplus of 
extra memory because they are realtime, two things we can easily do.

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-23 21:22                           ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-23 21:22 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Fri, 21 Oct 2011, Satoru Moriya wrote:

> We do.
> Basically we need this kind of feature for almost all our latency
> sensitive applications to avoid latency issue in memory allocation.
> 

These are all realtime?

> Currently we run those applications on custom kernels which this
> kind of patch is applied to. But it is hard for us to support every
> kernel version for it. Also there are several customers who can't
> accept a custom kernel and so they must use other commercial Unix.
> If this feature is accepted, they will definitely use it on their
> systems.
> 

That's precisely the problem: its behavior is going to vary widely from 
version to version as the implementation changes for reclaim and 
compaction.  I think we can do much better with the priority of kswapd and 
reclaiming above the high watermark for threads that need a surplus of 
extra memory because they are realtime, two things we can easily do.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-23 21:22                           ` David Rientjes
@ 2011-10-25  2:04                             ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-25  2:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/23/2011 05:22 PM, David Rientjes wrote:
> On Fri, 21 Oct 2011, Satoru Moriya wrote:
> 
>> We do.
>> Basically we need this kind of feature for almost all our latency
>> sensitive applications to avoid latency issue in memory allocation.
>>
> 
> These are all realtime?

Do you mean that these are all realtime process?

If so, the answer depends on the situation. In some situations,
we can set these applications as rt-tasks. But in other situations,
e.g. when using middleware, packaged software etc., we can't set them
as rt-tasks because they are not built to run as rt-tasks. It is also
difficult to rebuild them to work as rt-tasks because they
usually have a huge code base.

>> Currently we run those applications on custom kernels which this
>> kind of patch is applied to. But it is hard for us to support every
>> kernel version for it. Also there are several customers who can't
>> accept a custom kernel and so they must use other commercial Unix.
>> If this feature is accepted, they will definitely use it on their
>> systems.
>>
> 
> That's precisely the problem, it's behavior is going to vary widely from 
> version to version as the implementation changes for reclaim and 
> compaction.  I think we can do much better with the priority of kswapd and 
> reclaiming above the high watermark for threads that need a surplus of 
> extra memory because they are realtime, two things we can easily do.

As I reported in another mail, changing the kswapd priority does not help
even my simple testcase very much. Of course, reclaiming above the high
wmark may solve the issue for some workloads, but if an application can
allocate more memory than the (extended) high wmark - min wmark gap, and
do so fast enough, the latency issue will still happen.
Unless this latency concern is fixed, customers won't use a vanilla
kernel.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-25  2:04                             ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-25  2:04 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/23/2011 05:22 PM, David Rientjes wrote:
> On Fri, 21 Oct 2011, Satoru Moriya wrote:
> 
>> We do.
>> Basically we need this kind of feature for almost all our latency
>> sensitive applications to avoid latency issue in memory allocation.
>>
> 
> These are all realtime?

Do you mean that these are all realtime process?

If so, the answer depends on the situation. In some situations,
we can set these applications as rt-tasks. But in other situations,
e.g. when using middleware, packaged software etc., we can't set them
as rt-tasks because they are not built to run as rt-tasks. It is also
difficult to rebuild them to work as rt-tasks because they
usually have a huge code base.

>> Currently we run those applications on custom kernels which this
>> kind of patch is applied to. But it is hard for us to support every
>> kernel version for it. Also there are several customers who can't
>> accept a custom kernel and so they must use other commercial Unix.
>> If this feature is accepted, they will definitely use it on their
>> systems.
>>
> 
> That's precisely the problem, it's behavior is going to vary widely from 
> version to version as the implementation changes for reclaim and 
> compaction.  I think we can do much better with the priority of kswapd and 
> reclaiming above the high watermark for threads that need a surplus of 
> extra memory because they are realtime, two things we can easily do.

As I reported in another mail, changing the kswapd priority does not help
even my simple testcase very much. Of course, reclaiming above the high
wmark may solve the issue for some workloads, but if an application can
allocate more memory than the (extended) high wmark - min wmark gap, and
do so fast enough, the latency issue will still happen.
Unless this latency concern is fixed, customers won't use a vanilla
kernel.

Regards,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-25  2:04                             ` Satoru Moriya
@ 2011-10-25 21:50                               ` David Rientjes
  -1 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-25 21:50 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Mon, 24 Oct 2011, Satoru Moriya wrote:

> >> We do.
> >> Basically we need this kind of feature for almost all our latency
> >> sensitive applications to avoid latency issue in memory allocation.
> >>
> > 
> > These are all realtime?
> 
> Do you mean that these are all realtime process?
> 
> If so, answer is depending on the situation. In the some situations,
> we can set these applications as rt-task. But the other situation,
> e.g. using some middlewares, package softwares etc, we can't set them
> as rt-task because they are not built for running as rt-task. And also
> it is difficult to rebuilt them for working as rt-task because they
> usually have huge code base.
> 

If this problem affects processes that aren't realtime, then your only 
option is to increase /proc/sys/vm/min_free_kbytes.  It's unreasonable to 
believe that the VM should be able to reclaim in the background at the 
same rate that an application is allocating huge amounts of memory without 
allowing there to be a buffer.  Adding another tunable isn't going to 
address that situation better than min_free_kbytes.

> As I reported another mail, changing kswapd priority does not mitigate
> even my simple testcase very much. Of course, reclaiming above the high
> wmark may solve the issue on some workloads but if an application can
> allocate memory more than high wmark - min wmark which is extended and
> fast enough, latency issue will happen.
> Unless this latency concern is fixed, customers doesn't use vanilla
> kernel.
> 

And you have yet to provide an expression that shows what a sane setting 
for this tunable will be.  In fact, it seems like you're just doing trial 
and error and finding where it works pretty well for a certain VM 
implementation in a certain kernel.  That's simply not a maintainable 
userspace interface!

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-25 21:50                               ` David Rientjes
  0 siblings, 0 replies; 100+ messages in thread
From: David Rientjes @ 2011-10-25 21:50 UTC (permalink / raw)
  To: Satoru Moriya
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On Mon, 24 Oct 2011, Satoru Moriya wrote:

> >> We do.
> >> Basically we need this kind of feature for almost all our latency
> >> sensitive applications to avoid latency issue in memory allocation.
> >>
> > 
> > These are all realtime?
> 
> Do you mean that these are all realtime process?
> 
> If so, answer is depending on the situation. In the some situations,
> we can set these applications as rt-task. But the other situation,
> e.g. using some middlewares, package softwares etc, we can't set them
> as rt-task because they are not built for running as rt-task. And also
> it is difficult to rebuilt them for working as rt-task because they
> usually have huge code base.
> 

If this problem affects processes that aren't realtime, then your only 
option is to increase /proc/sys/vm/min_free_kbytes.  It's unreasonable to 
believe that the VM should be able to reclaim in the background at the 
same rate that an application is allocating huge amounts of memory without 
allowing there to be a buffer.  Adding another tunable isn't going to 
address that situation better than min_free_kbytes.

> As I reported another mail, changing kswapd priority does not mitigate
> even my simple testcase very much. Of course, reclaiming above the high
> wmark may solve the issue on some workloads but if an application can
> allocate memory more than high wmark - min wmark which is extended and
> fast enough, latency issue will happen.
> Unless this latency concern is fixed, customers doesn't use vanilla
> kernel.
> 

And you have yet to provide an expression that shows what a sane setting 
for this tunable will be.  In fact, it seems like you're just doing trial 
and error and finding where it works pretty well for a certain VM 
implementation in a certain kernel.  That's simply not a maintainable 
userspace interface!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
  2011-10-25 21:50                               ` David Rientjes
@ 2011-10-26 18:59                                 ` Satoru Moriya
  -1 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-26 18:59 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/25/2011 05:50 PM, David Rientjes wrote:
> On Mon, 24 Oct 2011, Satoru Moriya wrote:
> 
>>>> We do.
>>>> Basically we need this kind of feature for almost all our latency
>>>> sensitive applications to avoid latency issue in memory allocation.
>>>
>>> These are all realtime?
>>
>> Do you mean that these are all realtime process?
>>
>> If so, answer is depending on the situation. In the some situations,
>> we can set these applications as rt-task. But the other situation,
>> e.g. using some middlewares, package softwares etc, we can't set them
>> as rt-task because they are not built for running as rt-task. And also
>> it is difficult to rebuilt them for working as rt-task because they
>> usually have huge code base.
>>
> 
> If this problem affects processes that aren't realtime, then your only 
> option is to increase /proc/sys/vm/min_free_kbytes.  It's unreasonable to 
> believe that the VM should be able to reclaim in the background at the 
> same rate that an application is allocating huge amounts of memory without 
> allowing there to be a buffer.  Adding another tunable isn't going to 
> address that situation better than min_free_kbytes.


Even if allocating memory in user space causes latency issues, usually
allocation itself doesn't continue for a long time. Therefore if we
can keep enough free memory, we can avoid latency issue in this situation.

min_free_kbytes makes the min wmark bigger too. That means the amount of
memory user processes can use without penalty (direct reclaim) decreases
unnecessarily, which is what we'd like to avoid.


>> As I reported another mail, changing kswapd priority does not mitigate
>> even my simple testcase very much. Of course, reclaiming above the high
>> wmark may solve the issue on some workloads but if an application can
>> allocate memory more than high wmark - min wmark which is extended and
>> fast enough, latency issue will happen.
>> Unless this latency concern is fixed, customers doesn't use vanilla
>> kernel.
>>
> And you have yet to provide an expression that shows what a sane setting 
> for this tunable will be.  In fact, it seems like you're just doing trial 
> and error and finding where it works pretty well for a certain VM 
> implementation in a certain kernel.  That's simply not a maintainable 
> userspace interface!


Trial and error is tuning itself. When we tune a system, we usually set
some knobs, run some benchmarks/tests/etc., evaluate the results and
decide which is the best configuration.

Regards,
Satoru

^ permalink raw reply	[flat|nested] 100+ messages in thread

* RE: [PATCH -v2 -mm] add extra free kbytes tunable
@ 2011-10-26 18:59                                 ` Satoru Moriya
  0 siblings, 0 replies; 100+ messages in thread
From: Satoru Moriya @ 2011-10-26 18:59 UTC (permalink / raw)
  To: David Rientjes
  Cc: Rik van Riel, Andrew Morton, Randy Dunlap, Satoru Moriya,
	linux-kernel, linux-mm, lwoodman, Seiji Aguchi, hughd, hannes

On 10/25/2011 05:50 PM, David Rientjes wrote:
> On Mon, 24 Oct 2011, Satoru Moriya wrote:
> 
>>>> We do.
>>>> Basically we need this kind of feature for almost all our latency
>>>> sensitive applications to avoid latency issue in memory allocation.
>>>
>>> These are all realtime?
>>
>> Do you mean that these are all realtime process?
>>
>> If so, answer is depending on the situation. In the some situations,
>> we can set these applications as rt-task. But the other situation,
>> e.g. using some middlewares, package softwares etc, we can't set them
>> as rt-task because they are not built for running as rt-task. And also
>> it is difficult to rebuilt them for working as rt-task because they
>> usually have huge code base.
>>
> 
> If this problem affects processes that aren't realtime, then your only 
> option is to increase /proc/sys/vm/min_free_kbytes.  It's unreasonable to 
> believe that the VM should be able to reclaim in the background at the 
> same rate that an application is allocating huge amounts of memory without 
> allowing there to be a buffer.  Adding another tunable isn't going to 
> address that situation better than min_free_kbytes.


Even if allocating memory in user space causes latency issues, usually
allocation itself doesn't continue for a long time. Therefore if we
can keep enough free memory, we can avoid latency issue in this situation.

min_free_kbytes makes the min wmark bigger too. That means the amount of
memory user processes can use without penalty (direct reclaim) decreases
unnecessarily, which is what we'd like to avoid.


>> As I reported another mail, changing kswapd priority does not mitigate
>> even my simple testcase very much. Of course, reclaiming above the high
>> wmark may solve the issue on some workloads but if an application can
>> allocate memory more than high wmark - min wmark which is extended and
>> fast enough, latency issue will happen.
>> Unless this latency concern is fixed, customers doesn't use vanilla
>> kernel.
>>
> And you have yet to provide an expression that shows what a sane setting 
> for this tunable will be.  In fact, it seems like you're just doing trial 
> and error and finding where it works pretty well for a certain VM 
> implementation in a certain kernel.  That's simply not a maintainable 
> userspace interface!


Trial and error is tuning itself. When we tune a system, we usually set
some knobs, run some benchmarks/tests/etc., evaluate the results and
decide which is the best configuration.

Regards,
Satoru
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 100+ messages in thread

end of thread, other threads:[~2011-10-26 19:00 UTC | newest]

Thread overview: 100+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-09-01 14:52 [PATCH -mm] add extra free kbytes tunable Rik van Riel
2011-09-01 14:52 ` Rik van Riel
2011-09-01 17:06 ` Randy Dunlap
2011-09-01 17:06   ` Randy Dunlap
2011-09-01 19:26   ` [PATCH -v2 " Rik van Riel
2011-09-01 19:26     ` Rik van Riel
2011-09-01 21:58     ` Andrew Morton
2011-09-01 21:58       ` Andrew Morton
2011-09-01 22:08       ` David Rientjes
2011-09-01 22:08         ` David Rientjes
2011-09-01 22:16         ` Andrew Morton
2011-09-01 22:16           ` Andrew Morton
2011-09-02 16:31       ` Satoru Moriya
2011-09-02 16:31         ` Satoru Moriya
2011-10-13  7:33         ` Minchan Kim
2011-10-13  7:33           ` Minchan Kim
2011-10-13  8:09           ` KAMEZAWA Hiroyuki
2011-10-13  8:09             ` KAMEZAWA Hiroyuki
     [not found]       ` <E1FA588BC672D846BDBB452FCA1E308C2389B4@USINDEVS02.corp.hds.com>
2011-09-15  3:33         ` Satoru Moriya
2011-09-15  3:33           ` Satoru Moriya
2011-09-01 22:09     ` Andrew Morton
2011-09-01 22:09       ` Andrew Morton
2011-09-02 16:26       ` [PATCH -mm] fixes & cleanups for "add extra free kbytes tunable" Rik van Riel
2011-09-02 16:26         ` Rik van Riel
2011-09-30 21:43     ` [PATCH -v2 -mm] add extra free kbytes tunable Johannes Weiner
2011-09-30 21:43       ` Johannes Weiner
2011-10-08  3:08     ` David Rientjes
2011-10-08  3:08       ` David Rientjes
2011-10-10 22:37       ` Andrew Morton
2011-10-10 22:37         ` Andrew Morton
2011-10-11 19:32         ` Satoru Moriya
2011-10-11 19:32           ` Satoru Moriya
2011-10-11 19:54           ` Andrew Morton
2011-10-11 19:54             ` Andrew Morton
2011-10-11 20:23             ` Satoru Moriya
2011-10-11 20:23               ` Satoru Moriya
2011-10-11 20:54               ` Andrew Morton
2011-10-11 20:54                 ` Andrew Morton
2011-10-12 13:09                 ` Rik van Riel
2011-10-12 13:09                   ` Rik van Riel
2011-10-12 19:20                   ` Andrew Morton
2011-10-12 19:20                     ` Andrew Morton
2011-10-12 19:58                     ` Rik van Riel
2011-10-12 19:58                       ` Rik van Riel
2011-10-12 20:26                       ` David Rientjes
2011-10-12 20:26                         ` David Rientjes
2011-10-21 23:48                       ` Satoru Moriya
2011-10-21 23:48                         ` Satoru Moriya
2011-10-23 21:22                         ` David Rientjes
2011-10-23 21:22                           ` David Rientjes
2011-10-25  2:04                           ` Satoru Moriya
2011-10-25  2:04                             ` Satoru Moriya
2011-10-25 21:50                             ` David Rientjes
2011-10-25 21:50                               ` David Rientjes
2011-10-26 18:59                               ` Satoru Moriya
2011-10-26 18:59                                 ` Satoru Moriya
2011-10-12 21:08                 ` Satoru Moriya
2011-10-12 21:08                   ` Satoru Moriya
2011-10-12 22:41                   ` David Rientjes
2011-10-12 22:41                     ` David Rientjes
2011-10-12 23:52                     ` Satoru Moriya
2011-10-12 23:52                       ` Satoru Moriya
2011-10-13  0:01                       ` David Rientjes
2011-10-13  0:01                         ` David Rientjes
2011-10-13  5:35                         ` KAMEZAWA Hiroyuki
2011-10-13  5:35                           ` KAMEZAWA Hiroyuki
2011-10-13 20:55                           ` David Rientjes
2011-10-13 20:55                             ` David Rientjes
2011-10-14 22:16                             ` Satoru Moriya
2011-10-14 22:16                               ` Satoru Moriya
2011-10-14 22:46                               ` David Rientjes
2011-10-14 22:46                                 ` David Rientjes
2011-10-14  5:32                           ` Satoru Moriya
2011-10-14  5:32                             ` Satoru Moriya
2011-10-14  5:06                         ` Satoru Moriya
2011-10-14  5:06                           ` Satoru Moriya
2011-10-11 23:22           ` David Rientjes
2011-10-11 23:22             ` David Rientjes
2011-10-13 16:54             ` Satoru Moriya
2011-10-13 16:54               ` Satoru Moriya
2011-10-13 20:48               ` David Rientjes
2011-10-13 20:48                 ` David Rientjes
2011-10-13 21:11                 ` Rik van Riel
2011-10-13 21:11                   ` Rik van Riel
2011-10-13 22:02                   ` David Rientjes
2011-10-13 22:02                     ` David Rientjes
2011-10-11 19:20       ` Satoru Moriya
2011-10-11 19:20         ` Satoru Moriya
2011-10-11 21:04         ` David Rientjes
2011-10-11 21:04           ` David Rientjes
2011-10-12 13:13           ` Rik van Riel
2011-10-12 13:13             ` Rik van Riel
2011-10-12 20:21             ` David Rientjes
2011-10-12 20:21               ` David Rientjes
2011-10-13  4:13               ` Rik van Riel
2011-10-13  4:13                 ` Rik van Riel
2011-10-13  5:22                 ` David Rientjes
2011-10-13  5:22                   ` David Rientjes
2011-10-22  0:11                   ` Satoru Moriya
2011-10-22  0:11                     ` Satoru Moriya

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.