linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] mm: scale kswapd watermarks in proportion to memory
@ 2016-02-22 23:33 Johannes Weiner
  2016-02-23  1:36 ` Rik van Riel
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Johannes Weiner @ 2016-02-22 23:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Mel Gorman, Rik van Riel, linux-mm, linux-kernel, kernel-team

In machines with 140G of memory and enterprise flash storage, we have
seen read and write bursts routinely exceed the kswapd watermarks and
cause thundering herds in direct reclaim. Unfortunately, the only way
to tune kswapd aggressiveness is through adjusting min_free_kbytes -
the system's emergency reserves - which is entirely unrelated to the
system's latency requirements. In order to get kswapd to maintain a
250M buffer of free memory, the emergency reserves need to be set to
1G. That is a lot of memory wasted for no good reason.

On the other hand, it's reasonable to assume that allocation bursts
and overall allocation concurrency scale with memory capacity, so it
makes sense to make kswapd aggressiveness a function of that as well.

Change the kswapd watermark scale factor from the currently fixed 25%
of the tunable emergency reserve to a tunable 0.001% of memory.

Beyond 1G of memory, this will produce bigger watermark steps than the
current formula in default settings. Ensure that the new formula never
chooses steps smaller than that, i.e. 25% of the emergency reserve.

On a 140G machine, this raises the default watermark steps - the
distance between min and low, and low and high - from 16M to 143M.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Mel Gorman <mgorman@suse.de>
---
 Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
 include/linux/mm.h          |  1 +
 include/linux/mmzone.h      |  2 ++
 kernel/sysctl.c             | 10 ++++++++++
 mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 58 insertions(+), 2 deletions(-)

v2: Ensure 25% of emergency reserves as a minimum on small machines -Rik

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89a887c..b02d940 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
 directory and inode objects. With vfs_cache_pressure=1000, it will look for
 ten times more freeable objects than there are.
 
+=============================================================
+
+watermark_scale_factor:
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.001% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
 ==============================================================
 
 zone_reclaim_mode:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0ad7af..d330cbb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1869,6 +1869,7 @@ extern void zone_pcp_reset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
+extern int watermark_scale_factor;
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 03cbdd9..85d6702 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -833,6 +833,8 @@ static inline int is_highmem(struct zone *zone)
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d479707..780769e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -1393,6 +1394,15 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "watermark_scale_factor",
+		.data		= &watermark_scale_factor,
+		.maxlen		= sizeof(watermark_scale_factor),
+		.mode		= 0644,
+		.proc_handler	= watermark_scale_factor_sysctl_handler,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0c3eba3..0f457bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -6330,8 +6331,17 @@ static void __setup_per_zone_wmarks(void)
 			zone->watermark[WMARK_MIN] = tmp;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		/*
+		 * Set the kswapd watermarks distance according to the
+		 * scale factor in proportion to available memory, but
+		 * ensure a minimum size on small systems.
+		 */
+		tmp = max_t(u64, tmp >> 2,
+			    mult_frac(zone->managed_pages,
+				      watermark_scale_factor, 10000));
+
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
 			high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6472,6 +6482,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_wmarks();
+
+	return 0;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
-- 
2.7.0

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-22 23:33 [PATCH v2] mm: scale kswapd watermarks in proportion to memory Johannes Weiner
@ 2016-02-23  1:36 ` Rik van Riel
  2016-02-23  1:53   ` Johannes Weiner
  2016-02-23  2:23 ` David Rientjes
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: Rik van Riel @ 2016-02-23  1:36 UTC (permalink / raw)
  To: Johannes Weiner, Andrew Morton
  Cc: Mel Gorman, linux-mm, linux-kernel, kernel-team

[-- Attachment #1: Type: text/plain, Size: 646 bytes --]

On Mon, 2016-02-22 at 15:33 -0800, Johannes Weiner wrote:

> Beyond 1G of memory, this will produce bigger watermark steps than 

Is that supposed to be beyond 16GB?

> the
> current formula in default settings. Ensure that the new formula
> never
> chooses steps smaller than that, i.e. 25% of the emergency reserve.
> 
> On a 140G machine, this raises the default watermark steps - the
> distance between min and low, and low and high - from 16M to 143M.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: Mel Gorman <mgorman@suse.de>

Acked-by: Rik van Riel <riel@redhat.com>

-- 
All Rights Reversed.


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-23  1:36 ` Rik van Riel
@ 2016-02-23  1:53   ` Johannes Weiner
  0 siblings, 0 replies; 8+ messages in thread
From: Johannes Weiner @ 2016-02-23  1:53 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Andrew Morton, Mel Gorman, linux-mm, linux-kernel, kernel-team

On Mon, Feb 22, 2016 at 08:36:33PM -0500, Rik van Riel wrote:
> On Mon, 2016-02-22 at 15:33 -0800, Johannes Weiner wrote:
> 
> > Beyond 1G of memory, this will produce bigger watermark steps than 
> 
> Is that supposed to be beyond 16GB?

The old formula is

  min = sqrt(kb * 16)
  low = min >> 2

and the new one is

  low = kb * 0.001

and

  sqrt(x/1024 * 16) >> 2 = x/1024 * 0.001

puts x at

  x = 1 << 30

Did I miss something?

> Acked-by: Rik van Riel <riel@redhat.com>

Thanks!

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-22 23:33 [PATCH v2] mm: scale kswapd watermarks in proportion to memory Johannes Weiner
  2016-02-23  1:36 ` Rik van Riel
@ 2016-02-23  2:23 ` David Rientjes
  2016-02-24  0:36   ` Johannes Weiner
  2016-02-24  0:39 ` David Rientjes
  2016-02-25  0:37 ` Joonsoo Kim
  3 siblings, 1 reply; 8+ messages in thread
From: David Rientjes @ 2016-02-23  2:23 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, linux-mm, linux-kernel,
	kernel-team

On Mon, 22 Feb 2016, Johannes Weiner wrote:

> In machines with 140G of memory and enterprise flash storage, we have
> seen read and write bursts routinely exceed the kswapd watermarks and
> cause thundering herds in direct reclaim. Unfortunately, the only way
> to tune kswapd aggressiveness is through adjusting min_free_kbytes -
> the system's emergency reserves - which is entirely unrelated to the
> system's latency requirements. In order to get kswapd to maintain a
> 250M buffer of free memory, the emergency reserves need to be set to
> 1G. That is a lot of memory wasted for no good reason.
> 
> On the other hand, it's reasonable to assume that allocation bursts
> and overall allocation concurrency scale with memory capacity, so it
> makes sense to make kswapd aggressiveness a function of that as well.
> 
> Change the kswapd watermark scale factor from the currently fixed 25%
> of the tunable emergency reserve to a tunable 0.001% of memory.
> 

Making this tunable independent of min_free_kbytes is great.

I'm wondering how the choice of 0.001% was picked for default?  One of my 
workstations currently has step sizes of about 0.0005% so this will be 
doubling the steps from min to low and low to high.  I'm not objecting to 
that since it's definitely in the right direction (more free memory) but I 
wonder if it will make a difference for some users.

> Beyond 1G of memory, this will produce bigger watermark steps than the
> current formula in default settings. Ensure that the new formula never
> chooses steps smaller than that, i.e. 25% of the emergency reserve.
> 
> On a 140G machine, this raises the default watermark steps - the
> distance between min and low, and low and high - from 16M to 143M.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: Mel Gorman <mgorman@suse.de>
> ---
>  Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
>  include/linux/mm.h          |  1 +
>  include/linux/mmzone.h      |  2 ++
>  kernel/sysctl.c             | 10 ++++++++++
>  mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
>  5 files changed, 58 insertions(+), 2 deletions(-)
> 
> v2: Ensure 25% of emergency reserves as a minimum on small machines -Rik
> 
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> index 89a887c..b02d940 100644
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
>  directory and inode objects. With vfs_cache_pressure=1000, it will look for
>  ten times more freeable objects than there are.
>  
> +=============================================================
> +
> +watermark_scale_factor:
> +
> +This factor controls the aggressiveness of kswapd. It defines the
> +amount of memory left in a node/system before kswapd is woken up and
> +how much memory needs to be free before kswapd goes back to sleep.
> +
> +The unit is in fractions of 10,000. The default value of 10 means the
> +distances between watermarks are 0.001% of the available memory in the
> +node/system. The maximum value is 1000, or 10% of memory.
> +

The effective maximum value can be different than the tunable, though,
correct?  It seems like you'd want to document why watermark_scale_factor
and the actual watermarks in /proc/zoneinfo may be different on some
systems.

> +A high rate of threads entering direct reclaim (allocstall) or kswapd
> +going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
> +that the number of free pages kswapd maintains for latency reasons is
> +too small for the allocation bursts occurring in the system. This knob
> +can then be used to tune kswapd aggressiveness accordingly.
> +
>  ==============================================================
>  
>  zone_reclaim_mode:
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a0ad7af..d330cbb 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1869,6 +1869,7 @@ extern void zone_pcp_reset(struct zone *zone);
>  
>  /* page_alloc.c */
>  extern int min_free_kbytes;
> +extern int watermark_scale_factor;
>  
>  /* nommu.c */
>  extern atomic_long_t mmap_pages_allocated;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 03cbdd9..85d6702 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -833,6 +833,8 @@ static inline int is_highmem(struct zone *zone)
>  struct ctl_table;
>  int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
>  					void __user *, size_t *, loff_t *);
> +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
> +					void __user *, size_t *, loff_t *);
>  extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
>  int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
>  					void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index d479707..780769e 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
>  static int __maybe_unused four = 4;
>  static unsigned long one_ul = 1;
>  static int one_hundred = 100;
> +static int one_thousand = 1000;
>  #ifdef CONFIG_PRINTK
>  static int ten_thousand = 10000;
>  #endif
> @@ -1393,6 +1394,15 @@ static struct ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  	{
> +		.procname	= "watermark_scale_factor",
> +		.data		= &watermark_scale_factor,
> +		.maxlen		= sizeof(watermark_scale_factor),
> +		.mode		= 0644,
> +		.proc_handler	= watermark_scale_factor_sysctl_handler,
> +		.extra1		= &one,
> +		.extra2		= &one_thousand,
> +	},
> +	{
>  		.procname	= "percpu_pagelist_fraction",
>  		.data		= &percpu_pagelist_fraction,
>  		.maxlen		= sizeof(percpu_pagelist_fraction),
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 0c3eba3..0f457bb 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
>  
>  int min_free_kbytes = 1024;
>  int user_min_free_kbytes = -1;
> +int watermark_scale_factor = 10;
>  
>  static unsigned long __meminitdata nr_kernel_pages;
>  static unsigned long __meminitdata nr_all_pages;
> @@ -6330,8 +6331,17 @@ static void __setup_per_zone_wmarks(void)
>  			zone->watermark[WMARK_MIN] = tmp;
>  		}
>  
> -		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
> -		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
> +		/*
> +		 * Set the kswapd watermarks distance according to the
> +		 * scale factor in proportion to available memory, but
> +		 * ensure a minimum size on small systems.
> +		 */
> +		tmp = max_t(u64, tmp >> 2,
> +			    mult_frac(zone->managed_pages,
> +				      watermark_scale_factor, 10000));
> +
> +		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
> +		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
>  
>  		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
>  			high_wmark_pages(zone) - low_wmark_pages(zone) -
> @@ -6472,6 +6482,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
>  	return 0;
>  }
>  
> +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
> +	void __user *buffer, size_t *length, loff_t *ppos)
> +{
> +	int rc;
> +
> +	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +	if (rc)
> +		return rc;
> +
> +	if (write)
> +		setup_per_zone_wmarks();
> +
> +	return 0;
> +}
> +
>  #ifdef CONFIG_NUMA
>  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
>  	void __user *buffer, size_t *length, loff_t *ppos)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-23  2:23 ` David Rientjes
@ 2016-02-24  0:36   ` Johannes Weiner
  0 siblings, 0 replies; 8+ messages in thread
From: Johannes Weiner @ 2016-02-24  0:36 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, linux-mm, linux-kernel,
	kernel-team

On Mon, Feb 22, 2016 at 06:23:19PM -0800, David Rientjes wrote:
> On Mon, 22 Feb 2016, Johannes Weiner wrote:
> 
> > In machines with 140G of memory and enterprise flash storage, we have
> > seen read and write bursts routinely exceed the kswapd watermarks and
> > cause thundering herds in direct reclaim. Unfortunately, the only way
> > to tune kswapd aggressiveness is through adjusting min_free_kbytes -
> > the system's emergency reserves - which is entirely unrelated to the
> > system's latency requirements. In order to get kswapd to maintain a
> > 250M buffer of free memory, the emergency reserves need to be set to
> > 1G. That is a lot of memory wasted for no good reason.
> > 
> > On the other hand, it's reasonable to assume that allocation bursts
> > and overall allocation concurrency scale with memory capacity, so it
> > makes sense to make kswapd aggressiveness a function of that as well.
> > 
> > Change the kswapd watermark scale factor from the currently fixed 25%
> > of the tunable emergency reserve to a tunable 0.001% of memory.
> > 
> 
> Making this tunable independent of min_free_kbytes is great.
> 
> I'm wondering how the choice of 0.001% was picked for default?  One of my 
> workstations currently has step sizes of about 0.0005% so this will be 
> doubling the steps from min to low and low to high.  I'm not objecting to 
> that since it's definitely in the right direction (more free memory) but I 
> wonder if it will make a difference for some users.

I wish it were a bit more scientific, but I basically picked an order
of magnitude that sounds like a reasonable balance between wasted
memory and expected allocation bursts before kswapd can ramp up.

On a 10G machine, a 10M latency buffer sounds adequate, whereas 1M
might get overwhelmed and 100M is almost certainly a waste of RAM.

> > Beyond 1G of memory, this will produce bigger watermark steps than the
> > current formula in default settings. Ensure that the new formula never
> > chooses steps smaller than that, i.e. 25% of the emergency reserve.
> > 
> > On a 140G machine, this raises the default watermark steps - the
> > distance between min and low, and low and high - from 16M to 143M.
> > 
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> > Acked-by: Mel Gorman <mgorman@suse.de>
> > ---
> >  Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
> >  include/linux/mm.h          |  1 +
> >  include/linux/mmzone.h      |  2 ++
> >  kernel/sysctl.c             | 10 ++++++++++
> >  mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
> >  5 files changed, 58 insertions(+), 2 deletions(-)
> > 
> > v2: Ensure 25% of emergency reserves as a minimum on small machines -Rik
> > 
> > diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> > index 89a887c..b02d940 100644
> > --- a/Documentation/sysctl/vm.txt
> > +++ b/Documentation/sysctl/vm.txt
> > @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
> >  directory and inode objects. With vfs_cache_pressure=1000, it will look for
> >  ten times more freeable objects than there are.
> >  
> > +=============================================================
> > +
> > +watermark_scale_factor:
> > +
> > +This factor controls the aggressiveness of kswapd. It defines the
> > +amount of memory left in a node/system before kswapd is woken up and
> > +how much memory needs to be free before kswapd goes back to sleep.
> > +
> > +The unit is in fractions of 10,000. The default value of 10 means the
> > +distances between watermarks are 0.001% of the available memory in the
> > +node/system. The maximum value is 1000, or 10% of memory.
> > +
> 
> The effective maximum value can be different than the tunable, though,
> correct?  It seems like you'd want to document why watermark_scale_factor
> and the actual watermarks in /proc/zoneinfo may be different on some
> systems.

You mean because of the enforced minimum? I wondered about that, but
it seems more like an implementation detail rather than part of the
API. I doubt that in practice anybody would intentionally set the
scale factor low enough for the kernel minimum to kick in.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-22 23:33 [PATCH v2] mm: scale kswapd watermarks in proportion to memory Johannes Weiner
  2016-02-23  1:36 ` Rik van Riel
  2016-02-23  2:23 ` David Rientjes
@ 2016-02-24  0:39 ` David Rientjes
  2016-02-25  0:37 ` Joonsoo Kim
  3 siblings, 0 replies; 8+ messages in thread
From: David Rientjes @ 2016-02-24  0:39 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, linux-mm, linux-kernel,
	kernel-team

On Mon, 22 Feb 2016, Johannes Weiner wrote:

> In machines with 140G of memory and enterprise flash storage, we have
> seen read and write bursts routinely exceed the kswapd watermarks and
> cause thundering herds in direct reclaim. Unfortunately, the only way
> to tune kswapd aggressiveness is through adjusting min_free_kbytes -
> the system's emergency reserves - which is entirely unrelated to the
> system's latency requirements. In order to get kswapd to maintain a
> 250M buffer of free memory, the emergency reserves need to be set to
> 1G. That is a lot of memory wasted for no good reason.
> 
> On the other hand, it's reasonable to assume that allocation bursts
> and overall allocation concurrency scale with memory capacity, so it
> makes sense to make kswapd aggressiveness a function of that as well.
> 
> Change the kswapd watermark scale factor from the currently fixed 25%
> of the tunable emergency reserve to a tunable 0.001% of memory.
> 
> Beyond 1G of memory, this will produce bigger watermark steps than the
> current formula in default settings. Ensure that the new formula never
> chooses steps smaller than that, i.e. 25% of the emergency reserve.
> 
> On a 140G machine, this raises the default watermark steps - the
> distance between min and low, and low and high - from 16M to 143M.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: Mel Gorman <mgorman@suse.de>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-22 23:33 [PATCH v2] mm: scale kswapd watermarks in proportion to memory Johannes Weiner
                   ` (2 preceding siblings ...)
  2016-02-24  0:39 ` David Rientjes
@ 2016-02-25  0:37 ` Joonsoo Kim
  2016-02-25 20:07   ` Johannes Weiner
  3 siblings, 1 reply; 8+ messages in thread
From: Joonsoo Kim @ 2016-02-25  0:37 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, linux-mm, linux-kernel,
	kernel-team

Hello, Johannes.

Just nitpick below.

On Mon, Feb 22, 2016 at 03:33:22PM -0800, Johannes Weiner wrote:
> In machines with 140G of memory and enterprise flash storage, we have
> seen read and write bursts routinely exceed the kswapd watermarks and
> cause thundering herds in direct reclaim. Unfortunately, the only way
> to tune kswapd aggressiveness is through adjusting min_free_kbytes -
> the system's emergency reserves - which is entirely unrelated to the
> system's latency requirements. In order to get kswapd to maintain a
> 250M buffer of free memory, the emergency reserves need to be set to
> 1G. That is a lot of memory wasted for no good reason.
> 
> On the other hand, it's reasonable to assume that allocation bursts
> and overall allocation concurrency scale with memory capacity, so it
> makes sense to make kswapd aggressiveness a function of that as well.
> 
> Change the kswapd watermark scale factor from the currently fixed 25%
> of the tunable emergency reserve to a tunable 0.001% of memory.

s/0.001%/0.1%

> Beyond 1G of memory, this will produce bigger watermark steps than the
> current formula in default settings. Ensure that the new formula never
> chooses steps smaller than that, i.e. 25% of the emergency reserve.
> 
> On a 140G machine, this raises the default watermark steps - the
> distance between min and low, and low and high - from 16M to 143M.
> 
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: Mel Gorman <mgorman@suse.de>
> ---
>  Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
>  include/linux/mm.h          |  1 +
>  include/linux/mmzone.h      |  2 ++
>  kernel/sysctl.c             | 10 ++++++++++
>  mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
>  5 files changed, 58 insertions(+), 2 deletions(-)
> 
> v2: Ensure 25% of emergency reserves as a minimum on small machines -Rik
> 
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> index 89a887c..b02d940 100644
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
>  directory and inode objects. With vfs_cache_pressure=1000, it will look for
>  ten times more freeable objects than there are.
>  
> +=============================================================
> +
> +watermark_scale_factor:
> +
> +This factor controls the aggressiveness of kswapd. It defines the
> +amount of memory left in a node/system before kswapd is woken up and
> +how much memory needs to be free before kswapd goes back to sleep.
> +
> +The unit is in fractions of 10,000. The default value of 10 means the
> +distances between watermarks are 0.001% of the available memory in the
> +node/system. The maximum value is 1000, or 10% of memory.

Ditto for 0.001%.

Thanks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] mm: scale kswapd watermarks in proportion to memory
  2016-02-25  0:37 ` Joonsoo Kim
@ 2016-02-25 20:07   ` Johannes Weiner
  0 siblings, 0 replies; 8+ messages in thread
From: Johannes Weiner @ 2016-02-25 20:07 UTC (permalink / raw)
  To: Joonsoo Kim
  Cc: Andrew Morton, Mel Gorman, Rik van Riel, linux-mm, linux-kernel,
	kernel-team

Hi Joonsoo,

On Thu, Feb 25, 2016 at 09:37:44AM +0900, Joonsoo Kim wrote:
> On Mon, Feb 22, 2016 at 03:33:22PM -0800, Johannes Weiner wrote:
> > In machines with 140G of memory and enterprise flash storage, we have
> > seen read and write bursts routinely exceed the kswapd watermarks and
> > cause thundering herds in direct reclaim. Unfortunately, the only way
> > to tune kswapd aggressiveness is through adjusting min_free_kbytes -
> > the system's emergency reserves - which is entirely unrelated to the
> > system's latency requirements. In order to get kswapd to maintain a
> > 250M buffer of free memory, the emergency reserves need to be set to
> > 1G. That is a lot of memory wasted for no good reason.
> > 
> > On the other hand, it's reasonable to assume that allocation bursts
> > and overall allocation concurrency scale with memory capacity, so it
> > makes sense to make kswapd aggressiveness a function of that as well.
> > 
> > Change the kswapd watermark scale factor from the currently fixed 25%
> > of the tunable emergency reserve to a tunable 0.001% of memory.
> 
> s/0.001%/0.1%

Of course, you are right. Thanks for pointing it out.

Andrew, I'm attaching a drop-in replacement for what you have, since
it includes fixing the changelog. But it might be easier to edit the
patch for these two instances in place.

> > @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
> >  directory and inode objects. With vfs_cache_pressure=1000, it will look for
> >  ten times more freeable objects than there are.
> >  
> > +=============================================================
> > +
> > +watermark_scale_factor:
> > +
> > +This factor controls the aggressiveness of kswapd. It defines the
> > +amount of memory left in a node/system before kswapd is woken up and
> > +how much memory needs to be free before kswapd goes back to sleep.
> > +
> > +The unit is in fractions of 10,000. The default value of 10 means the
> > +distances between watermarks are 0.001% of the available memory in the
> > +node/system. The maximum value is 1000, or 10% of memory.
> 
> Ditto for 0.001%.

>From 8e97efd64ef8491a7a4e7326f7c0d3cb7a5eb264 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 18 Feb 2016 10:04:21 -0500
Subject: [PATCH V3] mm: scale kswapd watermarks in proportion to memory

In machines with 140G of memory and enterprise flash storage, we have
seen read and write bursts routinely exceed the kswapd watermarks and
cause thundering herds in direct reclaim. Unfortunately, the only way
to tune kswapd aggressiveness is through adjusting min_free_kbytes -
the system's emergency reserves - which is entirely unrelated to the
system's latency requirements. In order to get kswapd to maintain a
250M buffer of free memory, the emergency reserves need to be set to
1G. That is a lot of memory wasted for no good reason.

On the other hand, it's reasonable to assume that allocation bursts
and overall allocation concurrency scale with memory capacity, so it
makes sense to make kswapd aggressiveness a function of that as well.

Change the kswapd watermark scale factor from the currently fixed 25%
of the tunable emergency reserve to a tunable 0.1% of memory.

Beyond 1G of memory, this will produce bigger watermark steps than the
current formula in default settings. Ensure that the new formula never
chooses steps smaller than that, i.e. 25% of the emergency reserve.

On a 140G machine, this raises the default watermark steps - the
distance between min and low, and low and high - from 16M to 143M.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
---
 Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
 include/linux/mm.h          |  1 +
 include/linux/mmzone.h      |  2 ++
 kernel/sysctl.c             | 10 ++++++++++
 mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89a887c..cb03684 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
 directory and inode objects. With vfs_cache_pressure=1000, it will look for
 ten times more freeable objects than there are.
 
+=============================================================
+
+watermark_scale_factor:
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
 ==============================================================
 
 zone_reclaim_mode:
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fe4d988..f2f4a6c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1896,6 +1896,7 @@ extern void zone_pcp_reset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
+extern int watermark_scale_factor;
 
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bdd9a27..c60df92 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone)
 struct ctl_table;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
+					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f930ec2..96ec234 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "watermark_scale_factor",
+		.data		= &watermark_scale_factor,
+		.maxlen		= sizeof(watermark_scale_factor),
+		.mode		= 0644,
+		.proc_handler	= watermark_scale_factor_sysctl_handler,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
 		.maxlen		= sizeof(percpu_pagelist_fraction),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cd427c..9bede0b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -6453,8 +6454,17 @@ static void __setup_per_zone_wmarks(void)
 			zone->watermark[WMARK_MIN] = tmp;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		/*
+		 * Set the kswapd watermarks distance according to the
+		 * scale factor in proportion to available memory, but
+		 * ensure a minimum size on small systems.
+		 */
+		tmp = max_t(u64, tmp >> 2,
+			    mult_frac(zone->managed_pages,
+				      watermark_scale_factor, 10000));
+
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
 			high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6551,6 +6561,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_wmarks();
+
+	return 0;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
-- 
2.7.1

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-02-25 20:07 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-22 23:33 [PATCH v2] mm: scale kswapd watermarks in proportion to memory Johannes Weiner
2016-02-23  1:36 ` Rik van Riel
2016-02-23  1:53   ` Johannes Weiner
2016-02-23  2:23 ` David Rientjes
2016-02-24  0:36   ` Johannes Weiner
2016-02-24  0:39 ` David Rientjes
2016-02-25  0:37 ` Joonsoo Kim
2016-02-25 20:07   ` Johannes Weiner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).