linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP when adjusting
@ 2020-02-10 19:01 Mike Kravetz
  2020-02-10 22:19 ` Andrew Morton
  0 siblings, 1 reply; 3+ messages in thread
From: Mike Kravetz @ 2020-02-10 19:01 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Song Liu, Kirill A . Shutemov, Mel Gorman, Vlastimil Babka,
	Khalid Aziz, Matthew Wilcox, David Rientjes, Andrew Morton,
	Mike Kravetz

The value of min_free_kbytes is calculated in two routines:
1) init_per_zone_wmark_min based on available memory
2) set_recommended_min_free_kbytes may reserve extra space for
   THP allocations

In both of these routines, a user defined min_free_kbytes value will
be overwritten if the value calculated in the code is larger. No message
is logged if the user value is overwritten.

Change code to never overwrite user defined value.  However, do log a
message (once per value) showing the value calculated in code.

At system initialization time, both init_per_zone_wmark_min and
set_recommended_min_free_kbytes are called to set the initial value
for min_free_kbytes.  When memory is offlined or onlined, min_free_kbytes
is recalculated and adjusted based on the amount of memory.  However,
the adjustment for THP is not considered.  Here is an example from a 2
node system with 8GB of memory.

 # cat /proc/sys/vm/min_free_kbytes
 90112
 # echo 0 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11243
 # echo 1 > /sys/devices/system/node/node1/memory56/online
 # cat /proc/sys/vm/min_free_kbytes
 11412

One would expect that min_free_kbytes would return to it's original
value after the offline/online operations.

Create a simple interface for THP/khugepaged based adjustment and
call this whenever min_free_kbytes is adjusted.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
---
v2 - Don't ever overwrite user defined value.  Always log message if we
     calculate a value different than user value.  As in existing code,
     do not attempt to validate user defined values.

 include/linux/khugepaged.h |  5 ++++
 mm/internal.h              |  2 ++
 mm/khugepaged.c            | 56 ++++++++++++++++++++++++++++++++------
 mm/page_alloc.c            | 35 ++++++++++++++++--------
 4 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index bc45ea1efbf7..faa92923dbe5 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 				      unsigned long vm_flags);
+extern int khugepaged_adjust_min_free_kbytes(int mfk_value);
 #ifdef CONFIG_SHMEM
 extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
 #else
@@ -81,6 +82,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
 {
 	return 0;
 }
+static inline int khugepaged_adjust_min_free_kbytes(int mfk_value)
+{
+	return 0;
+}
 static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
 					   unsigned long addr)
 {
diff --git a/mm/internal.h b/mm/internal.h
index 3cf20ab3ca01..57bbc157124e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,6 +164,8 @@ extern void prep_compound_page(struct page *page, unsigned int order);
 extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
+#define UNSET_USER_MFK_VALUE -1
+extern int calc_min_free_kbytes;
 
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b679908743cb..6af72cb7e337 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2138,8 +2138,9 @@ static int khugepaged(void *none)
 	return 0;
 }
 
-static void set_recommended_min_free_kbytes(void)
+int __khugepaged_adjust_min_free_kbytes(int mfk_value)
 {
+	int ret = 0;
 	struct zone *zone;
 	int nr_zones = 0;
 	unsigned long recommended_min;
@@ -2172,19 +2173,40 @@ static void set_recommended_min_free_kbytes(void)
 			      (unsigned long) nr_free_buffer_pages() / 20);
 	recommended_min <<= (PAGE_SHIFT-10);
 
-	if (recommended_min > min_free_kbytes) {
-		if (user_min_free_kbytes >= 0)
-			pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
-				min_free_kbytes, recommended_min);
+	if (recommended_min > mfk_value)
+		ret = (int)recommended_min - mfk_value;
+
+	return ret;
+}
 
-		min_free_kbytes = recommended_min;
+static void set_recommended_min_free_kbytes(void)
+{
+	int av = __khugepaged_adjust_min_free_kbytes(min_free_kbytes);
+
+	if (av) {
+		if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
+			/* Do not change user defined value. */
+			if ((min_free_kbytes + av) != calc_min_free_kbytes) {
+				/*
+				 * Save calculated value so we only print
+				 * warning once per value.
+				 */
+				calc_min_free_kbytes = min_free_kbytes + av;
+				pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+					calc_min_free_kbytes,
+					user_min_free_kbytes);
+			}
+		} else {
+			min_free_kbytes += av;
+			setup_per_zone_wmarks();
+		}
 	}
-	setup_per_zone_wmarks();
 }
 
-int start_stop_khugepaged(void)
+static struct task_struct *khugepaged_thread __read_mostly;
+
+int __ref start_stop_khugepaged(void)
 {
-	static struct task_struct *khugepaged_thread __read_mostly;
 	static DEFINE_MUTEX(khugepaged_mutex);
 	int err = 0;
 
@@ -2207,8 +2229,24 @@ int start_stop_khugepaged(void)
 	} else if (khugepaged_thread) {
 		kthread_stop(khugepaged_thread);
 		khugepaged_thread = NULL;
+		init_per_zone_wmark_min();
 	}
 fail:
 	mutex_unlock(&khugepaged_mutex);
 	return err;
 }
+
+int khugepaged_adjust_min_free_kbytes(int mfk_value)
+{
+	int ret = 0;
+
+	/*
+	 * This is a bit racy, and we could miss transitions.  However,
+	 * start/stop code above will make additional adjustments at the
+	 * end of transitions.
+	 */
+	if (khugepaged_enabled() && khugepaged_thread)
+		ret = __khugepaged_adjust_min_free_kbytes(mfk_value);
+
+	return ret;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d047bf7d8fd4..73162a5bf296 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
+#include <linux/khugepaged.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -314,7 +315,8 @@ compound_page_dtor * const compound_page_dtors[] = {
 };
 
 int min_free_kbytes = 1024;
-int user_min_free_kbytes = -1;
+int user_min_free_kbytes = UNSET_USER_MFK_VALUE;
+int calc_min_free_kbytes = -1;
 #ifdef CONFIG_DISCONTIGMEM
 /*
  * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
@@ -7819,17 +7821,28 @@ int __meminit init_per_zone_wmark_min(void)
 
 	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
 	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
-
-	if (new_min_free_kbytes > user_min_free_kbytes) {
-		min_free_kbytes = new_min_free_kbytes;
-		if (min_free_kbytes < 128)
-			min_free_kbytes = 128;
-		if (min_free_kbytes > 65536)
-			min_free_kbytes = 65536;
-	} else {
-		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+	if (new_min_free_kbytes < 128)
+		new_min_free_kbytes = 128;
+	if (new_min_free_kbytes > 65536)
+		new_min_free_kbytes = 65536;
+	new_min_free_kbytes +=
+		khugepaged_adjust_min_free_kbytes(new_min_free_kbytes);
+
+	if (user_min_free_kbytes != UNSET_USER_MFK_VALUE) {
+		/* Do not change user defined value. */
+		if (new_min_free_kbytes != calc_min_free_kbytes) {
+			/*
+			 * Save calculated value so we only print warning
+			 * once per value.
+			 */
+			calc_min_free_kbytes = new_min_free_kbytes;
+			pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
 				new_min_free_kbytes, user_min_free_kbytes);
+		}
+		goto out;
 	}
+
+	min_free_kbytes = new_min_free_kbytes;
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
@@ -7838,7 +7851,7 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_min_unmapped_ratio();
 	setup_min_slab_ratio();
 #endif
-
+out:
 	return 0;
 }
 core_initcall(init_per_zone_wmark_min)
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP when adjusting
  2020-02-10 19:01 [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP when adjusting Mike Kravetz
@ 2020-02-10 22:19 ` Andrew Morton
  2020-02-11  0:02   ` Mike Kravetz
  0 siblings, 1 reply; 3+ messages in thread
From: Andrew Morton @ 2020-02-10 22:19 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-mm, linux-kernel, Song Liu, Kirill A . Shutemov,
	Mel Gorman, Vlastimil Babka, Khalid Aziz, Matthew Wilcox,
	David Rientjes

On Mon, 10 Feb 2020 11:01:21 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> The value of min_free_kbytes is calculated in two routines:
> 1) init_per_zone_wmark_min based on available memory
> 2) set_recommended_min_free_kbytes may reserve extra space for
>    THP allocations
> 
> In both of these routines, a user defined min_free_kbytes value will
> be overwritten if the value calculated in the code is larger. No message
> is logged if the user value is overwritten.

Could we provide a detailed description of why this is considered to be
a problem?  This is fairly easily guessable, but is there a real
in-field bad user experience we can point at?

> Change code to never overwrite user defined value.  However, do log a
> message (once per value) showing the value calculated in code.
> 
> At system initialization time, both init_per_zone_wmark_min and
> set_recommended_min_free_kbytes are called to set the initial value
> for min_free_kbytes.  When memory is offlined or onlined, min_free_kbytes
> is recalculated and adjusted based on the amount of memory.  However,
> the adjustment for THP is not considered.  Here is an example from a 2
> node system with 8GB of memory.
> 
>  # cat /proc/sys/vm/min_free_kbytes
>  90112
>  # echo 0 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11243
>  # echo 1 > /sys/devices/system/node/node1/memory56/online
>  # cat /proc/sys/vm/min_free_kbytes
>  11412
> 
> One would expect that min_free_kbytes would return to it's original
> value after the offline/online operations.
> 
> Create a simple interface for THP/khugepaged based adjustment and
> call this whenever min_free_kbytes is adjusted.
> 
> ...
>
>  include/linux/khugepaged.h |  5 ++++
>  mm/internal.h              |  2 ++
>  mm/khugepaged.c            | 56 ++++++++++++++++++++++++++++++++------
>  mm/page_alloc.c            | 35 ++++++++++++++++--------

min_free_kbytes gets a few mentions in Documentation/.  Should we make
the appropriate updates there to bring this behavior to people's
attention?


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP when adjusting
  2020-02-10 22:19 ` Andrew Morton
@ 2020-02-11  0:02   ` Mike Kravetz
  0 siblings, 0 replies; 3+ messages in thread
From: Mike Kravetz @ 2020-02-11  0:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, Song Liu, Kirill A.Shutemov, Mel Gorman,
	Vlastimil Babka, Khalid Aziz, Matthew Wilcox, David Rientjes

On 2/10/20 2:19 PM, Andrew Morton wrote:
> On Mon, 10 Feb 2020 11:01:21 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
>> The value of min_free_kbytes is calculated in two routines:
>> 1) init_per_zone_wmark_min based on available memory
>> 2) set_recommended_min_free_kbytes may reserve extra space for
>>    THP allocations
>>
>> In both of these routines, a user defined min_free_kbytes value will
>> be overwritten if the value calculated in the code is larger. No message
>> is logged if the user value is overwritten.
> 
> Could we provide a detailed description of why this is considered to be
> a problem?  This is fairly easily guessable, but is there a real
> in-field bad user experience we can point at?

I do not have an example of a bad user experience.  This observation came
about when I was modifying min_free_kbytes while working on some other
issue.  To me, it seems like the current behavior is not what one would
expect.  Logging messages when we do not overwrite the user value and not
logging anything when we do seems a bit backward.

Changing this is not important to me.  It has been like this for quite some
time and I am not aware of any user complaints/issues.

>> Change code to never overwrite user defined value.  However, do log a
>> message (once per value) showing the value calculated in code.
>>
>> At system initialization time, both init_per_zone_wmark_min and
>> set_recommended_min_free_kbytes are called to set the initial value
>> for min_free_kbytes.  When memory is offlined or onlined, min_free_kbytes
>> is recalculated and adjusted based on the amount of memory.  However,
>> the adjustment for THP is not considered.  Here is an example from a 2
>> node system with 8GB of memory.
>>
>>  # cat /proc/sys/vm/min_free_kbytes
>>  90112
>>  # echo 0 > /sys/devices/system/node/node1/memory56/online
>>  # cat /proc/sys/vm/min_free_kbytes
>>  11243
>>  # echo 1 > /sys/devices/system/node/node1/memory56/online
>>  # cat /proc/sys/vm/min_free_kbytes
>>  11412
>>
>> One would expect that min_free_kbytes would return to it's original
>> value after the offline/online operations.
>>
>> Create a simple interface for THP/khugepaged based adjustment and
>> call this whenever min_free_kbytes is adjusted.
>>
>> ...
>>
>>  include/linux/khugepaged.h |  5 ++++
>>  mm/internal.h              |  2 ++
>>  mm/khugepaged.c            | 56 ++++++++++++++++++++++++++++++++------
>>  mm/page_alloc.c            | 35 ++++++++++++++++--------
> 
> min_free_kbytes gets a few mentions in Documentation/.  Should we make
> the appropriate updates there to bring this behavior to people's
> attention?

I'm happy to update the documentation to match current behavior.  Changing
the documentation may prompt people to ask if we should change the code. :)
-- 
Mike Kravetz

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-02-11  0:05 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-10 19:01 [PATCH v2] mm: Don't overwrite user min_free_kbytes, consider THP when adjusting Mike Kravetz
2020-02-10 22:19 ` Andrew Morton
2020-02-11  0:02   ` Mike Kravetz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).