linux-mm.kvack.org archive mirror
* [PATCH 0/4] mm/page_reporting: Some knobs and fixes
@ 2021-03-26  9:44 Xunlei Pang
  2021-03-26  9:44 ` [PATCH 1/4] mm/page_reporting: Introduce free page reported counters Xunlei Pang
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Xunlei Pang @ 2021-03-26  9:44 UTC (permalink / raw)
  To: Andrew Morton, Alexander Duyck, Mel Gorman
  Cc: linux-kernel, linux-mm, Xunlei Pang

Add the following knobs in PATCH 1~3:
 /sys/kernel/mm/page_reporting/reported_kbytes
 /sys/kernel/mm/page_reporting/refault_kbytes
 /sys/kernel/mm/page_reporting/reporting_factor

Fix unexpected user OOM in PATCH 4.

Xunlei Pang (4):
  mm/page_reporting: Introduce free page reported counters
  mm/page_reporting: Introduce free page reporting factor
  mm/page_reporting: Introduce "page_reporting_factor=" boot parameter
  mm/page_reporting: Fix possible user allocation failure

 Documentation/admin-guide/kernel-parameters.txt |   3 +
 include/linux/mmzone.h                          |   3 +
 mm/page_alloc.c                                 |   6 +-
 mm/page_reporting.c                             | 268 ++++++++++++++++++++++--
 4 files changed, 260 insertions(+), 20 deletions(-)

-- 
1.8.3.1



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] mm/page_reporting: Introduce free page reported counters
  2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
@ 2021-03-26  9:44 ` Xunlei Pang
  2021-04-02 18:29   ` Alexander Duyck
  2021-03-26  9:44 ` [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor Xunlei Pang
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Xunlei Pang @ 2021-03-26  9:44 UTC (permalink / raw)
  To: Andrew Morton, Alexander Duyck, Mel Gorman
  Cc: linux-kernel, linux-mm, Xunlei Pang

It's useful to know how much memory has actually been reported,
so add a new zone::reported_pages field to record that.

Add "/sys/kernel/mm/page_reporting/reported_kbytes" to show the
amount of memory currently reported.

Add "/sys/kernel/mm/page_reporting/refault_kbytes" to show the
accumulated amount of memory that has refaulted in after being
reported out.

Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 include/linux/mmzone.h |   3 ++
 mm/page_alloc.c        |   4 +-
 mm/page_reporting.c    | 112 +++++++++++++++++++++++++++++++++++++++++++++++--
 mm/page_reporting.h    |   5 +++
 4 files changed, 119 insertions(+), 5 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946ce..ebd169f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -530,6 +530,9 @@ struct zone {
 	atomic_long_t		managed_pages;
 	unsigned long		spanned_pages;
 	unsigned long		present_pages;
+#ifdef CONFIG_PAGE_REPORTING
+	unsigned long		reported_pages;
+#endif
 #ifdef CONFIG_CMA
 	unsigned long		cma_pages;
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e4b29ee..c2c5688 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -930,8 +930,10 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 					   unsigned int order)
 {
 	/* clear reported state and update reported page count */
-	if (page_reported(page))
+	if (page_reported(page)) {
 		__ClearPageReported(page);
+		page_reporting_update_refault(zone, 1 << order);
+	}
 
 	list_del(&page->lru);
 	__ClearPageBuddy(page);
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index c50d93f..ba195ea 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/page_reporting.h>
@@ -19,6 +20,22 @@ enum {
 	PAGE_REPORTING_ACTIVE
 };
 
+#ifdef CONFIG_SYSFS
+static struct percpu_counter refault_pages;
+
+void page_reporting_update_refault(struct zone *zone, unsigned int pages)
+{
+	zone->reported_pages -= pages;
+	percpu_counter_add_batch(&refault_pages, pages, INT_MAX / 2);
+}
+#else
+void page_reporting_update_refault(struct zone *zone, unsigned int pages)
+{
+	zone->reported_pages -= pages;
+}
+#endif
+
+
 /* request page reporting */
 static void
 __page_reporting_request(struct page_reporting_dev_info *prdev)
@@ -66,7 +83,8 @@ void __page_reporting_notify(void)
 
 static void
 page_reporting_drain(struct page_reporting_dev_info *prdev,
-		     struct scatterlist *sgl, unsigned int nents, bool reported)
+		     struct scatterlist *sgl, struct zone *zone,
+		     unsigned int nents, bool reported)
 {
 	struct scatterlist *sg = sgl;
 
@@ -92,8 +110,10 @@ void __page_reporting_notify(void)
 		 * report on the new larger page when we make our way
 		 * up to that higher order.
 		 */
-		if (PageBuddy(page) && buddy_order(page) == order)
+		if (PageBuddy(page) && buddy_order(page) == order) {
 			__SetPageReported(page);
+			zone->reported_pages += (1 << order);
+		}
 	} while ((sg = sg_next(sg)));
 
 	/* reinitialize scatterlist now that it is empty */
@@ -197,7 +217,7 @@ void __page_reporting_notify(void)
 		spin_lock_irq(&zone->lock);
 
 		/* flush reported pages from the sg list */
-		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
+		page_reporting_drain(prdev, sgl, zone, PAGE_REPORTING_CAPACITY, !err);
 
 		/*
 		 * Reset next to first entry, the old next isn't valid
@@ -260,7 +280,7 @@ void __page_reporting_notify(void)
 
 		/* flush any remaining pages out from the last report */
 		spin_lock_irq(&zone->lock);
-		page_reporting_drain(prdev, sgl, leftover, !err);
+		page_reporting_drain(prdev, sgl, zone, leftover, !err);
 		spin_unlock_irq(&zone->lock);
 	}
 
@@ -362,3 +382,87 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
 	mutex_unlock(&page_reporting_mutex);
 }
 EXPORT_SYMBOL_GPL(page_reporting_unregister);
+
+#ifdef CONFIG_SYSFS
+#define REPORTING_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static unsigned long get_reported_kbytes(void)
+{
+	struct zone *z;
+	unsigned long nr_reported = 0;
+
+	for_each_populated_zone(z)
+		nr_reported += z->reported_pages;
+
+	return nr_reported << (PAGE_SHIFT - 10);
+}
+
+static ssize_t reported_kbytes_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", get_reported_kbytes());
+}
+
+static ssize_t reported_kbytes_store(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	return -EINVAL;
+}
+REPORTING_ATTR(reported_kbytes);
+
+static u64 get_refault_kbytes(void)
+{
+	u64 sum;
+
+	sum = percpu_counter_sum_positive(&refault_pages);
+	return sum << (PAGE_SHIFT - 10);
+}
+
+static ssize_t refault_kbytes_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%llu\n", get_refault_kbytes());
+}
+
+static ssize_t refault_kbytes_store(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	return -EINVAL;
+}
+REPORTING_ATTR(refault_kbytes);
+
+static struct attribute *reporting_attrs[] = {
+	&reported_kbytes_attr.attr,
+	&refault_kbytes_attr.attr,
+	NULL,
+};
+
+static struct attribute_group reporting_attr_group = {
+	.attrs = reporting_attrs,
+	.name = "page_reporting",
+};
+#endif
+
+static int __init page_reporting_init(void)
+{
+#ifdef CONFIG_SYSFS
+	int err;
+
+	if (percpu_counter_init(&refault_pages, 0, GFP_KERNEL))
+		panic("Failed to allocate refault_pages percpu counter\n");
+
+	err = sysfs_create_group(mm_kobj, &reporting_attr_group);
+	if (err) {
+		pr_err("%s: Unable to populate sysfs files\n", __func__);
+		return err;
+	}
+#endif
+
+	return 0;
+}
+
+module_init(page_reporting_init);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index 2c385dd..19549c7 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -44,11 +44,16 @@ static inline void page_reporting_notify_free(unsigned int order)
 	/* This will add a few cycles, but should be called infrequently */
 	__page_reporting_notify();
 }
+
+void page_reporting_update_refault(struct zone *zone, unsigned int pages);
 #else /* CONFIG_PAGE_REPORTING */
 #define page_reported(_page)	false
 
 static inline void page_reporting_notify_free(unsigned int order)
 {
 }
+
+static inline void
+page_reporting_update_refault(struct zone *zone, unsigned int pages) { }
 #endif /* CONFIG_PAGE_REPORTING */
 #endif /*_MM_PAGE_REPORTING_H */
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor
  2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
  2021-03-26  9:44 ` [PATCH 1/4] mm/page_reporting: Introduce free page reported counters Xunlei Pang
@ 2021-03-26  9:44 ` Xunlei Pang
  2021-04-02 18:56   ` Alexander Duyck
  2021-03-26  9:44 ` [PATCH 3/4] mm/page_reporting: Introduce "page_reporting_factor=" boot parameter Xunlei Pang
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Xunlei Pang @ 2021-03-26  9:44 UTC (permalink / raw)
  To: Andrew Morton, Alexander Duyck, Mel Gorman
  Cc: linux-kernel, linux-mm, Xunlei Pang

Add new "/sys/kernel/mm/page_reporting/reporting_factor"
within [0, 100], and stop page reporting when it reaches
the configured threshold. Default is 100 which means no
limitation is imposed. Percentile is adopted to reflect
the fact that it reports on the per-zone basis.

We can control the total number of reporting pages via
this knob to avoid EPT violations which may affect the
performance of the business, imagine the guest memory
allocation burst or host long-tail memory reclaiming
really hurt.

This knob can help make customized control policies according
to VM priority, it is also useful for testing, gray-release, etc.
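
With reporting_factor set to, say, 30 on a zone with 1048576 managed
pages (4 GiB), reporting stops once roughly 314572 pages (~1.2 GiB)
have been reported, since the per-zone threshold below works out as
(the numbers here are only an illustration):

 threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
 /* 1048576 * 30 / 100 == 314572 pages */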

Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 mm/page_reporting.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index ba195ea..86c6479 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -11,6 +11,8 @@
 #include "page_reporting.h"
 #include "internal.h"
 
+static int reporting_factor = 100;
+
 #define PAGE_REPORTING_DELAY	(2 * HZ)
 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
 
@@ -134,6 +136,7 @@ void __page_reporting_notify(void)
 	struct list_head *list = &area->free_list[mt];
 	unsigned int page_len = PAGE_SIZE << order;
 	struct page *page, *next;
+	unsigned long threshold;
 	long budget;
 	int err = 0;
 
@@ -144,6 +147,7 @@ void __page_reporting_notify(void)
 	if (list_empty(list))
 		return err;
 
+	threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
 	spin_lock_irq(&zone->lock);
 
 	/*
@@ -181,6 +185,8 @@ void __page_reporting_notify(void)
 
 		/* Attempt to pull page from list and place in scatterlist */
 		if (*offset) {
+			unsigned long nr_pages;
+
 			if (!__isolate_free_page(page, order)) {
 				next = page;
 				break;
@@ -190,6 +196,12 @@ void __page_reporting_notify(void)
 			--(*offset);
 			sg_set_page(&sgl[*offset], page, page_len, 0);
 
+			nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
+			if (zone->reported_pages + nr_pages >= threshold) {
+				err = 1;
+				break;
+			}
+
 			continue;
 		}
 
@@ -244,9 +256,13 @@ void __page_reporting_notify(void)
 			    struct scatterlist *sgl, struct zone *zone)
 {
 	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
-	unsigned long watermark;
+	unsigned long watermark, threshold;
 	int err = 0;
 
+	threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
+	if (zone->reported_pages >= threshold)
+		return err;
+
 	/* Generate minimum watermark to be able to guarantee progress */
 	watermark = low_wmark_pages(zone) +
 		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
@@ -267,11 +283,18 @@ void __page_reporting_notify(void)
 
 			err = page_reporting_cycle(prdev, zone, order, mt,
 						   sgl, &offset);
+			/* Exceed threshold go to report leftover */
+			if (err > 0) {
+				err = 0;
+				goto leftover;
+			}
+
 			if (err)
 				return err;
 		}
 	}
 
+leftover:
 	/* report the leftover pages before going idle */
 	leftover = PAGE_REPORTING_CAPACITY - offset;
 	if (leftover) {
@@ -435,9 +458,44 @@ static ssize_t refault_kbytes_store(struct kobject *kobj,
 }
 REPORTING_ATTR(refault_kbytes);
 
+static ssize_t reporting_factor_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", reporting_factor);
+}
+
+static ssize_t reporting_factor_store(struct kobject *kobj,
+		struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	int new, old, err;
+	struct page *page;
+
+	err = kstrtoint(buf, 10, &new);
+	if (err || (new < 0 || new > 100))
+		return -EINVAL;
+
+	old = reporting_factor;
+	reporting_factor = new;
+
+	if (new <= old)
+		goto out;
+
+	/* Trigger reporting with new larger reporting_factor */
+	page = alloc_pages(__GFP_HIGHMEM | __GFP_NOWARN,
+			PAGE_REPORTING_MIN_ORDER);
+	if (page)
+		__free_pages(page, PAGE_REPORTING_MIN_ORDER);
+
+out:
+	return count;
+}
+REPORTING_ATTR(reporting_factor);
+
 static struct attribute *reporting_attrs[] = {
 	&reported_kbytes_attr.attr,
 	&refault_kbytes_attr.attr,
+	&reporting_factor_attr.attr,
 	NULL,
 };
 
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/4] mm/page_reporting: Introduce "page_reporting_factor=" boot parameter
  2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
  2021-03-26  9:44 ` [PATCH 1/4] mm/page_reporting: Introduce free page reported counters Xunlei Pang
  2021-03-26  9:44 ` [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor Xunlei Pang
@ 2021-03-26  9:44 ` Xunlei Pang
  2021-03-26  9:44 ` [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure Xunlei Pang
  2021-04-02  4:08 ` [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
  4 siblings, 0 replies; 12+ messages in thread
From: Xunlei Pang @ 2021-03-26  9:44 UTC (permalink / raw)
  To: Andrew Morton, Alexander Duyck, Mel Gorman
  Cc: linux-kernel, linux-mm, Xunlei Pang

Currently the default behaviour (value 100) is to report everything.
Although this can be adjusted after boot via the "page_reporting_factor"
sysfs knob, that may be too late: some memory can already have been
reported before the knob is touched.

Sometimes we really want reporting safely off by default and enabled
as needed at runtime; the "page_reporting_factor=" boot parameter
provides that guarantee and accommodates different default settings.

There's also a real-world problem I noticed on tiny instances: some
memory is always reported during the boot stage, before applications
start and use it up, which retriggers EPT faults after boot.

The following data (taken right after boot) shows that 172032 KiB of
pages were unnecessarily reported and then refaulted in:
 $ cat /sys/kernel/mm/page_reporting/refault_kbytes
 172032
 $ cat /sys/kernel/mm/page_reporting/reported_kbytes
 0

Thus it's reasonable to turn page reporting off by default and
enable it at runtime as needed.

Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 +++
 mm/page_reporting.c                             | 13 +++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0454572..46e296c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3524,6 +3524,9 @@
 			off: turn off poisoning (default)
 			on: turn on poisoning
 
+	page_reporting_factor= [KNL] Guest free page reporting percentage.
+			[0, 100]: 0 - off, no reporting; 100 - default, full reporting.
+
 	panic=		[KNL] Kernel behaviour on panic: delay <timeout>
 			timeout > 0: seconds before rebooting
 			timeout = 0: wait forever
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 86c6479..6ffedb8 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -524,3 +524,16 @@ static int __init page_reporting_init(void)
 }
 
 module_init(page_reporting_init);
+
+static int __init setup_reporting_factor(char *str)
+{
+	int v;
+
+	if (kstrtoint(str, 10, &v))
+		return -EINVAL;
+	if (v >= 0 && v <= 100)
+		reporting_factor = v;
+
+	return 0;
+}
+__setup("page_reporting_factor=", setup_reporting_factor);
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure
  2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
                   ` (2 preceding siblings ...)
  2021-03-26  9:44 ` [PATCH 3/4] mm/page_reporting: Introduce "page_reporting_factor=" boot parameter Xunlei Pang
@ 2021-03-26  9:44 ` Xunlei Pang
  2021-04-02 19:55   ` Alexander Duyck
  2021-04-02  4:08 ` [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
  4 siblings, 1 reply; 12+ messages in thread
From: Xunlei Pang @ 2021-03-26  9:44 UTC (permalink / raw)
  To: Andrew Morton, Alexander Duyck, Mel Gorman
  Cc: linux-kernel, linux-mm, Xunlei Pang

We encountered user memory allocation failures (OOM) on our
512 MiB tiny instances; they did not happen after turning
page reporting off.

After some debugging, it turned out that 32*4MB=128MB of
order-10 free pages were isolated during the reporting window,
leaving no free memory available.

This might also happen on large instances when free memory
is low.

This patch introduces a rule that limits the reporting capacity
according to the current free memory, reducing it for the higher
orders that would otherwise break the rule.

For example, with 100MiB free, the sgl capacities for different
orders are:
   order-9 : 32
   order-10: 16
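
(Derivation of the example, assuming PAGE_REPORTING_MIN_ORDER == 9,
PAGE_REPORTING_CAPACITY == 32 and ignoring the low watermark: 100MiB
free is 25600 pages, and 32 << 10 == 32768 > 25600, so the reduction
starts at order-10, whose capacity becomes 32 >> (10 - 10 + 1) == 16,
while order-9 keeps the full 32 entries.)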

Reported-by: Helin Guo <helinguo@linux.alibaba.com>
Tested-by: Helin Guo <helinguo@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 mm/page_reporting.c | 89 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 72 insertions(+), 17 deletions(-)

diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 6ffedb8..2ec0ec0 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -129,8 +129,8 @@ void __page_reporting_notify(void)
  */
 static int
 page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
-		     unsigned int order, unsigned int mt,
-		     struct scatterlist *sgl, unsigned int *offset)
+		     unsigned int order, unsigned int mt, struct scatterlist *sgl,
+		     const unsigned int capacity, unsigned int *offset)
 {
 	struct free_area *area = &zone->free_area[order];
 	struct list_head *list = &area->free_list[mt];
@@ -161,10 +161,10 @@ void __page_reporting_notify(void)
 	 * list processed. This should result in us reporting all pages on
 	 * an idle system in about 30 seconds.
 	 *
-	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
-	 * should always be a power of 2.
+	 * The division here should be cheap since capacity should
+	 * always be a power of 2.
 	 */
-	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
+	budget = DIV_ROUND_UP(area->nr_free, capacity * 16);
 
 	/* loop through free list adding unreported pages to sg list */
 	list_for_each_entry_safe(page, next, list, lru) {
@@ -196,7 +196,7 @@ void __page_reporting_notify(void)
 			--(*offset);
 			sg_set_page(&sgl[*offset], page, page_len, 0);
 
-			nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
+			nr_pages = (capacity - *offset) << order;
 			if (zone->reported_pages + nr_pages >= threshold) {
 				err = 1;
 				break;
@@ -217,10 +217,10 @@ void __page_reporting_notify(void)
 		spin_unlock_irq(&zone->lock);
 
 		/* begin processing pages in local list */
-		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
+		err = prdev->report(prdev, sgl, capacity);
 
 		/* reset offset since the full list was reported */
-		*offset = PAGE_REPORTING_CAPACITY;
+		*offset = capacity;
 
 		/* update budget to reflect call to report function */
 		budget--;
@@ -229,7 +229,7 @@ void __page_reporting_notify(void)
 		spin_lock_irq(&zone->lock);
 
 		/* flush reported pages from the sg list */
-		page_reporting_drain(prdev, sgl, zone, PAGE_REPORTING_CAPACITY, !err);
+		page_reporting_drain(prdev, sgl, zone, capacity, !err);
 
 		/*
 		 * Reset next to first entry, the old next isn't valid
@@ -251,12 +251,39 @@ void __page_reporting_notify(void)
 	return err;
 }
 
+/*
+ * For guest with little free memory, we should tune reporting capacity
+ * correctly to avoid reporting too much once, otherwise user allocation
+ * may fail and OOM during reporting window between __isolate_free_page()
+ * and page_reporting_drain().
+ *
+ * Calculate from which order we begin to reduce the scatterlist capacity,
+ * in order not to isolate too many pages to fail the user allocation.
+ */
+static unsigned int calculate_zone_order_threshold(struct zone *z)
+{
+	unsigned int order;
+	long pages_threshold;
+
+	pages_threshold = zone_page_state(z, NR_FREE_PAGES) - low_wmark_pages(z);
+	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+		if ((PAGE_REPORTING_CAPACITY << order) > pages_threshold)
+			break;
+	}
+
+	return order;
+}
+
 static int
 page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 			    struct scatterlist *sgl, struct zone *zone)
 {
-	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
+	unsigned int order, mt, leftover, offset;
 	unsigned long watermark, threshold;
+	unsigned int capacity = PAGE_REPORTING_CAPACITY;
+	unsigned int capacity_curr;
+	struct scatterlist *sgl_curr;
+	unsigned int order_threshold;
 	int err = 0;
 
 	threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
@@ -274,15 +301,28 @@ void __page_reporting_notify(void)
 	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
 		return err;
 
+	sgl_curr = sgl;
+	capacity_curr = offset = capacity;
+	order_threshold = calculate_zone_order_threshold(zone);
 	/* Process each free list starting from lowest order/mt */
 	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+		/* try to reduce unexpected high order's reporting capacity */
+		if (order >= order_threshold) {
+			capacity_curr = capacity >> (order - order_threshold + 1);
+			if (capacity_curr == 0)
+				capacity_curr = 1;
+			sgl_curr = sgl + capacity - capacity_curr;
+			offset = capacity_curr;
+			sg_init_table(sgl_curr, capacity_curr);
+		}
+
 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 			/* We do not pull pages from the isolate free list */
 			if (is_migrate_isolate(mt))
 				continue;
 
 			err = page_reporting_cycle(prdev, zone, order, mt,
-						   sgl, &offset);
+						   sgl_curr, capacity_curr, &offset);
 			/* Exceed threshold go to report leftover */
 			if (err > 0) {
 				err = 0;
@@ -292,18 +332,34 @@ void __page_reporting_notify(void)
 			if (err)
 				return err;
 		}
+
+		/* report the leftover pages for next orders with reduced capacity */
+		leftover = capacity_curr - offset;
+		if (leftover && order + 1 >= order_threshold) {
+			sgl_curr = &sgl_curr[offset];
+			err = prdev->report(prdev, sgl_curr, leftover);
+			offset = capacity_curr;
+
+			/* flush any remaining pages out from the last report */
+			spin_lock_irq(&zone->lock);
+			page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
+			spin_unlock_irq(&zone->lock);
+
+			if (err)
+				return err;
+		}
 	}
 
 leftover:
 	/* report the leftover pages before going idle */
-	leftover = PAGE_REPORTING_CAPACITY - offset;
+	leftover = capacity_curr - offset;
 	if (leftover) {
-		sgl = &sgl[offset];
-		err = prdev->report(prdev, sgl, leftover);
+		sgl_curr = &sgl_curr[offset];
+		err = prdev->report(prdev, sgl_curr, leftover);
 
 		/* flush any remaining pages out from the last report */
 		spin_lock_irq(&zone->lock);
-		page_reporting_drain(prdev, sgl, zone, leftover, !err);
+		page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
 		spin_unlock_irq(&zone->lock);
 	}
 
@@ -332,9 +388,8 @@ static void page_reporting_process(struct work_struct *work)
 	if (!sgl)
 		goto err_out;
 
-	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
-
 	for_each_zone(zone) {
+		sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
 		err = page_reporting_process_zone(prdev, sgl, zone);
 		if (err)
 			break;
-- 
1.8.3.1



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] mm/page_reporting: Some knobs and fixes
  2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
                   ` (3 preceding siblings ...)
  2021-03-26  9:44 ` [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure Xunlei Pang
@ 2021-04-02  4:08 ` Xunlei Pang
  2021-04-02 18:17   ` Alexander Duyck
  4 siblings, 1 reply; 12+ messages in thread
From: Xunlei Pang @ 2021-04-02  4:08 UTC (permalink / raw)
  To: Xunlei Pang, Andrew Morton, Mel Gorman, David Hildenbrand,
	Alexander Duyck
  Cc: linux-kernel, linux-mm

On 3/26/21 5:44 PM, Xunlei Pang wrote:
> Add the following knobs in PATCH 1~3:
>  /sys/kernel/mm/page_reporting/reported_kbytes
>  /sys/kernel/mm/page_reporting/refault_kbytes
>  /sys/kernel/mm/page_reporting/reporting_factor
> 
> Fix unexpected user OOM in PATCH 4.
> 
> Xunlei Pang (4):
>   mm/page_reporting: Introduce free page reported counters
>   mm/page_reporting: Introduce free page reporting factor
>   mm/page_reporting: Introduce "page_reporting_factor=" boot parameter
>   mm/page_reporting: Fix possible user allocation failure
> 
>  Documentation/admin-guide/kernel-parameters.txt |   3 +
>  include/linux/mmzone.h                          |   3 +
>  mm/page_alloc.c                                 |   6 +-
>  mm/page_reporting.c                             | 268 ++++++++++++++++++++++--
>  4 files changed, 260 insertions(+), 20 deletions(-)
> 

Hi guys,

Looks "Alexander Duyck <alexander.h.duyck@linux.intel.com>" was not
available, so Cced more, any comment?

Thanks!


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] mm/page_reporting: Some knobs and fixes
  2021-04-02  4:08 ` [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
@ 2021-04-02 18:17   ` Alexander Duyck
  0 siblings, 0 replies; 12+ messages in thread
From: Alexander Duyck @ 2021-04-02 18:17 UTC (permalink / raw)
  To: xlpang
  Cc: Andrew Morton, Mel Gorman, David Hildenbrand, Alexander Duyck,
	LKML, linux-mm

On Thu, Apr 1, 2021 at 9:09 PM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>
> On 3/26/21 5:44 PM, Xunlei Pang wrote:
> > Add the following knobs in PATCH 1~3:
> >  /sys/kernel/mm/page_reporting/reported_kbytes
> >  /sys/kernel/mm/page_reporting/refault_kbytes
> >  /sys/kernel/mm/page_reporting/reporting_factor
> >
> > Fix unexpected user OOM in PATCH 4.
> >
> > Xunlei Pang (4):
> >   mm/page_reporting: Introduce free page reported counters
> >   mm/page_reporting: Introduce free page reporting factor
> >   mm/page_reporting: Introduce "page_reporting_factor=" boot parameter
> >   mm/page_reporting: Fix possible user allocation failure
> >
> >  Documentation/admin-guide/kernel-parameters.txt |   3 +
> >  include/linux/mmzone.h                          |   3 +
> >  mm/page_alloc.c                                 |   6 +-
> >  mm/page_reporting.c                             | 268 ++++++++++++++++++++++--
> >  4 files changed, 260 insertions(+), 20 deletions(-)
> >
>
> Hi guys,
>
> Looks "Alexander Duyck <alexander.h.duyck@linux.intel.com>" was not
> available, so Cced more, any comment?
>
> Thanks!

Yes, my Intel account has been offline since October. If you need to
reach me, my gmail is the best way to go.

As far as the patch series itself I am not exactly thrilled with it.
There seems to be a number of spots where things are being changed
such that the CPU overhead will be much more significant.

The cover page should actually say what the patch set is attempting to
accomplish. In the patch descriptions you have told us what you are
doing, but the why isn't completely clear. For example I am not sure
if the issue addressed in patch 4 was present before patches 1-3 were
introduced.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/4] mm/page_reporting: Introduce free page reported counters
  2021-03-26  9:44 ` [PATCH 1/4] mm/page_reporting: Introduce free page reported counters Xunlei Pang
@ 2021-04-02 18:29   ` Alexander Duyck
  0 siblings, 0 replies; 12+ messages in thread
From: Alexander Duyck @ 2021-04-02 18:29 UTC (permalink / raw)
  To: Xunlei Pang; +Cc: Andrew Morton, Alexander Duyck, Mel Gorman, LKML, linux-mm

On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>
> It's useful to know how much memory has actually been reported,
> so add a new zone::reported_pages field to record that.
>
> Add "/sys/kernel/mm/page_reporting/reported_kbytes" to show the
> amount of memory currently reported.
>
> Add "/sys/kernel/mm/page_reporting/refault_kbytes" to show the
> accumulated amount of memory that has refaulted in after being
> reported out.
>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
> ---
>  include/linux/mmzone.h |   3 ++
>  mm/page_alloc.c        |   4 +-
>  mm/page_reporting.c    | 112 +++++++++++++++++++++++++++++++++++++++++++++++--
>  mm/page_reporting.h    |   5 +++
>  4 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 47946ce..ebd169f 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -530,6 +530,9 @@ struct zone {
>         atomic_long_t           managed_pages;
>         unsigned long           spanned_pages;
>         unsigned long           present_pages;
> +#ifdef CONFIG_PAGE_REPORTING
> +       unsigned long           reported_pages;
> +#endif
>  #ifdef CONFIG_CMA
>         unsigned long           cma_pages;
>  #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3e4b29ee..c2c5688 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -930,8 +930,10 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
>                                            unsigned int order)
>  {
>         /* clear reported state and update reported page count */
> -       if (page_reported(page))
> +       if (page_reported(page)) {
>                 __ClearPageReported(page);
> +               page_reporting_update_refault(zone, 1 << order);
> +       }
>
>         list_del(&page->lru);
>         __ClearPageBuddy(page);
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index c50d93f..ba195ea 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -1,4 +1,5 @@
>  // SPDX-License-Identifier: GPL-2.0
> +#include <linux/module.h>
>  #include <linux/mm.h>
>  #include <linux/mmzone.h>
>  #include <linux/page_reporting.h>
> @@ -19,6 +20,22 @@ enum {
>         PAGE_REPORTING_ACTIVE
>  };
>
> +#ifdef CONFIG_SYSFS
> +static struct percpu_counter refault_pages;
> +
> +void page_reporting_update_refault(struct zone *zone, unsigned int pages)
> +{
> +       zone->reported_pages -= pages;
> +       percpu_counter_add_batch(&refault_pages, pages, INT_MAX / 2);
> +}
> +#else
> +void page_reporting_update_refault(struct zone *zone, unsigned int pages)
> +{
> +       zone->reported_pages -= pages;
> +}
> +#endif
> +
> +

I don't see the value added from the refault_pages counter.
Essentially all it will tell you is how many reported pages were
allocated. If you are really wanting to track a value such as this it
might make more sense to just track the total number of reported pages
over the lifetime of the system. At least with that you would once
again be able to take advantage of batching so it isn't occurring as
often.
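
Just to sketch what I mean (untested, names are only illustrative),
something along these lines would keep the accounting batched:

  /* lifetime count of reported pages, only bumped from the drain path */
  static struct percpu_counter total_reported_pages;

  static void page_reporting_account(struct zone *zone, unsigned int nr_pages)
  {
          zone->reported_pages += nr_pages;
          percpu_counter_add_batch(&total_reported_pages, nr_pages, INT_MAX / 2);
  }

The exact name and placement are up to you; the point is that the
counter only ever grows, so the per-CPU batching actually helps.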

>  /* request page reporting */
>  static void
>  __page_reporting_request(struct page_reporting_dev_info *prdev)
> @@ -66,7 +83,8 @@ void __page_reporting_notify(void)
>
>  static void
>  page_reporting_drain(struct page_reporting_dev_info *prdev,
> -                    struct scatterlist *sgl, unsigned int nents, bool reported)
> +                    struct scatterlist *sgl, struct zone *zone,
> +                    unsigned int nents, bool reported)
>  {
>         struct scatterlist *sg = sgl;
>
> @@ -92,8 +110,10 @@ void __page_reporting_notify(void)
>                  * report on the new larger page when we make our way
>                  * up to that higher order.
>                  */
> -               if (PageBuddy(page) && buddy_order(page) == order)
> +               if (PageBuddy(page) && buddy_order(page) == order) {
>                         __SetPageReported(page);
> +                       zone->reported_pages += (1 << order);
> +               }

The parenthesis around "1 << order" is redundant.

>         } while ((sg = sg_next(sg)));
>
>         /* reinitialize scatterlist now that it is empty */
> @@ -197,7 +217,7 @@ void __page_reporting_notify(void)
>                 spin_lock_irq(&zone->lock);
>
>                 /* flush reported pages from the sg list */
> -               page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);
> +               page_reporting_drain(prdev, sgl, zone, PAGE_REPORTING_CAPACITY, !err);
>
>                 /*
>                  * Reset next to first entry, the old next isn't valid
> @@ -260,7 +280,7 @@ void __page_reporting_notify(void)
>
>                 /* flush any remaining pages out from the last report */
>                 spin_lock_irq(&zone->lock);
> -               page_reporting_drain(prdev, sgl, leftover, !err);
> +               page_reporting_drain(prdev, sgl, zone, leftover, !err);
>                 spin_unlock_irq(&zone->lock);
>         }
>
> @@ -362,3 +382,87 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
>         mutex_unlock(&page_reporting_mutex);
>  }
>  EXPORT_SYMBOL_GPL(page_reporting_unregister);
> +
> +#ifdef CONFIG_SYSFS
> +#define REPORTING_ATTR(_name) \
> +       static struct kobj_attribute _name##_attr = \
> +               __ATTR(_name, 0644, _name##_show, _name##_store)
> +

You would be better off defining a read only attribute so you don't
have to define the dummy store functions.
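
For example (sketch only, using the standard __ATTR_RO helper from
<linux/sysfs.h>):

  #define REPORTING_ATTR_RO(_name) \
          static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

  static ssize_t reported_kbytes_show(struct kobject *kobj,
                  struct kobj_attribute *attr, char *buf)
  {
          return sprintf(buf, "%lu\n", get_reported_kbytes());
  }
  REPORTING_ATTR_RO(reported_kbytes);

That gives you a 0444 attribute and no _store callbacks at all.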

> +static unsigned long get_reported_kbytes(void)
> +{
> +       struct zone *z;
> +       unsigned long nr_reported = 0;
> +
> +       for_each_populated_zone(z)
> +               nr_reported += z->reported_pages;
> +
> +       return nr_reported << (PAGE_SHIFT - 10);
> +}
> +
> +static ssize_t reported_kbytes_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +       return sprintf(buf, "%lu\n", get_reported_kbytes());
> +}
> +
> +static ssize_t reported_kbytes_store(struct kobject *kobj,
> +               struct kobj_attribute *attr,
> +               const char *buf, size_t count)
> +{
> +       return -EINVAL;
> +}

Get rid of this dummy store function.

> +REPORTING_ATTR(reported_kbytes);
> +
> +static u64 get_refault_kbytes(void)
> +{
> +       u64 sum;
> +
> +       sum = percpu_counter_sum_positive(&refault_pages);
> +       return sum << (PAGE_SHIFT - 10);
> +}
> +
> +static ssize_t refault_kbytes_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +       return sprintf(buf, "%llu\n", get_refault_kbytes());
> +}
> +
> +static ssize_t refault_kbytes_store(struct kobject *kobj,
> +               struct kobj_attribute *attr,
> +               const char *buf, size_t count)
> +{
> +       return -EINVAL;
> +}

Also, no need for this store function.

> +REPORTING_ATTR(refault_kbytes);
> +
> +static struct attribute *reporting_attrs[] = {
> +       &reported_kbytes_attr.attr,
> +       &refault_kbytes_attr.attr,
> +       NULL,
> +};
> +
> +static struct attribute_group reporting_attr_group = {
> +       .attrs = reporting_attrs,
> +       .name = "page_reporting",
> +};
> +#endif
> +
> +static int __init page_reporting_init(void)
> +{
> +#ifdef CONFIG_SYSFS
> +       int err;
> +
> +       if (percpu_counter_init(&refault_pages, 0, GFP_KERNEL))
> +               panic("Failed to allocate refault_pages percpu counter\n");
> +
> +       err = sysfs_create_group(mm_kobj, &reporting_attr_group);
> +       if (err) {
> +               pr_err("%s: Unable to populate sysfs files\n", __func__);
> +               return err;
> +       }
> +#endif
> +
> +       return 0;
> +}
> +
> +module_init(page_reporting_init);
> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index 2c385dd..19549c7 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -44,11 +44,16 @@ static inline void page_reporting_notify_free(unsigned int order)
>         /* This will add a few cycles, but should be called infrequently */
>         __page_reporting_notify();
>  }
> +
> +void page_reporting_update_refault(struct zone *zone, unsigned int pages);
>  #else /* CONFIG_PAGE_REPORTING */
>  #define page_reported(_page)   false
>
>  static inline void page_reporting_notify_free(unsigned int order)
>  {
>  }
> +
> +static inline void
> +page_reporting_update_refault(struct zone *zone, unsigned int pages) { }
>  #endif /* CONFIG_PAGE_REPORTING */
>  #endif /*_MM_PAGE_REPORTING_H */
> --
> 1.8.3.1
>
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor
  2021-03-26  9:44 ` [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor Xunlei Pang
@ 2021-04-02 18:56   ` Alexander Duyck
  2021-04-06  6:53     ` Xunlei Pang
  0 siblings, 1 reply; 12+ messages in thread
From: Alexander Duyck @ 2021-04-02 18:56 UTC (permalink / raw)
  To: Xunlei Pang; +Cc: Andrew Morton, Alexander Duyck, Mel Gorman, LKML, linux-mm

On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>
> Add new "/sys/kernel/mm/page_reporting/reporting_factor"
> within [0, 100], and stop page reporting when it reaches
> the configured threshold. Default is 100 which means no
> limitation is imposed. Percentile is adopted to reflect
> the fact that it reports on the per-zone basis.
>
> We can control the total number of reporting pages via
> this knob to avoid EPT violations which may affect the
> performance of the business, imagine the guest memory
> allocation burst or host long-tail memory reclaiming
> really hurt.

I'm not a fan of the concept as I don't think it really does what it
was meant to do. The way page reporting was meant to work is that when
we have enough free pages we will cycle through memory a few pages at
a time reporting what is unused to the hypervisor. It was meant to be
a scan more than something that just would stop once it touched a
certain part of the memory.

If you are wanting to truly reserve some amount of memory so that it
is always left held by the guest then it might make more sense to make
the value a fixed amount of memory rather than trying to do it as a
percentage.
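
Roughly something like this (hypothetical sketch, the knob name and
the kbytes unit are just for illustration):

  /* keep at least this much memory unreported, instead of a percentage */
  static unsigned long reporting_reserve_kbytes;

  static unsigned long zone_reporting_limit(struct zone *zone)
  {
          unsigned long reserve = reporting_reserve_kbytes >> (PAGE_SHIFT - 10);
          unsigned long managed = atomic_long_read(&zone->managed_pages);

          return managed > reserve ? managed - reserve : 0;
  }

You would still need to decide how to split a global reserve across
zones, but the limit itself becomes an absolute amount of memory.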

Also we may need to look at adding some sort of
linearization/defragmentation logic for the reported pages. One issue
is that there are several things that will add pages to the end of the
free page lists. One of the reasons why I was processing the entire
list when I was processing reported pages was because the page freeing
functions will normally cause pages to be interleaved with the
reported pages on the end of the list. So if you are wanting to
reserve some pages as being non-reported we may need to add something
to sort the lists periodically.

> It also helps build customized control policies according to
> VM priority, and is useful for testing, gray release, etc.

As far as the knob itself it would make sense to combine this with
patch 3 since they are just different versions of the same control

> ---
>  mm/page_reporting.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 59 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index ba195ea..86c6479 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -11,6 +11,8 @@
>  #include "page_reporting.h"
>  #include "internal.h"
>
> +static int reporting_factor = 100;
> +
>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>
> @@ -134,6 +136,7 @@ void __page_reporting_notify(void)
>         struct list_head *list = &area->free_list[mt];
>         unsigned int page_len = PAGE_SIZE << order;
>         struct page *page, *next;
> +       unsigned long threshold;
>         long budget;
>         int err = 0;
>
> @@ -144,6 +147,7 @@ void __page_reporting_notify(void)
>         if (list_empty(list))
>                 return err;
>
> +       threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;

So at 0 you are setting this threshold to 0, however based on the code
below you are still pulling at least one page.

>         spin_lock_irq(&zone->lock);
>
>         /*
> @@ -181,6 +185,8 @@ void __page_reporting_notify(void)
>
>                 /* Attempt to pull page from list and place in scatterlist */
>                 if (*offset) {
> +                       unsigned long nr_pages;
> +
>                         if (!__isolate_free_page(page, order)) {
>                                 next = page;
>                                 break;
> @@ -190,6 +196,12 @@ void __page_reporting_notify(void)
>                         --(*offset);
>                         sg_set_page(&sgl[*offset], page, page_len, 0);
>
> +                       nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
> +                       if (zone->reported_pages + nr_pages >= threshold) {
> +                               err = 1;
> +                               break;
> +                       }
> +

So here we are checking the threshold after we have already pulled the
page. With this being the case it might make more sense to either
allow for the full capacity of pages to be pulled and then check this
after they have been reported, or to move this check up to somewhere
before you start processing the pages. What you want to avoid is
having to perform this check for every individual page.

>                         continue;
>                 }
>
> @@ -244,9 +256,13 @@ void __page_reporting_notify(void)
>                             struct scatterlist *sgl, struct zone *zone)
>  {
>         unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
> -       unsigned long watermark;
> +       unsigned long watermark, threshold;
>         int err = 0;
>
> +       threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
> +       if (zone->reported_pages >= threshold)
> +               return err;
> +

Rather than having to calculate the threshold in multiple spots it
might make more sense to move this to the start of
page_reporting_cycle and have it performed again before we reacquire
the spinlock and run page_reporting_drain.
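
Something like the following (sketch) would keep the calculation in
one place:

  static bool zone_reporting_threshold_reached(struct zone *zone)
  {
          unsigned long threshold;

          threshold = atomic_long_read(&zone->managed_pages) *
                      reporting_factor / 100;
          return zone->reported_pages >= threshold;
  }

and page_reporting_cycle() could call it once on entry and once more
right before it reacquires the zone lock for page_reporting_drain().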

>         /* Generate minimum watermark to be able to guarantee progress */
>         watermark = low_wmark_pages(zone) +
>                     (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
> @@ -267,11 +283,18 @@ void __page_reporting_notify(void)
>
>                         err = page_reporting_cycle(prdev, zone, order, mt,
>                                                    sgl, &offset);
> +                       /* Exceed threshold go to report leftover */
> +                       if (err > 0) {
> +                               err = 0;
> +                               goto leftover;
> +                       }
> +
>                         if (err)
>                                 return err;
>                 }
>         }
>
> +leftover:
>         /* report the leftover pages before going idle */
>         leftover = PAGE_REPORTING_CAPACITY - offset;
>         if (leftover) {

You should optimize for processing full batches rather than chopping
things up into smaller groupings.

> @@ -435,9 +458,44 @@ static ssize_t refault_kbytes_store(struct kobject *kobj,
>  }
>  REPORTING_ATTR(refault_kbytes);
>
> +static ssize_t reporting_factor_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +       return sprintf(buf, "%u\n", reporting_factor);
> +}
> +
> +static ssize_t reporting_factor_store(struct kobject *kobj,
> +               struct kobj_attribute *attr,
> +               const char *buf, size_t count)
> +{
> +       int new, old, err;
> +       struct page *page;
> +
> +       err = kstrtoint(buf, 10, &new);
> +       if (err || (new < 0 || new > 100))
> +               return -EINVAL;
> +
> +       old = reporting_factor;
> +       reporting_factor = new;
> +
> +       if (new <= old)
> +               goto out;
> +
> +       /* Trigger reporting with new larger reporting_factor */
> +       page = alloc_pages(__GFP_HIGHMEM | __GFP_NOWARN,
> +                       PAGE_REPORTING_MIN_ORDER);
> +       if (page)
> +               __free_pages(page, PAGE_REPORTING_MIN_ORDER);
> +
> +out:
> +       return count;
> +}
> +REPORTING_ATTR(reporting_factor);
> +
>  static struct attribute *reporting_attrs[] = {
>         &reported_kbytes_attr.attr,
>         &refault_kbytes_attr.attr,
> +       &reporting_factor_attr.attr,
>         NULL,
>  };
>
> --
> 1.8.3.1
>
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure
  2021-03-26  9:44 ` [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure Xunlei Pang
@ 2021-04-02 19:55   ` Alexander Duyck
  2021-04-06  6:55     ` Xunlei Pang
  0 siblings, 1 reply; 12+ messages in thread
From: Alexander Duyck @ 2021-04-02 19:55 UTC (permalink / raw)
  To: Xunlei Pang; +Cc: Andrew Morton, Mel Gorman, LKML, linux-mm

On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>
> We encountered user memory allocation failures (OOM) on our
> 512 MiB tiny instances; they did not happen after turning
> page reporting off.
>
> After some debugging, it turned out that 32*4MB=128MB of
> order-10 free pages were isolated during the reporting window,
> leaving no free memory available.
>
> This might also happen on large instances when free memory
> is low.
>
> This patch introduces a rule that limits the reporting capacity
> according to the current free memory, reducing it for the higher
> orders that would otherwise break the rule.
>
> For example, with 100MiB free, the sgl capacities for different
> orders are:
>    order-9 : 32
>    order-10: 16
>
> Reported-by: Helin Guo <helinguo@linux.alibaba.com>
> Tested-by: Helin Guo <helinguo@linux.alibaba.com>
> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>

I'm curious how much of this would be solved by just making it so that
we reduce the capacity by half if we increase the order? So
specifically if we did something such as:
  capacity = (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER) >> order;

We just have to make sure the capacity is greater than zero before
entering the processing loop.
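
In full that would be something like (untested sketch):

  unsigned int capacity;

  capacity = (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER) >> order;
  if (!capacity)
          capacity = 1;

so PAGE_REPORTING_MIN_ORDER still gets the full capacity and each
higher order gets half of the previous one.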

An alternative that occurred to me while I reviewed this is to look at
just adding a reserve. That would be something like:
  reserve = PAGE_REPORTING_CAPACITY - capacity;

Basically the reserve would take up some space at the start of the
list so that you wouldn't need to actually change the capacity
directly. It would just be a matter of making certain we deducted it
and updated the offsets of the scatterlist as necessary.
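
Roughly (sketch only), page_reporting_cycle() would keep the full-size
scatterlist and just skip over the reserved head:

  unsigned int reserve = PAGE_REPORTING_CAPACITY - capacity;

  /* stop filling entries once the offset reaches the reserved pad */
  if (*offset <= reserve)
          break;

  /* ... and only the unreserved tail gets reported and drained */
  err = prdev->report(prdev, &sgl[reserve],
                      PAGE_REPORTING_CAPACITY - reserve);

so the scatterlist never needs to be re-initialized when the capacity
changes.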


> ---
>  mm/page_reporting.c | 89 +++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 72 insertions(+), 17 deletions(-)
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 6ffedb8..2ec0ec0 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -129,8 +129,8 @@ void __page_reporting_notify(void)
>   */
>  static int
>  page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
> -                    unsigned int order, unsigned int mt,
> -                    struct scatterlist *sgl, unsigned int *offset)
> +                    unsigned int order, unsigned int mt, struct scatterlist *sgl,
> +                    const unsigned int capacity, unsigned int *offset)
>  {
>         struct free_area *area = &zone->free_area[order];
>         struct list_head *list = &area->free_list[mt];
> @@ -161,10 +161,10 @@ void __page_reporting_notify(void)
>          * list processed. This should result in us reporting all pages on
>          * an idle system in about 30 seconds.
>          *
> -        * The division here should be cheap since PAGE_REPORTING_CAPACITY
> -        * should always be a power of 2.
> +        * The division here should be cheap since capacity should
> +        * always be a power of 2.
>          */
> -       budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
> +       budget = DIV_ROUND_UP(area->nr_free, capacity * 16);

So the comment here is no longer valid now that capacity has become a
variable. An alternative, if we go with the shift approach I mentioned
above, would be to shift the budget based on the reduced capacity.

>         /* loop through free list adding unreported pages to sg list */
>         list_for_each_entry_safe(page, next, list, lru) {
> @@ -196,7 +196,7 @@ void __page_reporting_notify(void)
>                         --(*offset);
>                         sg_set_page(&sgl[*offset], page, page_len, 0);
>
> -                       nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
> +                       nr_pages = (capacity - *offset) << order;
>                         if (zone->reported_pages + nr_pages >= threshold) {
>                                 err = 1;
>                                 break;

Rather than adding a capacity value it might work better to add a
"reserve" value so that we are just padding the start of the
scatterlist rather than having to reset it every time we change the
total capacity of the scatterlist. The advantage to that is that you
could drop all the changes where you are having to reset the list and
change the capacity.

Instead you would just need to update the check to "*offset <=
reserve" and the call to report/drain so that they take into account
the reserve offset.

> @@ -217,10 +217,10 @@ void __page_reporting_notify(void)
>                 spin_unlock_irq(&zone->lock);
>
>                 /* begin processing pages in local list */
> -               err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
> +               err = prdev->report(prdev, sgl, capacity);
>

Assuming the change to "reserve" then this would be "&sgl[*offset],
PAGE_REPORTING_CAPACITY - *offset", or you could look at copying the
approach taken in the "leftover" path in page_reporting_process_zone.

>                 /* reset offset since the full list was reported */
> -               *offset = PAGE_REPORTING_CAPACITY;
> +               *offset = capacity;
>
>                 /* update budget to reflect call to report function */
>                 budget--;
> @@ -229,7 +229,7 @@ void __page_reporting_notify(void)
>                 spin_lock_irq(&zone->lock);
>
>                 /* flush reported pages from the sg list */
> -               page_reporting_drain(prdev, sgl, zone, PAGE_REPORTING_CAPACITY, !err);
> +               page_reporting_drain(prdev, sgl, zone, capacity, !err);

Same here. The general idea is you want to avoid having to flush every
time you want to change the reserve and instead just trigger a flush
should your offset value fall below what is reserved.

>                 /*
>                  * Reset next to first entry, the old next isn't valid
> @@ -251,12 +251,39 @@ void __page_reporting_notify(void)
>         return err;
>  }

So all of the code below seems to be the result of the added
complexity I mentioned above due to the capacity being changed rather
than some portion of the list becoming reserved.

I think it would be much more interesting to explore the approach of
just reserving some portion of the start of the scatterlist rather
than trying to change the capacity. By doing that, much of the code
change seen here can be avoided: right now you have to restructure the
entire list, which introduces other possible issues. One of the things
my original approach guaranteed was that sg_end was already set for the
end of the scatterlist, and that guarantee is gone with these changes.

> +/*
> + * For guest with little free memory, we should tune reporting capacity
> + * correctly to avoid reporting too much once, otherwise user allocation
> + * may fail and OOM during reporting window between __isolate_free_page()
> + * and page_reporting_drain().
> + *
> + * Calculate from which order we begin to reduce the scatterlist capacity,
> + * in order not to isolate too many pages to fail the user allocation.
> + */
> +static unsigned int calculate_zone_order_threshold(struct zone *z)
> +{
> +       unsigned int order;
> +       long pages_threshold;
> +
> +       pages_threshold = zone_page_state(z, NR_FREE_PAGES) - low_wmark_pages(z);
> +       for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
> +               if ((PAGE_REPORTING_CAPACITY << order) > pages_threshold)
> +                       break;
> +       }
> +
> +       return order;
> +}
> +
>  static int
>  page_reporting_process_zone(struct page_reporting_dev_info *prdev,
>                             struct scatterlist *sgl, struct zone *zone)
>  {
> -       unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
> +       unsigned int order, mt, leftover, offset;
>         unsigned long watermark, threshold;
> +       unsigned int capacity = PAGE_REPORTING_CAPACITY;
> +       unsigned int capacity_curr;
> +       struct scatterlist *sgl_curr;
> +       unsigned int order_threshold;
>         int err = 0;
>
>         threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
> @@ -274,15 +301,28 @@ void __page_reporting_notify(void)
>         if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
>                 return err;
>
> +       sgl_curr = sgl;
> +       capacity_curr = offset = capacity;
> +       order_threshold = calculate_zone_order_threshold(zone);
>         /* Process each free list starting from lowest order/mt */
>         for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
> +               /* try to reduce unexpected high order's reporting capacity */
> +               if (order >= order_threshold) {
> +                       capacity_curr = capacity >> (order - order_threshold + 1);
> +                       if (capacity_curr == 0)
> +                               capacity_curr = 1;
> +                       sgl_curr = sgl + capacity - capacity_curr;
> +                       offset = capacity_curr;
> +                       sg_init_table(sgl_curr, capacity_curr);
> +               }
> +

The problem here is you are assuming the order threshold will not
change during processing. Your order_threshold value could become
stale while you are processing the zone so I am not sure having it
provides much value.

I think we might be better off just assuming that we need to halve
capacity as the order increases.

>                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
>                         /* We do not pull pages from the isolate free list */
>                         if (is_migrate_isolate(mt))
>                                 continue;
>
>                         err = page_reporting_cycle(prdev, zone, order, mt,
> -                                                  sgl, &offset);
> +                                                  sgl_curr, capacity_curr, &offset);
>                         /* Exceed threshold go to report leftover */
>                         if (err > 0) {
>                                 err = 0;
> @@ -292,18 +332,34 @@ void __page_reporting_notify(void)
>                         if (err)
>                                 return err;
>                 }
> +
> +               /* report the leftover pages for next orders with reduced capacity */
> +               leftover = capacity_curr - offset;
> +               if (leftover && order + 1 >= order_threshold) {
> +                       sgl_curr = &sgl_curr[offset];
> +                       err = prdev->report(prdev, sgl_curr, leftover);
> +                       offset = capacity_curr;
> +
> +                       /* flush any remaining pages out from the last report */
> +                       spin_lock_irq(&zone->lock);
> +                       page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
> +                       spin_unlock_irq(&zone->lock);
> +
> +                       if (err)
> +                               return err;
> +               }
>         }
>
>  leftover:
>         /* report the leftover pages before going idle */
> -       leftover = PAGE_REPORTING_CAPACITY - offset;
> +       leftover = capacity_curr - offset;
>         if (leftover) {
> -               sgl = &sgl[offset];
> -               err = prdev->report(prdev, sgl, leftover);
> +               sgl_curr = &sgl_curr[offset];
> +               err = prdev->report(prdev, sgl_curr, leftover);
>
>                 /* flush any remaining pages out from the last report */
>                 spin_lock_irq(&zone->lock);
> -               page_reporting_drain(prdev, sgl, zone, leftover, !err);
> +               page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
>                 spin_unlock_irq(&zone->lock);
>         }
>
> @@ -332,9 +388,8 @@ static void page_reporting_process(struct work_struct *work)
>         if (!sgl)
>                 goto err_out;
>
> -       sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
> -
>         for_each_zone(zone) {
> +               sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
>                 err = page_reporting_process_zone(prdev, sgl, zone);
>                 if (err)
>                         break;
> --
> 1.8.3.1
>
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor
  2021-04-02 18:56   ` Alexander Duyck
@ 2021-04-06  6:53     ` Xunlei Pang
  0 siblings, 0 replies; 12+ messages in thread
From: Xunlei Pang @ 2021-04-06  6:53 UTC (permalink / raw)
  To: Alexander Duyck, Xunlei Pang
  Cc: Andrew Morton, Alexander Duyck, Mel Gorman, LKML, linux-mm

On 4/3/21 2:56 AM, Alexander Duyck wrote:
> On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>>
>> Add a new "/sys/kernel/mm/page_reporting/reporting_factor"
>> knob within [0, 100], and stop page reporting once the
>> configured threshold is reached. The default is 100, which
>> imposes no limitation. A percentage is used to reflect the
>> fact that reporting works on a per-zone basis.
>>
>> This knob lets us cap the total number of reported pages
>> to avoid EPT violations that may hurt workload performance,
>> for example during guest memory allocation bursts or host
>> long-tail memory reclaim.
> 
> I'm not a fan of the concept as I don't think it really does what it
> was meant to do. The way page reporting was meant to work is that when
> we have enough free pages we will cycle through memory a few pages at
> a time reporting what is unused to the hypervisor. It was meant to be
> a scan more than something that just would stop once it touched a
> certain part of the memory.
> 
> If you are wanting to truly reserve some amount of memory so that it
> is always left held by the guest then it might make more sense to make
> the value a fixed amount of memory rather than trying to do it as a
> percentage.
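>
> Something along these lines, as a stand-alone, untested sketch of the
> arithmetic only (the knob name and all of the numbers are made up for
> the example):
>
>   #include <stdio.h>
>
>   int main(void)
>   {
>           /* hypothetical fixed reserve knob, in KiB, instead of a percent */
>           unsigned long reserved_kbytes = 64 * 1024;    /* keep 64 MiB unreported */
>           unsigned long managed_pages = 131072;         /* 512 MiB of 4 KiB pages */
>           unsigned long reserved_pages = reserved_kbytes / 4;
>           unsigned long threshold;
>
>           /* stop reporting once this many pages have been reported */
>           threshold = managed_pages > reserved_pages ?
>                       managed_pages - reserved_pages : 0;
>           printf("report at most %lu pages\n", threshold);
>           return 0;
>   }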
> 
> Also we may need to look at adding some sort of
> linearization/defragmentation logic for the reported pages. One issue
> is that there are several things that will add pages to the end of the
> free page lists. One of the reasons why I was processing the entire
> list when I was processing reported pages was because the page freeing
> functions will normally cause pages to be interleaved with the
> reported pages on the end of the list. So if you are wanting to
> reserve some pages as being non-reported we may need to add something
> to sort the lists periodically.

Yes, agreed. I noticed this problem too while trying to make the
counter accurate, and I'm going to figure out a way to handle it.
For example, adding a new migratetype for the reported free_list may
be a good choice; that would also help reduce the zone lock latency
during the report procedure on large VMs, which can reach
milliseconds.

> 
>> This knob helps build customized control policies according
>> to VM priority; it is also useful for testing, gray release, etc.
> 
> As far as the knob itself goes, it would make sense to combine this
> with patch 3 since they are just different versions of the same control.
> 
>> ---
>>  mm/page_reporting.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 59 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
>> index ba195ea..86c6479 100644
>> --- a/mm/page_reporting.c
>> +++ b/mm/page_reporting.c
>> @@ -11,6 +11,8 @@
>>  #include "page_reporting.h"
>>  #include "internal.h"
>>
>> +static int reporting_factor = 100;
>> +
>>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>>
>> @@ -134,6 +136,7 @@ void __page_reporting_notify(void)
>>         struct list_head *list = &area->free_list[mt];
>>         unsigned int page_len = PAGE_SIZE << order;
>>         struct page *page, *next;
>> +       unsigned long threshold;
>>         long budget;
>>         int err = 0;
>>
>> @@ -144,6 +147,7 @@ void __page_reporting_notify(void)
>>         if (list_empty(list))
>>                 return err;
>>
>> +       threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
> 
> So at 0 you are setting this threshold to 0, however based on the code
> below you are still pulling at least one page.
> 
>>         spin_lock_irq(&zone->lock);
>>
>>         /*
>> @@ -181,6 +185,8 @@ void __page_reporting_notify(void)
>>
>>                 /* Attempt to pull page from list and place in scatterlist */
>>                 if (*offset) {
>> +                       unsigned long nr_pages;
>> +
>>                         if (!__isolate_free_page(page, order)) {
>>                                 next = page;
>>                                 break;
>> @@ -190,6 +196,12 @@ void __page_reporting_notify(void)
>>                         --(*offset);
>>                         sg_set_page(&sgl[*offset], page, page_len, 0);
>>
>> +                       nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
>> +                       if (zone->reported_pages + nr_pages >= threshold) {
>> +                               err = 1;
>> +                               break;
>> +                       }
>> +
> 
> So here we are checking the threshold after we have already pulled the
> page. With this being the case it might make more sense to either
> allow for the full capacity of pages to be pulled and then check this
> after they have been reported, or to move this check up to somewhere
> before you start processing the pages. What you want to avoid is
> having to perform this check for every individual page.
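>
> For instance, something like this stand-alone, untested sketch of the
> arithmetic (all names and numbers here are placeholders, not the
> actual kernel variables), computed once before walking the list:
>
>   #include <stdio.h>
>
>   static unsigned long min_ul(unsigned long a, unsigned long b)
>   {
>           return a < b ? a : b;
>   }
>
>   int main(void)
>   {
>           unsigned long threshold = 100000, reported_pages = 90000;
>           unsigned int order = 9, capacity = 32;
>           unsigned long room, entries;
>
>           /* pages we may still report before hitting the threshold */
>           room = threshold > reported_pages ? threshold - reported_pages : 0;
>
>           /* cap the scatterlist entries we are willing to fill this pass */
>           entries = min_ul(capacity, room >> order);
>           printf("fill at most %lu entries this pass\n", entries);
>           return 0;
>   }
>
> That way the threshold only influences how many entries get filled,
> not each individual page pull.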
> 
>>                         continue;
>>                 }
>>
>> @@ -244,9 +256,13 @@ void __page_reporting_notify(void)
>>                             struct scatterlist *sgl, struct zone *zone)
>>  {
>>         unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
>> -       unsigned long watermark;
>> +       unsigned long watermark, threshold;
>>         int err = 0;
>>
>> +       threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
>> +       if (zone->reported_pages >= threshold)
>> +               return err;
>> +
> 
> Rather than having to calculate the threshold in multiple spots it
> might make more sense to move this to the start of
> page_reporting_cycle and have it performed again before we reacquire
> the spinlock and run page_reporting_drain.
> 
>>         /* Generate minimum watermark to be able to guarantee progress */
>>         watermark = low_wmark_pages(zone) +
>>                     (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
>> @@ -267,11 +283,18 @@ void __page_reporting_notify(void)
>>
>>                         err = page_reporting_cycle(prdev, zone, order, mt,
>>                                                    sgl, &offset);
>> +                       /* Exceed threshold go to report leftover */
>> +                       if (err > 0) {
>> +                               err = 0;
>> +                               goto leftover;
>> +                       }
>> +
>>                         if (err)
>>                                 return err;
>>                 }
>>         }
>>
>> +leftover:
>>         /* report the leftover pages before going idle */
>>         leftover = PAGE_REPORTING_CAPACITY - offset;
>>         if (leftover) {
> 
> You should optimize for processing full batches rather than chopping
> things up into smaller groupings.
> 
>> @@ -435,9 +458,44 @@ static ssize_t refault_kbytes_store(struct kobject *kobj,
>>  }
>>  REPORTING_ATTR(refault_kbytes);
>>
>> +static ssize_t reporting_factor_show(struct kobject *kobj,
>> +               struct kobj_attribute *attr, char *buf)
>> +{
>> +       return sprintf(buf, "%u\n", reporting_factor);
>> +}
>> +
>> +static ssize_t reporting_factor_store(struct kobject *kobj,
>> +               struct kobj_attribute *attr,
>> +               const char *buf, size_t count)
>> +{
>> +       int new, old, err;
>> +       struct page *page;
>> +
>> +       err = kstrtoint(buf, 10, &new);
>> +       if (err || (new < 0 || new > 100))
>> +               return -EINVAL;
>> +
>> +       old = reporting_factor;
>> +       reporting_factor = new;
>> +
>> +       if (new <= old)
>> +               goto out;
>> +
>> +       /* Trigger reporting with new larger reporting_factor */
>> +       page = alloc_pages(__GFP_HIGHMEM | __GFP_NOWARN,
>> +                       PAGE_REPORTING_MIN_ORDER);
>> +       if (page)
>> +               __free_pages(page, PAGE_REPORTING_MIN_ORDER);
>> +
>> +out:
>> +       return count;
>> +}
>> +REPORTING_ATTR(reporting_factor);
>> +
>>  static struct attribute *reporting_attrs[] = {
>>         &reported_kbytes_attr.attr,
>>         &refault_kbytes_attr.attr,
>> +       &reporting_factor_attr.attr,
>>         NULL,
>>  };
>>
>> --
>> 1.8.3.1
>>
>>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure
  2021-04-02 19:55   ` Alexander Duyck
@ 2021-04-06  6:55     ` Xunlei Pang
  0 siblings, 0 replies; 12+ messages in thread
From: Xunlei Pang @ 2021-04-06  6:55 UTC (permalink / raw)
  To: Alexander Duyck, Xunlei Pang; +Cc: Andrew Morton, Mel Gorman, LKML, linux-mm

On 4/3/21 3:55 AM, Alexander Duyck wrote:
> On Fri, Mar 26, 2021 at 2:45 AM Xunlei Pang <xlpang@linux.alibaba.com> wrote:
>>
>> We encountered user memory allocation failures (OOM) on our
>> 512MiB tiny instances, which did not happen once page
>> reporting was turned off.
>>
>> After some debugging, it turned out that 32*4MB=128MB of
>> order-10 free pages were isolated during the reporting
>> window, leaving no free memory available.
>>
>> This might also happen on large instances when free memory
>> is low.
>>
>> This patch introduces a rule to limit the reporting capacity
>> according to the current free memory, reducing it accordingly
>> for higher orders that would otherwise break the rule.
>>
>> For example, with 100MiB free, the sgl capacities for
>> different orders are:
>>    order-9 : 32
>>    order-10: 16
>>
>> Reported-by: Helin Guo <helinguo@linux.alibaba.com>
>> Tested-by: Helin Guo <helinguo@linux.alibaba.com>
>> Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
> 
> I'm curious how much of this would be solved by just making it so that
> we reduce the capacity by half if we increase the order? So
> specifically if we did something such as:
>   capacity = (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER) >> order;
> 
> We just have to make sure the capacity is greater than zero before
> entering the processing loop.
> 
> An alternative that occurred to me while I reviewed this is to look at
> just adding a reserve. That would be something like:
>   reserve = PAGE_REPORTING_CAPACITY - capacity;
> 
> Basically the reserve would take up some space at the start of the
> list so that you wouldn't need to actually change the capacity
> directly. It would just be a matter of making certain we deducted it
> and updated the offsets of the scatterlist as necessary.
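>
> A rough, untested user-space model of what I mean (an array stands in
> for the scatterlist, and every name and number is made up for the
> illustration):
>
>   #include <stdio.h>
>
>   #define CAPACITY 8
>
>   int main(void)
>   {
>           int sgl[CAPACITY] = { 0 };
>           unsigned int reserve = 3;          /* pad the start of the list */
>           unsigned int offset = CAPACITY;    /* fill from the end, as today */
>           int page = 100;
>
>           /* pull pages until the offset falls back into the reserved region */
>           while (offset > reserve) {
>                   sgl[--offset] = page++;
>
>                   if (offset == reserve) {
>                           /* report/drain only the entries past the reserve */
>                           printf("report %u entries starting at index %u\n",
>                                  CAPACITY - reserve, reserve);
>                           offset = CAPACITY;
>                           break;
>                   }
>           }
>           return 0;
>   }
>
> The per-order adjustment then just becomes a change to the reserve,
> and sg_end stays set at the real end of the scatterlist.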
> 
> 
>> ---
>>  mm/page_reporting.c | 89 +++++++++++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 72 insertions(+), 17 deletions(-)
>>
>> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
>> index 6ffedb8..2ec0ec0 100644
>> --- a/mm/page_reporting.c
>> +++ b/mm/page_reporting.c
>> @@ -129,8 +129,8 @@ void __page_reporting_notify(void)
>>   */
>>  static int
>>  page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
>> -                    unsigned int order, unsigned int mt,
>> -                    struct scatterlist *sgl, unsigned int *offset)
>> +                    unsigned int order, unsigned int mt, struct scatterlist *sgl,
>> +                    const unsigned int capacity, unsigned int *offset)
>>  {
>>         struct free_area *area = &zone->free_area[order];
>>         struct list_head *list = &area->free_list[mt];
>> @@ -161,10 +161,10 @@ void __page_reporting_notify(void)
>>          * list processed. This should result in us reporting all pages on
>>          * an idle system in about 30 seconds.
>>          *
>> -        * The division here should be cheap since PAGE_REPORTING_CAPACITY
>> -        * should always be a power of 2.
>> +        * The division here should be cheap since capacity should
>> +        * always be a power of 2.
>>          */
>> -       budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);
>> +       budget = DIV_ROUND_UP(area->nr_free, capacity * 16);
> 
> So the comment here is no longer valid now that capacity has become a
> variable. An alternative to look at if we were to assume the shift
> approach I mentioned above would be to then shift the budget based on
> the reduced capacity.
> 
>>         /* loop through free list adding unreported pages to sg list */
>>         list_for_each_entry_safe(page, next, list, lru) {
>> @@ -196,7 +196,7 @@ void __page_reporting_notify(void)
>>                         --(*offset);
>>                         sg_set_page(&sgl[*offset], page, page_len, 0);
>>
>> -                       nr_pages = (PAGE_REPORTING_CAPACITY - *offset) << order;
>> +                       nr_pages = (capacity - *offset) << order;
>>                         if (zone->reported_pages + nr_pages >= threshold) {
>>                                 err = 1;
>>                                 break;
> 
> Rather than adding a capacity value it might work better to add a
> "reserve" value so that we are just padding the start of the
> scatterlist rather than having to reset it every time we change the
> total capacity of the scatterlist. The advantage to that is that you
> could drop all the changes where you are having to reset the list and
> change the capacity.
> 
> Instead you would just need to update the check to "*offset <=
> reserve" and the call to report/drain so that they take into account
> the reserve offset.
> 
>> @@ -217,10 +217,10 @@ void __page_reporting_notify(void)
>>                 spin_unlock_irq(&zone->lock);
>>
>>                 /* begin processing pages in local list */
>> -               err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);
>> +               err = prdev->report(prdev, sgl, capacity);
>>
> 
> Assuming the change to "reserve" then this would be "&sgl[*offset],
> PAGE_REPORTING_CAPACITY - *offset", or you could look at copying the
> approach taken in the "leftover" path in page_reporting_process_zone.
> 
>>                 /* reset offset since the full list was reported */
>> -               *offset = PAGE_REPORTING_CAPACITY;
>> +               *offset = capacity;
>>
>>                 /* update budget to reflect call to report function */
>>                 budget--;
>> @@ -229,7 +229,7 @@ void __page_reporting_notify(void)
>>                 spin_lock_irq(&zone->lock);
>>
>>                 /* flush reported pages from the sg list */
>> -               page_reporting_drain(prdev, sgl, zone, PAGE_REPORTING_CAPACITY, !err);
>> +               page_reporting_drain(prdev, sgl, zone, capacity, !err);
> 
> Same here. The general idea is you want to avoid having to flush every
> time you want to change the reserve and instead just trigger a flush
> should your offset value fall below what is reserved.
> 
>>                 /*
>>                  * Reset next to first entry, the old next isn't valid
>> @@ -251,12 +251,39 @@ void __page_reporting_notify(void)
>>         return err;
>>  }
> 
> So all of the code below seems to be the result of the added
> complexity I mentioned above due to the capacity being changed rather
> than some portion of the list becoming reserved.
> 
> I think it would be much more interesting to explore the approach of
> just reserving some portion of the start of the scatterlist rather
> than trying to change the capacity. By doing that much of the code
> change seen here can be avoided as you are having to restructure the
> entire list and are introducing other possible issues since one of the
> things I was doing by using the approach I did is always making sure
> the sg_end was already set for the scatterlist end whereas that is
> gone now with these changes.
> 
>> +/*
>> + * For guests with little free memory, we should tune the reporting
>> + * capacity properly to avoid reporting too much at once, otherwise
>> + * user allocations may fail and OOM during the reporting window
>> + * between __isolate_free_page() and page_reporting_drain().
>> + *
>> + * Calculate the order from which we begin to reduce the scatterlist
>> + * capacity, so that we do not isolate so many pages that user
>> + * allocations fail.
>> + */
>> +static unsigned int calculate_zone_order_threshold(struct zone *z)
>> +{
>> +       unsigned int order;
>> +       long pages_threshold;
>> +
>> +       pages_threshold = zone_page_state(z, NR_FREE_PAGES) - low_wmark_pages(z);
>> +       for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
>> +               if ((PAGE_REPORTING_CAPACITY << order) > pages_threshold)
>> +                       break;
>> +       }
>> +
>> +       return order;
>> +}
>> +
>>  static int
>>  page_reporting_process_zone(struct page_reporting_dev_info *prdev,
>>                             struct scatterlist *sgl, struct zone *zone)
>>  {
>> -       unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
>> +       unsigned int order, mt, leftover, offset;
>>         unsigned long watermark, threshold;
>> +       unsigned int capacity = PAGE_REPORTING_CAPACITY;
>> +       unsigned int capacity_curr;
>> +       struct scatterlist *sgl_curr;
>> +       unsigned int order_threshold;
>>         int err = 0;
>>
>>         threshold = atomic_long_read(&zone->managed_pages) * reporting_factor / 100;
>> @@ -274,15 +301,28 @@ void __page_reporting_notify(void)
>>         if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
>>                 return err;
>>
>> +       sgl_curr = sgl;
>> +       capacity_curr = offset = capacity;
>> +       order_threshold = calculate_zone_order_threshold(zone);
>>         /* Process each free list starting from lowest order/mt */
>>         for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
>> +               /* try to reduce unexpected high order's reporting capacity */
>> +               if (order >= order_threshold) {
>> +                       capacity_curr = capacity >> (order - order_threshold + 1);
>> +                       if (capacity_curr == 0)
>> +                               capacity_curr = 1;
>> +                       sgl_curr = sgl + capacity - capacity_curr;
>> +                       offset = capacity_curr;
>> +                       sg_init_table(sgl_curr, capacity_curr);
>> +               }
>> +
> 
> The problem here is you are assuming the order threshold will not
> change during processing. Your order_threshold value could become
> stale while you are processing the zone so I am not sure having it
> provides much value.
> 
> I think we might be better off just assuming that we need to halve
> capacity as the order increases.
> 
>>                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
>>                         /* We do not pull pages from the isolate free list */
>>                         if (is_migrate_isolate(mt))
>>                                 continue;
>>
>>                         err = page_reporting_cycle(prdev, zone, order, mt,
>> -                                                  sgl, &offset);
>> +                                                  sgl_curr, capacity_curr, &offset);
>>                         /* Exceed threshold go to report leftover */
>>                         if (err > 0) {
>>                                 err = 0;
>> @@ -292,18 +332,34 @@ void __page_reporting_notify(void)
>>                         if (err)
>>                                 return err;
>>                 }
>> +
>> +               /* report the leftover pages for next orders with reduced capacity */
>> +               leftover = capacity_curr - offset;
>> +               if (leftover && order + 1 >= order_threshold) {
>> +                       sgl_curr = &sgl_curr[offset];
>> +                       err = prdev->report(prdev, sgl_curr, leftover);
>> +                       offset = capacity_curr;
>> +
>> +                       /* flush any remaining pages out from the last report */
>> +                       spin_lock_irq(&zone->lock);
>> +                       page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
>> +                       spin_unlock_irq(&zone->lock);
>> +
>> +                       if (err)
>> +                               return err;
>> +               }
>>         }
>>
>>  leftover:
>>         /* report the leftover pages before going idle */
>> -       leftover = PAGE_REPORTING_CAPACITY - offset;
>> +       leftover = capacity_curr - offset;
>>         if (leftover) {
>> -               sgl = &sgl[offset];
>> -               err = prdev->report(prdev, sgl, leftover);
>> +               sgl_curr = &sgl_curr[offset];
>> +               err = prdev->report(prdev, sgl_curr, leftover);
>>
>>                 /* flush any remaining pages out from the last report */
>>                 spin_lock_irq(&zone->lock);
>> -               page_reporting_drain(prdev, sgl, zone, leftover, !err);
>> +               page_reporting_drain(prdev, sgl_curr, zone, leftover, !err);
>>                 spin_unlock_irq(&zone->lock);
>>         }
>>
>> @@ -332,9 +388,8 @@ static void page_reporting_process(struct work_struct *work)
>>         if (!sgl)
>>                 goto err_out;
>>
>> -       sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
>> -
>>         for_each_zone(zone) {
>> +               sg_init_table(sgl, PAGE_REPORTING_CAPACITY);
>>                 err = page_reporting_process_zone(prdev, sgl, zone);
>>                 if (err)
>>                         break;
>> --
>> 1.8.3.1
>>
>>

Great, will try to improve it according to your suggestions.
Thanks for all the valuable comments.


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-04-06  6:56 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-26  9:44 [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
2021-03-26  9:44 ` [PATCH 1/4] mm/page_reporting: Introduce free page reported counters Xunlei Pang
2021-04-02 18:29   ` Alexander Duyck
2021-03-26  9:44 ` [PATCH 2/4] mm/page_reporting: Introduce free page reporting factor Xunlei Pang
2021-04-02 18:56   ` Alexander Duyck
2021-04-06  6:53     ` Xunlei Pang
2021-03-26  9:44 ` [PATCH 3/4] mm/page_reporting: Introduce "page_reporting_factor=" boot parameter Xunlei Pang
2021-03-26  9:44 ` [PATCH 4/4] mm/page_reporting: Fix possible user allocation failure Xunlei Pang
2021-04-02 19:55   ` Alexander Duyck
2021-04-06  6:55     ` Xunlei Pang
2021-04-02  4:08 ` [PATCH 0/4] mm/page_reporting: Some knobs and fixes Xunlei Pang
2021-04-02 18:17   ` Alexander Duyck

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).