* [RFC 0/5] disable pcplists during page isolation
@ 2020-09-07 16:36 Vlastimil Babka
  2020-09-07 16:36 ` [RFC 1/5] mm, page_alloc: clean up pageset high and batch update Vlastimil Babka
                   ` (5 more replies)
  0 siblings, 6 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka

As per the discussions [1] [2] this is an attempt to implement David's
suggestion that page isolation should disable pcplists to avoid races. This is
done without extra checks in fast paths, as I mentioned should be possible in
the discussion, and explained in patch 5. Patches 1-4 are preparatory cleanups.

Note this is untested RFC for now. Based on v5.9-rc4 plus Pavel's patch [2]
(slated as a quick fix for mainline+stable).

[1] https://lore.kernel.org/linux-mm/20200901124615.137200-1-pasha.tatashin@soleen.com/
[2] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/

Vlastimil Babka (5):
  mm, page_alloc: clean up pageset high and batch update
  mm, page_alloc: calculate pageset high and batch once per zone
  mm, page_alloc(): remove setup_pageset()
  mm, page_alloc: cache pageset high and batch in struct zone
  mm, page_alloc: disable pcplists during page isolation

 include/linux/gfp.h    |   1 +
 include/linux/mmzone.h |   2 +
 mm/internal.h          |   4 ++
 mm/memory_hotplug.c    |  24 +++----
 mm/page_alloc.c        | 138 ++++++++++++++++++++++-------------------
 mm/page_isolation.c    |  45 +++++++++++---
 6 files changed, 127 insertions(+), 87 deletions(-)

-- 
2.28.0



* [RFC 1/5] mm, page_alloc: clean up pageset high and batch update
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
@ 2020-09-07 16:36 ` Vlastimil Babka
  2020-09-10  8:31   ` Oscar Salvador
  2020-09-07 16:36 ` [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone Vlastimil Babka
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka

The updates to pcplists' high and batch values are handled by multiple
functions, which makes the calculations hard to follow. Consolidate everything
to pageset_set_high_and_batch() and remove pageset_set_batch() and
pageset_set_high() wrappers.

The only special case using one of the removed wrappers was:
build_all_zonelists_init()
  setup_pageset()
    pageset_set_batch()
which was hardcoding batch as 0, so we can just open-code a call to
pageset_update() with constant parameters instead.
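
For reference, a minimal sketch (illustration only, not part of the patch) of why
the open-coded constants come out as high == 0 and batch == 1, by expanding the
removed helper with batch == 0:

	/* pageset_set_batch(p, 0) expands to: */
	pageset_update(&p->pcp, 6 * 0, max(1UL, 1 * 0));
	/* which is equivalent to the open-coded call: */
	pageset_update(&p->pcp, 0, 1);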

No functional change.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/page_alloc.c | 51 +++++++++++++++++++------------------------------
 1 file changed, 20 insertions(+), 31 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fab5e97dc9ca..0b516208afda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5834,7 +5834,7 @@ static void build_zonelists(pg_data_t *pgdat)
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static void setup_pageset(struct per_cpu_pageset *p);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
@@ -5902,7 +5902,7 @@ build_all_zonelists_init(void)
 	 * (a chicken-egg dilemma).
 	 */
 	for_each_possible_cpu(cpu)
-		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+		setup_pageset(&per_cpu(boot_pageset, cpu));
 
 	mminit_verify_zonelist();
 	cpuset_init_current_mems_allowed();
@@ -6218,12 +6218,6 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
 	pcp->batch = batch;
 }
 
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
-{
-	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
-}
-
 static void pageset_init(struct per_cpu_pageset *p)
 {
 	struct per_cpu_pages *pcp;
@@ -6236,35 +6230,30 @@ static void pageset_init(struct per_cpu_pageset *p)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static void setup_pageset(struct per_cpu_pageset *p)
 {
 	pageset_init(p);
-	pageset_set_batch(p, batch);
-}
-
-/*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-static void pageset_set_high(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	unsigned long batch = max(1UL, high / 4);
-	if ((high / 4) > (PAGE_SHIFT * 8))
-		batch = PAGE_SHIFT * 8;
-
-	pageset_update(&p->pcp, high, batch);
+	pageset_update(&p->pcp, 0, 1);
 }
 
 static void pageset_set_high_and_batch(struct zone *zone,
-				       struct per_cpu_pageset *pcp)
+				       struct per_cpu_pageset *p)
 {
-	if (percpu_pagelist_fraction)
-		pageset_set_high(pcp,
-			(zone_managed_pages(zone) /
-				percpu_pagelist_fraction));
-	else
-		pageset_set_batch(pcp, zone_batchsize(zone));
+	unsigned long new_high;
+	unsigned long new_batch;
+	int fraction = READ_ONCE(percpu_pagelist_fraction);
+
+	if (fraction) {
+		new_high = zone_managed_pages(zone) / fraction;
+		new_batch = max(1UL, new_high / 4);
+		if ((new_high / 4) > (PAGE_SHIFT * 8))
+			new_batch = PAGE_SHIFT * 8;
+	} else {
+		new_batch = zone_batchsize(zone);
+		new_high = 6 * new_batch;
+		new_batch = max(1UL, 1 * new_batch);
+	}
+	pageset_update(&p->pcp, new_high, new_batch);
 }
 
 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-- 
2.28.0



* [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
  2020-09-07 16:36 ` [RFC 1/5] mm, page_alloc: clean up pageset high and batch update Vlastimil Babka
@ 2020-09-07 16:36 ` Vlastimil Babka
  2020-09-10  9:00   ` Oscar Salvador
  2020-09-10 10:56   ` David Hildenbrand
  2020-09-07 16:36 ` [RFC 3/5] mm, page_alloc(): remove setup_pageset() Vlastimil Babka
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka

We currently call pageset_set_high_and_batch() for each possible cpu,
which repeats the same calculations of high and batch values.

Instead, call it once per zone and have it apply the calculated values
to all per-cpu pagesets of the zone.

This also allows removing zone_pageset_init() and __zone_pcp_update() wrappers.

No functional change.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/page_alloc.c | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0b516208afda..f669a251f654 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6236,12 +6236,13 @@ static void setup_pageset(struct per_cpu_pageset *p)
 	pageset_update(&p->pcp, 0, 1);
 }
 
-static void pageset_set_high_and_batch(struct zone *zone,
-				       struct per_cpu_pageset *p)
+static void zone_set_pageset_high_and_batch(struct zone *zone)
 {
 	unsigned long new_high;
 	unsigned long new_batch;
 	int fraction = READ_ONCE(percpu_pagelist_fraction);
+	int cpu;
+	struct per_cpu_pageset *p;
 
 	if (fraction) {
 		new_high = zone_managed_pages(zone) / fraction;
@@ -6253,23 +6254,25 @@ static void pageset_set_high_and_batch(struct zone *zone,
 		new_high = 6 * new_batch;
 		new_batch = max(1UL, 1 * new_batch);
 	}
-	pageset_update(&p->pcp, new_high, new_batch);
-}
-
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-{
-	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
 
-	pageset_init(pcp);
-	pageset_set_high_and_batch(zone, pcp);
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(zone->pageset, cpu);
+		pageset_update(&p->pcp, new_high, new_batch);
+	}
 }
 
 void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
+	struct per_cpu_pageset *p;
+
 	zone->pageset = alloc_percpu(struct per_cpu_pageset);
-	for_each_possible_cpu(cpu)
-		zone_pageset_init(zone, cpu);
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(zone->pageset, cpu);
+		pageset_init(p);
+	}
+
+	zone_set_pageset_high_and_batch(zone);
 }
 
 /*
@@ -8002,15 +8005,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
-static void __zone_pcp_update(struct zone *zone)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu)
-		pageset_set_high_and_batch(zone,
-				per_cpu_ptr(zone->pageset, cpu));
-}
-
 /*
  * percpu_pagelist_fraction - changes the pcp->high for each zone on each
  * cpu.  It is the fraction of total pages in each zone that a hot per cpu
@@ -8043,7 +8037,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 		goto out;
 
 	for_each_populated_zone(zone)
-		__zone_pcp_update(zone);
+		zone_set_pageset_high_and_batch(zone);
 out:
 	mutex_unlock(&pcp_batch_high_lock);
 	return ret;
@@ -8659,7 +8653,7 @@ EXPORT_SYMBOL(free_contig_range);
 void __meminit zone_pcp_update(struct zone *zone)
 {
 	mutex_lock(&pcp_batch_high_lock);
-	__zone_pcp_update(zone);
+	zone_set_pageset_high_and_batch(zone);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-- 
2.28.0



* [RFC 3/5] mm, page_alloc(): remove setup_pageset()
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
  2020-09-07 16:36 ` [RFC 1/5] mm, page_alloc: clean up pageset high and batch update Vlastimil Babka
  2020-09-07 16:36 ` [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone Vlastimil Babka
@ 2020-09-07 16:36 ` Vlastimil Babka
  2020-09-10  9:23   ` Oscar Salvador
  2020-09-10 11:00   ` David Hildenbrand
  2020-09-07 16:36 ` [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone Vlastimil Babka
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka

We initialize boot-time pagesets with setup_pageset(), which sets high and
batch values that effectively disable pcplists.

We can remove this wrapper if we just set these values for all pagesets in
pageset_init(). Non-boot pagesets then update them to their specific
values.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/page_alloc.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f669a251f654..a0cab2c6055e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5902,7 +5902,7 @@ build_all_zonelists_init(void)
 	 * (a chicken-egg dilemma).
 	 */
 	for_each_possible_cpu(cpu)
-		setup_pageset(&per_cpu(boot_pageset, cpu));
+		pageset_init(&per_cpu(boot_pageset, cpu));
 
 	mminit_verify_zonelist();
 	cpuset_init_current_mems_allowed();
@@ -6228,12 +6228,13 @@ static void pageset_init(struct per_cpu_pageset *p)
 	pcp = &p->pcp;
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
-}
 
-static void setup_pageset(struct per_cpu_pageset *p)
-{
-	pageset_init(p);
-	pageset_update(&p->pcp, 0, 1);
+	/*
+	 * Set batch and high values safe for a boot pageset. Proper pageset's
+	 * initialization will update them.
+	 */
+	pcp->high = 0;
+	pcp->batch  = 1;
 }
 
 static void zone_set_pageset_high_and_batch(struct zone *zone)
-- 
2.28.0



* [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
                   ` (2 preceding siblings ...)
  2020-09-07 16:36 ` [RFC 3/5] mm, page_alloc(): remove setup_pageset() Vlastimil Babka
@ 2020-09-07 16:36 ` Vlastimil Babka
  2020-09-10 11:30   ` Oscar Salvador
  2020-09-07 16:36 ` [RFC 5/5] mm, page_alloc: disable pcplists during page isolation Vlastimil Babka
  2020-09-08 18:29 ` [RFC 0/5] " David Hildenbrand
  5 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka

All per-cpu pagesets for a zone use the same high and batch values, which are
duplicated there just for performance (locality) reasons. This patch adds the
same variables also to struct zone as the 'central' ones.

This will be useful later for making it possible to disable pcplists temporarily
by setting the high value to 0, while remembering the values for restoring them
later. But we can also immediately benefit from not updating the pagesets of all
possible CPUs when the newly recalculated values (after a sysctl change or
memory online/offline) are unchanged from the previous ones.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mmzone.h |  2 ++
 mm/page_alloc.c        | 18 +++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8379432f4f2f..15582ca368b9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -431,6 +431,8 @@ struct zone {
 #endif
 	struct pglist_data	*zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
+	int pageset_high;
+	int pageset_batch;
 
 #ifndef CONFIG_SPARSEMEM
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0cab2c6055e..004350a2b6ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5834,7 +5834,7 @@ static void build_zonelists(pg_data_t *pgdat)
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static void setup_pageset(struct per_cpu_pageset *p);
+static void pageset_init(struct per_cpu_pageset *p);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 
@@ -6237,7 +6237,7 @@ static void pageset_init(struct per_cpu_pageset *p)
 	pcp->batch  = 1;
 }
 
-static void zone_set_pageset_high_and_batch(struct zone *zone)
+static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update)
 {
 	unsigned long new_high;
 	unsigned long new_batch;
@@ -6256,6 +6256,14 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
 		new_batch = max(1UL, 1 * new_batch);
 	}
 
+	if (zone->pageset_high != new_high ||
+	    zone->pageset_batch != new_batch) {
+		zone->pageset_high = new_high;
+		zone->pageset_batch = new_batch;
+	} else if (!force_update) {
+		return;
+	}
+
 	for_each_possible_cpu(cpu) {
 		p = per_cpu_ptr(zone->pageset, cpu);
 		pageset_update(&p->pcp, new_high, new_batch);
@@ -6273,7 +6281,7 @@ void __meminit setup_zone_pageset(struct zone *zone)
 		pageset_init(p);
 	}
 
-	zone_set_pageset_high_and_batch(zone);
+	zone_set_pageset_high_and_batch(zone, true);
 }
 
 /*
@@ -8038,7 +8046,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 		goto out;
 
 	for_each_populated_zone(zone)
-		zone_set_pageset_high_and_batch(zone);
+		zone_set_pageset_high_and_batch(zone, false);
 out:
 	mutex_unlock(&pcp_batch_high_lock);
 	return ret;
@@ -8654,7 +8662,7 @@ EXPORT_SYMBOL(free_contig_range);
 void __meminit zone_pcp_update(struct zone *zone)
 {
 	mutex_lock(&pcp_batch_high_lock);
-	zone_set_pageset_high_and_batch(zone);
+	zone_set_pageset_high_and_batch(zone, false);
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-- 
2.28.0



* [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
                   ` (3 preceding siblings ...)
  2020-09-07 16:36 ` [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone Vlastimil Babka
@ 2020-09-07 16:36 ` Vlastimil Babka
  2020-09-09 10:48   ` Vlastimil Babka
  2020-09-08 18:29 ` [RFC 0/5] " David Hildenbrand
  5 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-07 16:36 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Vlastimil Babka, Michal Hocko

Page isolation can race with a process freeing pages to pcplists in a way that
a page from an isolated pageblock can end up on a pcplist. This can be fixed by
repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
per-cpu pages again during memory offline" in [1].

David and Michal would prefer that this race was closed in a way that callers
of page isolation don't need to care about drain. David suggested disabling
pcplists usage completely during page isolation, instead of repeatedly draining
them.

To achieve this without adding special cases in alloc/free fastpath, we can use
the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
will be immediately flushed.

The race can thus be closed by setting pcp->high to 0 and draining pcplists
once in start_isolate_page_range(). The draining will serialize after processes
that already disabled interrupts and read the old value of pcp->high in
free_unref_page_commit(), and processes that have not yet disabled interrupts
will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
This guarantees no stray pages on pcplists in zones where isolation happens.
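
For illustration, a lightly abridged excerpt of the existing free path that this
relies on (the tail of free_unref_page_commit() as of v5.9-rc4; shown here only
for orientation, it is not modified by this patch):

	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	list_add(&page->lru, &pcp->lists[migratetype]);
	pcp->count++;
	/* with pcp->high == 0 this is always true, so the page is
	 * immediately flushed back to the buddy allocator */
	if (pcp->count >= pcp->high) {
		unsigned long batch = READ_ONCE(pcp->batch);
		free_pcppages_bulk(zone, batch, pcp);
	}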

We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
and from 1 to 0 (to restore original pcp->high and batch values cached in
struct zone). We have to avoid external updates to high and batch by taking
pcp_batch_high_lock. To allow multiple isolations in parallel, change this
lock from mutex to rwsem.

For callers that pair start_isolate_page_range() with
undo_isolate_page_range() properly, this is transparent. Currently that's
alloc_contig_range(). __offline_pages() doesn't call undo_isolate_page_range()
in the success case, so it has to be careful to handle restoring pcp->high and batch
and unlocking pcp_batch_high_lock.

This commit also changes drain_all_pages() to not trust reading pcp->count during
the drain for page isolation - I believe that could be racy and lead to some CPUs
being missed during the drain. If others agree, this can be separated and
potentially backported.

[1] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/

Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp.h |  1 +
 mm/internal.h       |  4 ++++
 mm/memory_hotplug.c | 24 ++++++++-----------
 mm/page_alloc.c     | 58 +++++++++++++++++++++++++++++----------------
 mm/page_isolation.c | 45 ++++++++++++++++++++++++++++-------
 5 files changed, 89 insertions(+), 43 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 67a0774e080b..cc52c5cc9fab 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -592,6 +592,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
+void __drain_all_pages(struct zone *zone, bool page_isolation);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
diff --git a/mm/internal.h b/mm/internal.h
index 10c677655912..c157af87a9ed 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -7,6 +7,7 @@
 #ifndef __MM_INTERNAL_H
 #define __MM_INTERNAL_H
 
+#include <linux/rwsem.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -201,8 +202,11 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
+extern struct rw_semaphore pcp_batch_high_lock;
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
+extern void zone_update_pageset_high_and_batch(struct zone *zone,
+		unsigned long high, unsigned long batch);
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b11a269e2356..a978ac32279b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1485,6 +1485,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	struct zone *zone;
 	struct memory_notify arg;
 	char *reason;
+	bool unisolated_last = false;
 
 	mem_hotplug_begin();
 
@@ -1575,20 +1576,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 		/* check again */
 		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
 					    NULL, check_pages_isolated_cb);
-		/*
-		 * per-cpu pages are drained in start_isolate_page_range, but if
-		 * there are still pages that are not free, make sure that we
-		 * drain again, because when we isolated range we might
-		 * have raced with another thread that was adding pages to pcp
-		 * list.
-		 *
-		 * Forward progress should be still guaranteed because
-		 * pages on the pcp list can only belong to MOVABLE_ZONE
-		 * because has_unmovable_pages explicitly checks for
-		 * PageBuddy on freed pages on other zones.
-		 */
-		if (ret)
-			drain_all_pages(zone);
 	} while (ret);
 
 	/* Ok, all of our target is isolated.
@@ -1602,8 +1589,17 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	 * pageblocks zone counter here.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
+	if (nr_isolate_pageblock && nr_isolate_pageblock ==
+			zone->nr_isolate_pageblock)
+		unisolated_last = true;
 	zone->nr_isolate_pageblock -= nr_isolate_pageblock;
 	spin_unlock_irqrestore(&zone->lock, flags);
+	if (unisolated_last) {
+		zone_update_pageset_high_and_batch(zone, zone->pageset_high,
+				zone->pageset_batch);
+	}
+	/* pairs with start_isolate_page_range() */
+	up_read(&pcp_batch_high_lock);
 
 	/* removal success */
 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 004350a2b6ca..d82f3bec7953 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,7 +78,7 @@
 #include "page_reporting.h"
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
-static DEFINE_MUTEX(pcp_batch_high_lock);
+DECLARE_RWSEM(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
@@ -2958,14 +2958,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 	preempt_enable();
 }
 
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
- */
-void drain_all_pages(struct zone *zone)
+void __drain_all_pages(struct zone *zone, bool page_isolation)
 {
 	int cpu;
 
@@ -3004,7 +2997,13 @@ void drain_all_pages(struct zone *zone)
 		struct zone *z;
 		bool has_pcps = false;
 
-		if (zone) {
+		if (page_isolation) {
+			/*
+			 * For page isolation, don't trust the racy pcp.count
+			 * check. We need to flush really everything.
+			 */
+			has_pcps = true;
+		} else if (zone) {
 			pcp = per_cpu_ptr(zone->pageset, cpu);
 			if (pcp->pcp.count)
 				has_pcps = true;
@@ -3037,6 +3036,18 @@ void drain_all_pages(struct zone *zone)
 	mutex_unlock(&pcpu_drain_mutex);
 }
 
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+	__drain_all_pages(zone, false);
+}
+
 #ifdef CONFIG_HIBERNATION
 
 /*
@@ -6237,13 +6248,23 @@ static void pageset_init(struct per_cpu_pageset *p)
 	pcp->batch  = 1;
 }
 
+void zone_update_pageset_high_and_batch(struct zone *zone, unsigned long high,
+		unsigned long batch)
+{
+	struct per_cpu_pageset *p;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(zone->pageset, cpu);
+		pageset_update(&p->pcp, high, batch);
+	}
+}
+
 static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update)
 {
 	unsigned long new_high;
 	unsigned long new_batch;
 	int fraction = READ_ONCE(percpu_pagelist_fraction);
-	int cpu;
-	struct per_cpu_pageset *p;
 
 	if (fraction) {
 		new_high = zone_managed_pages(zone) / fraction;
@@ -6264,10 +6285,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update
 		return;
 	}
 
-	for_each_possible_cpu(cpu) {
-		p = per_cpu_ptr(zone->pageset, cpu);
-		pageset_update(&p->pcp, new_high, new_batch);
-	}
+	zone_update_pageset_high_and_batch(zone, new_high, new_batch);
 }
 
 void __meminit setup_zone_pageset(struct zone *zone)
@@ -8026,7 +8044,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	int old_percpu_pagelist_fraction;
 	int ret;
 
-	mutex_lock(&pcp_batch_high_lock);
+	down_write(&pcp_batch_high_lock);
 	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
 
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -8048,7 +8066,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	for_each_populated_zone(zone)
 		zone_set_pageset_high_and_batch(zone, false);
 out:
-	mutex_unlock(&pcp_batch_high_lock);
+	up_write(&pcp_batch_high_lock);
 	return ret;
 }
 
@@ -8661,9 +8679,9 @@ EXPORT_SYMBOL(free_contig_range);
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	mutex_lock(&pcp_batch_high_lock);
+	down_write(&pcp_batch_high_lock);
 	zone_set_pageset_high_and_batch(zone, false);
-	mutex_unlock(&pcp_batch_high_lock);
+	up_write(&pcp_batch_high_lock);
 }
 
 void zone_pcp_reset(struct zone *zone)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 63a3db10a8c0..ceada64abd1f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -21,6 +21,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	struct zone *zone;
 	unsigned long flags;
 	int ret = -EBUSY;
+	bool first_isolated_pageblock = false;
 
 	zone = page_zone(page);
 
@@ -45,6 +46,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 
 		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		zone->nr_isolate_pageblock++;
+		if (zone->nr_isolate_pageblock == 1)
+			first_isolated_pageblock = true;
 		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
 									NULL);
 
@@ -54,8 +57,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret) {
-		drain_all_pages(zone);
+	if (!ret && first_isolated_pageblock) {
+		zone_update_pageset_high_and_batch(zone, 0, 1);
+		__drain_all_pages(zone, true);
 	} else {
 		WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
 
@@ -78,6 +82,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	unsigned int order;
 	unsigned long pfn, buddy_pfn;
 	struct page *buddy;
+	bool unisolated_last = false;
 
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
@@ -120,8 +125,14 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	if (isolated_page)
 		__putback_isolated_page(page, order, migratetype);
 	zone->nr_isolate_pageblock--;
+	if (zone->nr_isolate_pageblock == 0)
+		unisolated_last = true;
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
+	if (unisolated_last) {
+		zone_update_pageset_high_and_batch(zone, zone->pageset_high,
+				zone->pageset_batch);
+	}
 }
 
 static inline struct page *
@@ -170,14 +181,17 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * pageblocks we may have modified and return -EBUSY to caller. This
  * prevents two threads from simultaneously working on overlapping ranges.
  *
- * Please note that there is no strong synchronization with the page allocator
- * either. Pages might be freed while their page blocks are marked ISOLATED.
- * In some cases pages might still end up on pcp lists and that would allow
- * for their allocation even when they are in fact isolated already. Depending
- * on how strong of a guarantee the caller needs drain_all_pages might be needed
- * (e.g. __offline_pages will need to call it after check for isolated range for
- * a next retry).
+ * To synchronize with page allocator users freeing pages on the pcplists, we
+ * disable them by setting their allowed usage (pcp->high) to 0, and issue a
+ * drain. This is only needed when isolating the first pageblock of a zone.
  *
+ * Successful call to start_isolate_page_range() has to be paired with
+ * undo_isolate_page_range() for proper accounting of zone->nr_isolate_pageblock
+ * (which controls pcplist enabling/disabling discussed above, including
+ * handling of pcp_batch_high_lock).
+ * If undo_isolate_page_range() is not used, this has to be handled manually
+ * by caller.
+ * 
  * Return: the number of isolated pageblocks on success and -EBUSY if any part
  * of range cannot be isolated.
  */
@@ -192,6 +206,13 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
 
+	/*
+	 * We are going to change pcplists' high and batch values temporarily,
+	 * so block any updates via sysctl. Caller must unlock by
+	 * undo_isolate_page_range() or finish_isolate_page_range().
+	 */
+	down_read(&pcp_batch_high_lock);
+
 	for (pfn = start_pfn;
 	     pfn < end_pfn;
 	     pfn += pageblock_nr_pages) {
@@ -215,6 +236,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		unset_migratetype_isolate(page, migratetype);
 	}
 
+	up_read(&pcp_batch_high_lock);
+
 	return -EBUSY;
 }
 
@@ -238,7 +261,11 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			continue;
 		unset_migratetype_isolate(page, migratetype);
 	}
+
+	up_read(&pcp_batch_high_lock);
 }
+
+
 /*
  * Test all pages in the range is free(means isolated) or not.
  * all pages in [start_pfn...end_pfn) must be in the same zone.
-- 
2.28.0



* Re: [RFC 0/5] disable pcplists during page isolation
  2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
                   ` (4 preceding siblings ...)
  2020-09-07 16:36 ` [RFC 5/5] mm, page_alloc: disable pcplists during page isolation Vlastimil Babka
@ 2020-09-08 18:29 ` David Hildenbrand
  2020-09-09 10:54   ` Vlastimil Babka
  5 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-09-08 18:29 UTC (permalink / raw)
  To: Vlastimil Babka, linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 07.09.20 18:36, Vlastimil Babka wrote:
> As per the discussions [1] [2] this is an attempt to implement David's
> suggestion that page isolation should disable pcplists to avoid races. This is
> done without extra checks in fast paths, as I mentioned should be possible in
> the discussion, and explained in patch 5. Patches 1-4 are preparatory cleanups.
> 
> Note this is untested RFC for now. Based on v5.9-rc4 plus Pavel's patch [2]
> (slated as a quick fix for mainline+stable).
> 
> [1] https://lore.kernel.org/linux-mm/20200901124615.137200-1-pasha.tatashin@soleen.com/
> [2] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/
> 
> Vlastimil Babka (5):
>   mm, page_alloc: clean up pageset high and batch update
>   mm, page_alloc: calculate pageset high and batch once per zone
>   mm, page_alloc(): remove setup_pageset()
>   mm, page_alloc: cache pageset high and batch in struct zone
>   mm, page_alloc: disable pcplists during page isolation
> 
>  include/linux/gfp.h    |   1 +
>  include/linux/mmzone.h |   2 +
>  mm/internal.h          |   4 ++
>  mm/memory_hotplug.c    |  24 +++----
>  mm/page_alloc.c        | 138 ++++++++++++++++++++++-------------------
>  mm/page_isolation.c    |  45 +++++++++++---
>  6 files changed, 127 insertions(+), 87 deletions(-)
> 

Thanks for looking into this! Just a heads-up that -mm and -next contain
some changes to memory hotplug code, whereby new pageblocks start out in
MIGRATE_ISOLATE when onlining, until we're done with the heavy lifting.
Might require some tweaks, similar to when isolating pageblocks.

Will dive into this in the following days. What's your general
perception of the performance aspects?

-- 
Thanks,

David / dhildenb



* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-07 16:36 ` [RFC 5/5] mm, page_alloc: disable pcplists during page isolation Vlastimil Babka
@ 2020-09-09 10:48   ` Vlastimil Babka
  2020-09-09 11:36     ` Michal Hocko
  0 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-09 10:48 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim, Michal Hocko

Here's a version that will apply on top of next-20200908. The first 4 patches need no change.

----8<----
From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 7 Sep 2020 17:20:39 +0200
Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation

Page isolation can race with a process freeing pages to pcplists in a way that
a page from an isolated pageblock can end up on a pcplist. This can be fixed by
repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
per-cpu pages again during memory offline" in [1].

David and Michal would prefer that this race was closed in a way that callers
of page isolation don't need to care about drain. David suggested disabling
pcplists usage completely during page isolation, instead of repeatedly draining
them.

To achieve this without adding special cases in alloc/free fastpath, we can use
the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
will be immediately flushed.

The race can thus be closed by setting pcp->high to 0 and draining pcplists
once in start_isolate_page_range(). The draining will serialize after processes
that already disabled interrupts and read the old value of pcp->high in
free_unref_page_commit(), and processes that have not yet disabled interrupts
will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
This guarantees no stray pages on pcplists in zones where isolation happens.

We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
and from 1 to 0 (to restore original pcp->high and batch values cached in
struct zone). We have to avoid external updates to high and batch by taking
pcp_batch_high_lock. To allow multiple isolations in parallel, change this
lock from mutex to rwsem.

For callers that pair start_isolate_page_range() with
undo_isolate_page_range() properly, this is transparent. Currently that's
alloc_contig_range(). __offline_pages() doesn't call undo_isolate_page_range()
in the success case, so it has to be careful to handle restoring pcp->high and batch
and unlocking pcp_batch_high_lock.

This commit also changes drain_all_pages() to not trust reading pcp->count during
the drain for page isolation - I believe that could be racy and lead to some CPUs
being missed during the drain. If others agree, this can be separated and
potentially backported.

[1] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/

Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/gfp.h |  1 +
 mm/internal.h       |  4 +++
 mm/memory_hotplug.c | 55 ++++++++++++++++++++++++++++-------------
 mm/page_alloc.c     | 60 +++++++++++++++++++++++++++++----------------
 mm/page_isolation.c | 45 ++++++++++++++++++++++++++++------
 5 files changed, 119 insertions(+), 46 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 67a0774e080b..cc52c5cc9fab 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -592,6 +592,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
+void __drain_all_pages(struct zone *zone, bool page_isolation);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
diff --git a/mm/internal.h b/mm/internal.h
index ab4beb7c5cd2..149822747db7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -7,6 +7,7 @@
 #ifndef __MM_INTERNAL_H
 #define __MM_INTERNAL_H
 
+#include <linux/rwsem.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -196,8 +197,11 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern int user_min_free_kbytes;
 
+extern struct rw_semaphore pcp_batch_high_lock;
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
+extern void zone_update_pageset_high_and_batch(struct zone *zone,
+		unsigned long high, unsigned long batch);
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index baded53b9ff9..c433565a782c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -803,6 +803,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	int need_zonelists_rebuild = 0;
 	int ret;
 	struct memory_notify arg;
+	bool first_isolated_pageblock = false;
 
 	/* We can only online full sections (e.g., SECTION_IS_ONLINE) */
 	if (WARN_ON_ONCE(!nr_pages ||
@@ -826,9 +827,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 
 	/*
 	 * Fixup the number of isolated pageblocks before marking the sections
-	 * onlining, such that undo_isolate_page_range() works correctly.
+	 * onlining, such that undo_isolate_page_range() works correctly. We
+	 * also take pcp_batch_high_lock that pairs with the unlock there.
 	 */
+	down_read(&pcp_batch_high_lock);
 	spin_lock_irqsave(&zone->lock, flags);
+	if (!zone->nr_isolate_pageblock)
+		first_isolated_pageblock = true;
 	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
 	spin_unlock_irqrestore(&zone->lock, flags);
 
@@ -842,6 +847,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 		setup_zone_pageset(zone);
 	}
 
+	/* If we are adding the first isolated pageblocks, we have to disable
+	 * pcplists (if the zone is already populated) and drain them, same as
+	 * set_migratetype_isolate() would. While the MIGRATE_ISOLATE pages we
+	 * just added cannot be on the pcplist, there might be another page
+	 * isolation user racing, which might need the drain, and wouldn't do
+	 * it if nr_isolate_pageblock was already non-zero.
+	 */
+	if (first_isolated_pageblock) {
+		zone_update_pageset_high_and_batch(zone, 0, 1);
+		__drain_all_pages(zone, true);
+	}
+
 	online_pages_range(pfn, nr_pages);
 	zone->present_pages += nr_pages;
 
@@ -852,11 +869,17 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	node_states_set_node(nid, &arg);
 	if (need_zonelists_rebuild)
 		build_all_zonelists(NULL);
-	zone_pcp_update(zone);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
 	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
 
+	/*
+	 * Adjust pcplist high and batch based on new zone's size. This takes
+	 * pcp_batch_high_lock for write, so we have to do that after
+	 * undo_isolate_page_range() unlocks it for read.
+	 */
+	zone_pcp_update(zone);
+
 	/*
 	 * When exposing larger, physically contiguous memory areas to the
 	 * buddy, shuffling in the buddy (when freeing onlined pages, putting
@@ -1472,6 +1495,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	struct memory_notify arg;
 	int ret, node;
 	char *reason;
+	unsigned long nr_isolate_pageblock = nr_pages / pageblock_nr_pages;
+	bool unisolated_last = false;
 
 	/* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
 	if (WARN_ON_ONCE(!nr_pages ||
@@ -1564,21 +1589,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 			goto failed_removal_isolated;
 		}
 
-		/*
-		 * per-cpu pages are drained in start_isolate_page_range, but if
-		 * there are still pages that are not free, make sure that we
-		 * drain again, because when we isolated range we might
-		 * have raced with another thread that was adding pages to pcp
-		 * list.
-		 *
-		 * Forward progress should be still guaranteed because
-		 * pages on the pcp list can only belong to MOVABLE_ZONE
-		 * because has_unmovable_pages explicitly checks for
-		 * PageBuddy on freed pages on other zones.
-		 */
 		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
-		if (ret)
-			drain_all_pages(zone);
+
 	} while (ret);
 
 	/* Mark all sections offline and remove free pages from the buddy. */
@@ -1591,8 +1603,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 	 * of isolated pageblocks, memory onlining will properly revert this.
 	 */
 	spin_lock_irqsave(&zone->lock, flags);
-	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
+	if (nr_isolate_pageblock && nr_isolate_pageblock ==
+			zone->nr_isolate_pageblock)
+		unisolated_last = true;
+	zone->nr_isolate_pageblock -= nr_isolate_pageblock;
 	spin_unlock_irqrestore(&zone->lock, flags);
+	if (unisolated_last) {
+		zone_update_pageset_high_and_batch(zone, zone->pageset_high,
+				zone->pageset_batch);
+	}
+	/* pairs with start_isolate_page_range() */
+	up_read(&pcp_batch_high_lock);
 
 	/* removal success */
 	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 72922ef2d7cb..defefed79cfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,7 +78,7 @@
 #include "page_reporting.h"
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
-static DEFINE_MUTEX(pcp_batch_high_lock);
+DECLARE_RWSEM(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION	(8)
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
@@ -2958,14 +2958,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 	preempt_enable();
 }
 
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
- */
-void drain_all_pages(struct zone *zone)
+void __drain_all_pages(struct zone *zone, bool page_isolation)
 {
 	int cpu;
 
@@ -3004,7 +2997,13 @@ void drain_all_pages(struct zone *zone)
 		struct zone *z;
 		bool has_pcps = false;
 
-		if (zone) {
+		if (page_isolation) {
+			/*
+			 * For page isolation, don't trust the racy pcp.count
+			 * check. We need to flush really everything.
+			 */
+			has_pcps = true;
+		} else if (zone) {
 			pcp = per_cpu_ptr(zone->pageset, cpu);
 			if (pcp->pcp.count)
 				has_pcps = true;
@@ -3037,6 +3036,18 @@ void drain_all_pages(struct zone *zone)
 	mutex_unlock(&pcpu_drain_mutex);
 }
 
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+	__drain_all_pages(zone, false);
+}
+
 #ifdef CONFIG_HIBERNATION
 
 /*
@@ -3131,7 +3142,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
-	if (pcp->count >= pcp->high) {
+	if (pcp->count >= READ_ONCE(pcp->high)) {
 		unsigned long batch = READ_ONCE(pcp->batch);
 		free_pcppages_bulk(zone, batch, pcp);
 	}
@@ -6228,13 +6239,23 @@ static void pageset_init(struct per_cpu_pageset *p)
 	pcp->batch  = 1;
 }
 
+void zone_update_pageset_high_and_batch(struct zone *zone, unsigned long high,
+		unsigned long batch)
+{
+	struct per_cpu_pageset *p;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(zone->pageset, cpu);
+		pageset_update(&p->pcp, high, batch);
+	}
+}
+
 static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update)
 {
 	unsigned long new_high;
 	unsigned long new_batch;
 	int fraction = READ_ONCE(percpu_pagelist_fraction);
-	int cpu;
-	struct per_cpu_pageset *p;
 
 	if (fraction) {
 		new_high = zone_managed_pages(zone) / fraction;
@@ -6255,10 +6276,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update
 		return;
 	}
 
-	for_each_possible_cpu(cpu) {
-		p = per_cpu_ptr(zone->pageset, cpu);
-		pageset_update(&p->pcp, new_high, new_batch);
-	}
+	zone_update_pageset_high_and_batch(zone, new_high, new_batch);
 }
 
 void __meminit setup_zone_pageset(struct zone *zone)
@@ -8016,7 +8034,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	int old_percpu_pagelist_fraction;
 	int ret;
 
-	mutex_lock(&pcp_batch_high_lock);
+	down_write(&pcp_batch_high_lock);
 	old_percpu_pagelist_fraction = percpu_pagelist_fraction;
 
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -8038,7 +8056,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
 	for_each_populated_zone(zone)
 		zone_set_pageset_high_and_batch(zone, false);
 out:
-	mutex_unlock(&pcp_batch_high_lock);
+	up_write(&pcp_batch_high_lock);
 	return ret;
 }
 
@@ -8642,9 +8660,9 @@ EXPORT_SYMBOL(free_contig_range);
  */
 void __meminit zone_pcp_update(struct zone *zone)
 {
-	mutex_lock(&pcp_batch_high_lock);
+	down_write(&pcp_batch_high_lock);
 	zone_set_pageset_high_and_batch(zone, false);
-	mutex_unlock(&pcp_batch_high_lock);
+	up_write(&pcp_batch_high_lock);
 }
 
 void zone_pcp_reset(struct zone *zone)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index abfe26ad59fd..391091a73355 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -20,6 +20,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	struct zone *zone = page_zone(page);
 	struct page *unmovable;
 	unsigned long flags;
+	bool first_isolated_pageblock = false;
 
 	spin_lock_irqsave(&zone->lock, flags);
 
@@ -44,12 +45,17 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 
 		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		zone->nr_isolate_pageblock++;
+		if (zone->nr_isolate_pageblock == 1)
+			first_isolated_pageblock = true;
 		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
 									NULL);
 
 		__mod_zone_freepage_state(zone, -nr_pages, mt);
 		spin_unlock_irqrestore(&zone->lock, flags);
-		drain_all_pages(zone);
+		if (first_isolated_pageblock) {
+			zone_update_pageset_high_and_batch(zone, 0, 1);
+			__drain_all_pages(zone, true);
+		}
 		return 0;
 	}
 
@@ -73,6 +79,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	unsigned int order;
 	unsigned long pfn, buddy_pfn;
 	struct page *buddy;
+	bool unisolated_last = false;
 
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
@@ -115,8 +122,14 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	if (isolated_page)
 		__putback_isolated_page(page, order, migratetype);
 	zone->nr_isolate_pageblock--;
+	if (zone->nr_isolate_pageblock == 0)
+		unisolated_last = true;
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
+	if (unisolated_last) {
+		zone_update_pageset_high_and_batch(zone, zone->pageset_high,
+				zone->pageset_batch);
+	}
 }
 
 static inline struct page *
@@ -165,13 +178,16 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * pageblocks we may have modified and return -EBUSY to caller. This
  * prevents two threads from simultaneously working on overlapping ranges.
  *
- * Please note that there is no strong synchronization with the page allocator
- * either. Pages might be freed while their page blocks are marked ISOLATED.
- * In some cases pages might still end up on pcp lists and that would allow
- * for their allocation even when they are in fact isolated already. Depending
- * on how strong of a guarantee the caller needs drain_all_pages might be needed
- * (e.g. __offline_pages will need to call it after check for isolated range for
- * a next retry).
+ * To synchronize with page allocator users freeing pages on the pcplists, we
+ * disable them by setting their allowed usage (pcp->high) to 0, and issue a
+ * drain. This is only needed when isolating the first pageblock of a zone.
+ *
+ * Successful call to start_isolate_page_range() has to be paired with
+ * undo_isolate_page_range() for proper accounting of zone->nr_isolate_pageblock
+ * (which controls pcplist enabling/disabling discussed above, including
+ * handling of pcp_batch_high_lock).
+ * If undo_isolate_page_range() is not used, this has to be handled manually
+ * by caller.
  *
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
@@ -185,6 +201,13 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
 	BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
 
+	/*
+	 * We are going to change pcplists' high and batch values temporarily,
+	 * so block any updates via sysctl. Caller must unlock by
+	 * undo_isolate_page_range() or finish_isolate_page_range().
+	 */
+	down_read(&pcp_batch_high_lock);
+
 	for (pfn = start_pfn;
 	     pfn < end_pfn;
 	     pfn += pageblock_nr_pages) {
@@ -207,6 +230,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		unset_migratetype_isolate(page, migratetype);
 	}
 
+	up_read(&pcp_batch_high_lock);
+
 	return -EBUSY;
 }
 
@@ -230,7 +255,11 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			continue;
 		unset_migratetype_isolate(page, migratetype);
 	}
+
+	up_read(&pcp_batch_high_lock);
 }
+
+
 /*
  * Test all pages in the range is free(means isolated) or not.
  * all pages in [start_pfn...end_pfn) must be in the same zone.
-- 
2.28.0




* Re: [RFC 0/5] disable pcplists during page isolation
  2020-09-08 18:29 ` [RFC 0/5] " David Hildenbrand
@ 2020-09-09 10:54   ` Vlastimil Babka
  2020-09-09 11:27     ` osalvador
  0 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-09 10:54 UTC (permalink / raw)
  To: David Hildenbrand, linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 9/8/20 8:29 PM, David Hildenbrand wrote:
> On 07.09.20 18:36, Vlastimil Babka wrote:
>> As per the discussions [1] [2] this is an attempt to implement David's
>> suggestion that page isolation should disable pcplists to avoid races. This is
>> done without extra checks in fast paths, as I mentioned should be possible in
>> the discussion, and explained in patch 5. Patches 1-4 are preparatory cleanups.
>> 
>> Note this is untested RFC for now. Based on v5.9-rc4 plus Pavel's patch [2]
>> (slated as a quick fix for mainline+stable).
>> 
>> [1] https://lore.kernel.org/linux-mm/20200901124615.137200-1-pasha.tatashin@soleen.com/
>> [2] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/
>> 
>> Vlastimil Babka (5):
>>   mm, page_alloc: clean up pageset high and batch update
>>   mm, page_alloc: calculate pageset high and batch once per zone
>>   mm, page_alloc(): remove setup_pageset()
>>   mm, page_alloc: cache pageset high and batch in struct zone
>>   mm, page_alloc: disable pcplists during page isolation
>> 
>>  include/linux/gfp.h    |   1 +
>>  include/linux/mmzone.h |   2 +
>>  mm/internal.h          |   4 ++
>>  mm/memory_hotplug.c    |  24 +++----
>>  mm/page_alloc.c        | 138 ++++++++++++++++++++++-------------------
>>  mm/page_isolation.c    |  45 +++++++++++---
>>  6 files changed, 127 insertions(+), 87 deletions(-)
>> 
> 
> Thanks for looking into this! Just a heads-up that -mm and -next contain
> some changes to memory hotplug code, whereby new pageblocks start out in
> MIGRATE_ISOLATE when onlining, until we're done with the heavy lifting.
> Might require some tweaks, similar to when isolating pageblocks.

Thanks for the heads-up. I've posted an updated patch 5/5 for -next as a reply
to the first one. It was a bit tricky to order everything correctly in
online_pages(); hopefully I avoided any deadlocks.

> Will dive into this in the following days. What's you're general
> perception of performance aspects?

Thanks! I expect no performance change while no isolation is in progress, as
there are no new tests added in alloc/free paths. During page isolation there's
a single drain instead of once-per-pageblock, which is a benefit. But the
pcplists are effectively disabled for the whole of online_pages(),
offline_pages() or alloc_contig_range(), which will affect parallel page
allocator users. It depends on how long these operations take and how heavy the
parallel usage is, so I have no good answers. Might be similar to the current
periodic drain.
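
For context, a simplified sketch (based on __rmqueue_pcplist() around v5.9; the
exact code is not part of this series) of why allocations also effectively bypass
the pcplists while pcp->high == 0 and pcp->batch == 1 - each allocation refills at
most one page and therefore ends up taking zone->lock in rmqueue_bulk():

	if (list_empty(list)) {
		/* with pcp->batch == 1 this refills a single page from the buddy */
		pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list,
					   migratetype, alloc_flags);
		if (unlikely(list_empty(list)))
			return NULL;
	}
	page = list_first_entry(list, struct page, lru);
	list_del(&page->lru);
	pcp->count--;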


* Re: [RFC 0/5] disable pcplists during page isolation
  2020-09-09 10:54   ` Vlastimil Babka
@ 2020-09-09 11:27     ` osalvador
  2020-09-09 11:29       ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: osalvador @ 2020-09-09 11:27 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: David Hildenbrand, linux-mm, linux-kernel, Michal Hocko,
	Pavel Tatashin, Joonsoo Kim

On 2020-09-09 12:54, Vlastimil Babka wrote:
> Thanks! I expect no performance change while no isolation is in 
> progress, as
> there are no new tests added in alloc/free paths. During page isolation 
> there's
> a single drain instead of once-per-pageblock, which is a benefit. But 
> the
> pcplists are effectively disabled for the whole of online_pages(),
> offline_pages() or alloc_contig_range(), which will affect parallel 
> page
> allocator users. It depends on how long these operations take and how 
> heavy the
> parallel usage is, so I have no good answers. Might be similar to the 
> current
> periodic drain.

I have seen some systems take quite some time when offlining sections,
due to the migration of the respective pages not being that smooth and
do_migrate_range() having to do some spins.
But to be fair, online_pages and offline_pages are not routines that get
called that often, and we would be safe to assume that memory-hotplug
operations are not constantly happening, but are rather one-off
operations.

I am not sure about Xen and HV, IIRC Xen was using online_pages and 
offline_pages routines to do the ballooning?

I will dive into this in the following days. Thanks for the work, Vlastimil.




* Re: [RFC 0/5] disable pcplists during page isolation
  2020-09-09 11:27     ` osalvador
@ 2020-09-09 11:29       ` David Hildenbrand
  0 siblings, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-09 11:29 UTC (permalink / raw)
  To: osalvador, Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin, Joonsoo Kim

On 09.09.20 13:27, osalvador@suse.de wrote:
> On 2020-09-09 12:54, Vlastimil Babka wrote:
>> Thanks! I expect no performance change while no isolation is in 
>> progress, as
>> there are no new tests added in alloc/free paths. During page isolation 
>> there's
>> a single drain instead of once-per-pageblock, which is a benefit. But 
>> the
>> pcplists are effectively disabled for the whole of online_pages(),
>> offline_pages() or alloc_contig_range(), which will affect parallel 
>> page
>> allocator users. It depends on how long these operations take and how 
>> heavy the
>> parallel usage is, so I have no good answers. Might be similar to the 
>> current
>> periodic drain.
> 
> I have seen some systems taking quite some time when offlining sections 
> due to the migration of
> the respective pages not being that smooth and having do_migrate_range 
> to do some spins.
> But to be fair, online_pages and offline_pages are not routines that get 
> called that often, and we would be safe to assume that memory-hotplug 
> operations are not constantly happening, but are rather one-offs 
> operations.
> 
> I am not sure about Xen and HV, IIRC Xen was using online_pages and 
> offline_pages routines to do the ballooning?

No, they only add more memory blocks and online them. Memory hot(un)plug
is an expensive operation already, I don't think that's an issue.

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 10:48   ` Vlastimil Babka
@ 2020-09-09 11:36     ` Michal Hocko
  2020-09-09 11:44       ` David Hildenbrand
                         ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Michal Hocko @ 2020-09-09 11:36 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim

On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
> 
> ----8<----
> >From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <vbabka@suse.cz>
> Date: Mon, 7 Sep 2020 17:20:39 +0200
> Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation
> 
> Page isolation can race with process freeing pages to pcplists in a way that
> a page from isolated pageblock can end up on pcplist. This can be fixed by
> repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
> per-cpu pages again during memory offline" in [1].
> 
> David and Michal would prefer that this race was closed in a way that callers
> of page isolation don't need to care about drain. David suggested disabling
> pcplists usage completely during page isolation, instead of repeatedly draining
> them.
> 
> To achieve this without adding special cases in alloc/free fastpath, we can use
> the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
> will be immediately flushed.
> 
> The race can thus be closed by setting pcp->high to 0 and draining pcplists
> once in start_isolate_page_range(). The draining will serialize after processes
> that already disabled interrupts and read the old value of pcp->high in
> free_unref_page_commit(), and processes that have not yet disabled interrupts,
> will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
> This guarantees no stray pages on pcplists in zones where isolation happens.
> 
> We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
> to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
> and from 1 to 0 (to restore original pcp->high and batch values cached in
> struct zone). We have to avoid external updates to high and batch by taking
> pcp_batch_high_lock. To allow multiple isolations in parallel, change this
> lock from mutex to rwsem.
> 
> For callers that pair start_isolate_page_range() with
> undo_isolated_page_range() properly, this is transparent. Currently that's
> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
> in the success case, so it has to be careful to handle restoring pcp->high and batch
> and unlocking pcp_batch_high_lock.

I was hoping that it would be possible to have this completely hidden
inside start_isolate_page_range code path. If we need some sort of
disable_pcp_free/enable_pcp_free then it seems like a better fit to have
an explicit API for that (the naming would be obviously different
because we do not want to call out pcp free lists). I strongly suspect
> that only the memory hotplug really cares for this hard guarantee.
alloc_contig_range simply goes with EBUSY.
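
To make that concrete, a purely hypothetical sketch - the names below are
invented (and as said, the real naming should not call out pcp free lists):

	/* hypothetical API sketch, invented names */
	void zone_disable_pcplists(struct zone *zone);	/* pcp high/batch -> 0/1, then drain */
	void zone_enable_pcplists(struct zone *zone);	/* restore the cached high/batch */

Memory hotplug would wrap offlining in these calls explicitly, while
alloc_contig_range() keeps going with -EBUSY.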
 
> This commit also changes drain_all_pages() to not trust reading pcp->count during
> drain for page isolation - I believe that could be racy and lead to missing some
> cpu's to drain. If others agree, this can be separated and potentially backported.
> 
> [1] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/
> 
> Suggested-by: David Hildenbrand <david@redhat.com>
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  include/linux/gfp.h |  1 +
>  mm/internal.h       |  4 +++
>  mm/memory_hotplug.c | 55 ++++++++++++++++++++++++++++-------------
>  mm/page_alloc.c     | 60 +++++++++++++++++++++++++++++----------------
>  mm/page_isolation.c | 45 ++++++++++++++++++++++++++++------
>  5 files changed, 119 insertions(+), 46 deletions(-)

This has turned out much larger than I would expect.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 11:36     ` Michal Hocko
@ 2020-09-09 11:44       ` David Hildenbrand
  2020-09-09 12:57         ` David Hildenbrand
  2020-09-09 11:55       ` Vlastimil Babka
  2020-09-09 12:53         ` Pavel Tatashin
  2 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-09-09 11:44 UTC (permalink / raw)
  To: Michal Hocko, Vlastimil Babka
  Cc: linux-mm, linux-kernel, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 09.09.20 13:36, Michal Hocko wrote:
> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>>
>> ----8<----
>> >From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
>> From: Vlastimil Babka <vbabka@suse.cz>
>> Date: Mon, 7 Sep 2020 17:20:39 +0200
>> Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation
>>
>> Page isolation can race with process freeing pages to pcplists in a way that
>> a page from isolated pageblock can end up on pcplist. This can be fixed by
>> repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
>> per-cpu pages again during memory offline" in [1].
>>
>> David and Michal would prefer that this race was closed in a way that callers
>> of page isolation don't need to care about drain. David suggested disabling
>> pcplists usage completely during page isolation, instead of repeatedly draining
>> them.
>>
>> To achieve this without adding special cases in alloc/free fastpath, we can use
>> the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
>> will be immediately flushed.
>>
>> The race can thus be closed by setting pcp->high to 0 and draining pcplists
>> once in start_isolate_page_range(). The draining will serialize after processes
>> that already disabled interrupts and read the old value of pcp->high in
>> free_unref_page_commit(), and processes that have not yet disabled interrupts,
>> will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
>> This guarantees no stray pages on pcplists in zones where isolation happens.
>>
>> We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
>> to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
>> and from 1 to 0 (to restore original pcp->high and batch values cached in
>> struct zone). We have to avoid external updates to high and batch by taking
>> pcp_batch_high_lock. To allow multiple isolations in parallel, change this
>> lock from mutex to rwsem.
>>
>> For callers that pair start_isolate_page_range() with
>> undo_isolated_page_range() properly, this is transparent. Currently that's
>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>> and unlocking pcp_batch_high_lock.
> 
> I was hoping that it would be possible to have this completely hidden
> inside start_isolate_page_range code path. If we need some sort of
> disable_pcp_free/enable_pcp_free then it seems like a better fit to have
> an explicit API for that (the naming would be obviously different
> because we do not want to call out pcp free lists). I strongly suspect
> that only the memory hotplug really cares for this hard guarantee.
> alloc_contig_range simply goes with EBUSY.

There will be different alloc_contig_range() demands in the future: try
fast (e.g., loads of small CMA allocations) vs. try hard (e.g.,
virtio-mem). We can add ways to specify that.
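
Something along these lines, purely hypothetical at this point (the flag name
and the extra argument are invented just for illustration):

	/* hypothetical sketch - no such flag exists today */
	#define ACR_TRY_HARD	0x1	/* e.g. virtio-mem: drain/retry aggressively */

	int alloc_contig_range(unsigned long start, unsigned long end,
			       unsigned migratetype, gfp_t gfp_mask
			       /* , unsigned int acr_flags */);

Callers like CMA would leave the flag off and keep failing fast with -EBUSY.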

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 11:36     ` Michal Hocko
  2020-09-09 11:44       ` David Hildenbrand
@ 2020-09-09 11:55       ` Vlastimil Babka
  2020-09-10 10:29         ` David Hildenbrand
  2020-09-09 12:53         ` Pavel Tatashin
  2 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-09 11:55 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, linux-kernel, Pavel Tatashin, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim

On 9/9/20 1:36 PM, Michal Hocko wrote:
> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>> 
>> ----8<----
>> >From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
>> From: Vlastimil Babka <vbabka@suse.cz>
>> Date: Mon, 7 Sep 2020 17:20:39 +0200
>> Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation
>> 
>> Page isolation can race with process freeing pages to pcplists in a way that
>> a page from isolated pageblock can end up on pcplist. This can be fixed by
>> repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
>> per-cpu pages again during memory offline" in [1].
>> 
>> David and Michal would prefer that this race was closed in a way that callers
>> of page isolation don't need to care about drain. David suggested disabling
>> pcplists usage completely during page isolation, instead of repeatedly draining
>> them.
>> 
>> To achieve this without adding special cases in alloc/free fastpath, we can use
>> the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
>> will be immediately flushed.
>> 
>> The race can thus be closed by setting pcp->high to 0 and draining pcplists
>> once in start_isolate_page_range(). The draining will serialize after processes
>> that already disabled interrupts and read the old value of pcp->high in
>> free_unref_page_commit(), and processes that have not yet disabled interrupts,
>> will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
>> This guarantees no stray pages on pcplists in zones where isolation happens.
>> 
>> We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
>> to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
>> and from 1 to 0 (to restore original pcp->high and batch values cached in
>> struct zone). We have to avoid external updates to high and batch by taking
>> pcp_batch_high_lock. To allow multiple isolations in parallel, change this
>> lock from mutex to rwsem.
>> 
>> For callers that pair start_isolate_page_range() with
>> undo_isolated_page_range() properly, this is transparent. Currently that's
>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>> and unlocking pcp_batch_high_lock.
> 
> I was hoping that it would be possible to have this completely hidden
> inside start_isolate_page_range code path.

I hoped so too, but we can't know the moment when all processes that were in the
critical part of freeing pages to pcplists have moved on (they might have been
rescheduled).
We could change free_unref_page() to disable IRQs sooner, before
free_unref_page_prepare(), or at least the get_pfnblock_migratetype() part. Then
after the single drain, we should be safe, AFAICS?
RT guys might not be happy though, but it's much simpler than this patch. I
still like some of the cleanups in 1-4 though tbh :)

> If we need some sort of
> disable_pcp_free/enable_pcp_free then it seems like a better fit to have
> an explicit API for that (the naming would be obviously different
> because we do not want to call out pcp free lists). I strongly suspect
> that only the memory hotplug really cares for this hard guarantee.
> alloc_contig_range simply goes with EBUSY.
>  
>> This commit also changes drain_all_pages() to not trust reading pcp->count during
>> drain for page isolation - I believe that could be racy and lead to missing some
>> cpu's to drain. If others agree, this can be separated and potentially backported.
>> 
>> [1] https://lore.kernel.org/linux-mm/20200903140032.380431-1-pasha.tatashin@soleen.com/
>> 
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Suggested-by: Michal Hocko <mhocko@suse.com>
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
>> ---
>>  include/linux/gfp.h |  1 +
>>  mm/internal.h       |  4 +++
>>  mm/memory_hotplug.c | 55 ++++++++++++++++++++++++++++-------------
>>  mm/page_alloc.c     | 60 +++++++++++++++++++++++++++++----------------
>>  mm/page_isolation.c | 45 ++++++++++++++++++++++++++++------
>>  5 files changed, 119 insertions(+), 46 deletions(-)
> 
> This has turned out much larger than I would expect.
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 11:36     ` Michal Hocko
@ 2020-09-09 12:53         ` Pavel Tatashin
  2020-09-09 11:55       ` Vlastimil Babka
  2020-09-09 12:53         ` Pavel Tatashin
  2 siblings, 0 replies; 32+ messages in thread
From: Pavel Tatashin @ 2020-09-09 12:53 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Vlastimil Babka, linux-mm, LKML, David Hildenbrand,
	Oscar Salvador, Joonsoo Kim

> because we do not want to call out pcp free lists). I strongly suspect
> that only the memory hotplug really cares for this hard guarantee.
> alloc_contig_range simply goes with EBUSY.
>

You are right, the strong requirement is for the hotplug case only, but if
PCP disable/enable functionality is properly implemented, and we are
still draining the lists in the alloc_contig_range case, I do not see a
reason not to disable/enable PCP during CMA as well, just to reduce the
number of spurious errors.

Thanks,
Pasha

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 11:44       ` David Hildenbrand
@ 2020-09-09 12:57         ` David Hildenbrand
  0 siblings, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-09 12:57 UTC (permalink / raw)
  To: Michal Hocko, Vlastimil Babka
  Cc: linux-mm, linux-kernel, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 09.09.20 13:44, David Hildenbrand wrote:
> On 09.09.20 13:36, Michal Hocko wrote:
>> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>>>
>>> ----8<----
>>> >From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
>>> From: Vlastimil Babka <vbabka@suse.cz>
>>> Date: Mon, 7 Sep 2020 17:20:39 +0200
>>> Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation
>>>
>>> Page isolation can race with process freeing pages to pcplists in a way that
>>> a page from isolated pageblock can end up on pcplist. This can be fixed by
>>> repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
>>> per-cpu pages again during memory offline" in [1].
>>>
>>> David and Michal would prefer that this race was closed in a way that callers
>>> of page isolation don't need to care about drain. David suggested disabling
>>> pcplists usage completely during page isolation, instead of repeatedly draining
>>> them.
>>>
>>> To achieve this without adding special cases in alloc/free fastpath, we can use
>>> the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
>>> will be immediately flushed.
>>>
>>> The race can thus be closed by setting pcp->high to 0 and draining pcplists
>>> once in start_isolate_page_range(). The draining will serialize after processes
>>> that already disabled interrupts and read the old value of pcp->high in
>>> free_unref_page_commit(), and processes that have not yet disabled interrupts,
>>> will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
>>> This guarantees no stray pages on pcplists in zones where isolation happens.
>>>
>>> We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
>>> to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
>>> and from 1 to 0 (to restore original pcp->high and batch values cached in
>>> struct zone). We have to avoid external updates to high and batch by taking
>>> pcp_batch_high_lock. To allow multiple isolations in parallel, change this
>>> lock from mutex to rwsem.
>>>
>>> For callers that pair start_isolate_page_range() with
>>> undo_isolated_page_range() properly, this is transparent. Currently that's
>>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>>> and unlocking pcp_batch_high_lock.
>>
>> I was hoping that it would be possible to have this completely hidden
>> inside start_isolate_page_range code path. If we need some sort of
>> disable_pcp_free/enable_pcp_free then it seems like a better fit to have
>> an explicit API for that (the naming would be obviously different
>> because we do not want to call out pcp free lists). I strongly suspect
>> that only the memory hotplug really cares for this hard guarantee.
>> alloc_contig_range simply goes with EBUSY.
> 
> There will be different alloc_contig_range() demands in the future: try
> fast (e.g., loads of small CMA allocations) vs. try hard (e.g.,
> virtio-mem). We can add ways to specify that.
> 

A reference to a related discussion regarding the "try fast" use case in
CMA for the future:
https://lkml.kernel.org/r/20200818164934.GF3852332@google.com

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 1/5] mm, page_alloc: clean up pageset high and batch update
  2020-09-07 16:36 ` [RFC 1/5] mm, page_alloc: clean up pageset high and batch update Vlastimil Babka
@ 2020-09-10  8:31   ` Oscar Salvador
  2020-09-10  8:34     ` Oscar Salvador
                       ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10  8:31 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Mon, Sep 07, 2020 at 06:36:24PM +0200, Vlastimil Babka wrote:
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

>  	for_each_possible_cpu(cpu)
> -		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
> +		setup_pageset(&per_cpu(boot_pageset, cpu));

This is not really anything important, but I realized we have like 7 functions
messing with pcp lists, and every time I try to follow them my head spins.

Since setup_pageset() is only being called here, could we replace it with
pageset_init() and pageset_update()?

(As I said, not important and probably a matter of taste. I just think that
having so many mini functions around is not always cool,
e.g: setup_zone_pageset->zone_pageset_init)
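
I.e. open-coding at the call site roughly what setup_pageset() does per the
diff above (untested, just to show what I mean):

	for_each_possible_cpu(cpu) {
		struct per_cpu_pageset *p = &per_cpu(boot_pageset, cpu);

		pageset_init(p);
		pageset_update(&p->pcp, 0, 1);
	}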

> -/*
> - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
> - * to the value high for the pageset p.
> - */
> -static void pageset_set_high(struct per_cpu_pageset *p,
> -				unsigned long high)
> -{
> -	unsigned long batch = max(1UL, high / 4);
> -	if ((high / 4) > (PAGE_SHIFT * 8))
> -		batch = PAGE_SHIFT * 8;
> -
> -	pageset_update(&p->pcp, high, batch);
> +	pageset_update(&p->pcp, 0, 1);
>  }

Could we restore the comment we had in pageset_set_high, and maybe
update it to match this new function? I think it would be useful.
>  
>  static void pageset_set_high_and_batch(struct zone *zone,
> -				       struct per_cpu_pageset *pcp)
> +				       struct per_cpu_pageset *p)
>  {
> -	if (percpu_pagelist_fraction)
> -		pageset_set_high(pcp,
> -			(zone_managed_pages(zone) /
> -				percpu_pagelist_fraction));
> -	else
> -		pageset_set_batch(pcp, zone_batchsize(zone));
> +	unsigned long new_high;
> +	unsigned long new_batch;
> +	int fraction = READ_ONCE(percpu_pagelist_fraction);

Why the READ_ONCE? In case there is a parallel update, so things do not get
messed up?

as I said, I'd appreciate a comment in pageset_set_high_and_batch to be
restored and updated, otherwise:

Reviewed-by: Oscar Salvador <osalvador@suse.de> 

Thanks

-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 1/5] mm, page_alloc: clean up pageset high and batch update
  2020-09-10  8:31   ` Oscar Salvador
@ 2020-09-10  8:34     ` Oscar Salvador
  2020-09-10 10:47     ` David Hildenbrand
  2020-09-17 16:05     ` Vlastimil Babka
  2 siblings, 0 replies; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10  8:34 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Thu, Sep 10, 2020 at 10:31:20AM +0200, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:24PM +0200, Vlastimil Babka wrote:
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> >  	for_each_possible_cpu(cpu)
> > -		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
> > +		setup_pageset(&per_cpu(boot_pageset, cpu));
> 
> This is not really anything important, but I realized we have like 7 functions
> messing with pcp lists, and every time I try to follow them my head spins.
> 
> Since setup_pageset() is only being called here, could we replace it with
> pageset_init() and pageset_update()?
> 
> (As I said, not important and probably a matter of taste. I just think that
> having so many mini functions around is not always cool,
> e.g: setup_zone_pageset->zone_pageset_init)

Sorry, I did not see that you just did that in Patch#3, bleh.


-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone
  2020-09-07 16:36 ` [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone Vlastimil Babka
@ 2020-09-10  9:00   ` Oscar Salvador
  2020-09-10 10:56   ` David Hildenbrand
  1 sibling, 0 replies; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10  9:00 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Mon, Sep 07, 2020 at 06:36:25PM +0200, Vlastimil Babka wrote:
> We currently call pageset_set_high_and_batch() for each possible cpu,
> which repeats the same calculations of high and batch values.
> 
> Instead call it once per zone, and it applies the calculated values
> to all per-cpu pagesets of the zone.
> 
> This also allows removing zone_pageset_init() and __zone_pcp_update() wrappers.
> 
> No functional change.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

I like this, it simplifies things.

Reviewed-by: Oscar Salvador <osalvador@suse.de>

-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] mm, page_alloc(): remove setup_pageset()
  2020-09-07 16:36 ` [RFC 3/5] mm, page_alloc(): remove setup_pageset() Vlastimil Babka
@ 2020-09-10  9:23   ` Oscar Salvador
  2020-09-10  9:57     ` Oscar Salvador
  2020-09-18  9:37     ` Vlastimil Babka
  2020-09-10 11:00   ` David Hildenbrand
  1 sibling, 2 replies; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10  9:23 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Mon, Sep 07, 2020 at 06:36:26PM +0200, Vlastimil Babka wrote:
> We initialize boot-time pagesets with setup_pageset(), which sets high and
> batch values that effectively disable pcplists.
> 
> We can remove this wrapper if we just set these values for all pagesets in
> pageset_init(). Non-boot pagesets then subsequently update them to specific
> values.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Reviewed-by: Oscar Salvador <osalvador@suse.de>

Just one question below:

> -static void setup_pageset(struct per_cpu_pageset *p)
> -{
> -	pageset_init(p);
> -	pageset_update(&p->pcp, 0, 1);
> +	/*
> +	 * Set batch and high values safe for a boot pageset. Proper pageset's
> +	 * initialization will update them.
> +	 */
> +	pcp->high = 0;
> +	pcp->batch  = 1;

pageset_update was manipulating these values with barriers in between.
I guess we do not care here because we are not really updating but
initializing them, right?

-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] mm, page_alloc(): remove setup_pageset()
  2020-09-10  9:23   ` Oscar Salvador
@ 2020-09-10  9:57     ` Oscar Salvador
  2020-09-18  9:37     ` Vlastimil Babka
  1 sibling, 0 replies; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10  9:57 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Thu, Sep 10, 2020 at 11:23:07AM +0200, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:26PM +0200, Vlastimil Babka wrote:
> > We initialize boot-time pagesets with setup_pageset(), which sets high and
> > batch values that effectively disable pcplists.
> > 
> > We can remove this wrapper if we just set these values for all pagesets in
> > pageset_init(). Non-boot pagesets then subsequently update them to specific
> > values.
> > 
> > Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Reviewed-by: Oscar Salvador <osalvador@suse.de>

You would need to squash the removal of setup_pageset()'s declaration into
this patch, and then drop it from patch #4.

-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-09 11:55       ` Vlastimil Babka
@ 2020-09-10 10:29         ` David Hildenbrand
  2020-09-10 11:05           ` Vlastimil Babka
  0 siblings, 1 reply; 32+ messages in thread
From: David Hildenbrand @ 2020-09-10 10:29 UTC (permalink / raw)
  To: Vlastimil Babka, Michal Hocko
  Cc: linux-mm, linux-kernel, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 09.09.20 13:55, Vlastimil Babka wrote:
> On 9/9/20 1:36 PM, Michal Hocko wrote:
>> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>>>
>>> ----8<----
>>> >From 8febc17272b8e8b378e2e5ea5e76b2616f029c5b Mon Sep 17 00:00:00 2001
>>> From: Vlastimil Babka <vbabka@suse.cz>
>>> Date: Mon, 7 Sep 2020 17:20:39 +0200
>>> Subject: [PATCH] mm, page_alloc: disable pcplists during page isolation
>>>
>>> Page isolation can race with process freeing pages to pcplists in a way that
>>> a page from isolated pageblock can end up on pcplist. This can be fixed by
>>> repeated draining of pcplists, as done by patch "mm/memory_hotplug: drain
>>> per-cpu pages again during memory offline" in [1].
>>>
>>> David and Michal would prefer that this race was closed in a way that callers
>>> of page isolation don't need to care about drain. David suggested disabling
>>> pcplists usage completely during page isolation, instead of repeatedly draining
>>> them.
>>>
>>> To achieve this without adding special cases in alloc/free fastpath, we can use
>>> the same 'trick' as boot pagesets - when pcp->high is 0, any pcplist addition
>>> will be immediately flushed.
>>>
>>> The race can thus be closed by setting pcp->high to 0 and draining pcplists
>>> once in start_isolate_page_range(). The draining will serialize after processes
>>> that already disabled interrupts and read the old value of pcp->high in
>>> free_unref_page_commit(), and processes that have not yet disabled interrupts,
>>> will observe pcp->high == 0 when they are rescheduled, and skip pcplists.
>>> This guarantees no stray pages on pcplists in zones where isolation happens.
>>>
>>> We can use the variable zone->nr_isolate_pageblock (protected by zone->lock)
>>> to detect transitions from 0 to 1 (to change pcp->high to 0 and issue drain)
>>> and from 1 to 0 (to restore original pcp->high and batch values cached in
>>> struct zone). We have to avoid external updates to high and batch by taking
>>> pcp_batch_high_lock. To allow multiple isolations in parallel, change this
>>> lock from mutex to rwsem.
>>>
>>> For callers that pair start_isolate_page_range() with
>>> undo_isolated_page_range() properly, this is transparent. Currently that's
>>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>>> and unlocking pcp_batch_high_lock.
>>
>> I was hoping that it would be possible to have this completely hidden
>> inside start_isolate_page_range code path.
> 
> I hoped so too, but we can't know the moment when all processes that were in the
> critical part of freeing pages to pcplists have moved on (they might have been
> rescheduled).
> We could change free_unref_page() to disable IRQs sooner, before
> free_unref_page_prepare(), or at least the get_pfnblock_migratetype() part. Then
> after the single drain, we should be safe, AFAICS?

At least moving it before getting the migratetype should not be that severe?

> RT guys might not be happy though, but it's much simpler than this patch. I
> still like some of the cleanups in 1-4 though tbh :)

It would certainly make this patch much simpler. Do you have a prototype
lying around?

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 1/5] mm, page_alloc: clean up pageset high and batch update
  2020-09-10  8:31   ` Oscar Salvador
  2020-09-10  8:34     ` Oscar Salvador
@ 2020-09-10 10:47     ` David Hildenbrand
  2020-09-17 16:05     ` Vlastimil Babka
  2 siblings, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-10 10:47 UTC (permalink / raw)
  To: Oscar Salvador, Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin, Joonsoo Kim

On 10.09.20 10:31, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:24PM +0200, Vlastimil Babka wrote:
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
>>  	for_each_possible_cpu(cpu)
>> -		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
>> +		setup_pageset(&per_cpu(boot_pageset, cpu));
> 
> This is not really anything important, but I realized we have like 7 functions
> messing with pcp lists, and every time I try to follow them my head spins.
> 
> Since setup_pageset() is only being called here, could we replace it with
> pageset_init() and pageset_update()?

Had the same thought, so +1.

>> -/*
>> - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
>> - * to the value high for the pageset p.
>> - */
>> -static void pageset_set_high(struct per_cpu_pageset *p,
>> -				unsigned long high)
>> -{
>> -	unsigned long batch = max(1UL, high / 4);
>> -	if ((high / 4) > (PAGE_SHIFT * 8))
>> -		batch = PAGE_SHIFT * 8;
>> -
>> -	pageset_update(&p->pcp, high, batch);
>> +	pageset_update(&p->pcp, 0, 1);
>>  }
> 
> Could we restore the comment we had in pageset_set_high, and maybe
> update it to match this new function? I think it would be useful.

At least I didn't really understand what "pageset_set_high() sets the
high water mark for hot per_cpu_pagelist to the value high for the
pageset p." was trying to tell me.

I think the only valuable information is the "hot", meaning it is in use
and we have to be careful when updating, right?

>>  
>>  static void pageset_set_high_and_batch(struct zone *zone,
>> -				       struct per_cpu_pageset *pcp)
>> +				       struct per_cpu_pageset *p)
>>  {
>> -	if (percpu_pagelist_fraction)
>> -		pageset_set_high(pcp,
>> -			(zone_managed_pages(zone) /
>> -				percpu_pagelist_fraction));
>> -	else
>> -		pageset_set_batch(pcp, zone_batchsize(zone));
>> +	unsigned long new_high;
>> +	unsigned long new_batch;
>> +	int fraction = READ_ONCE(percpu_pagelist_fraction);
> 
> Why the READ_ONCE? In case there is a parallel update, so things do not get
> messed up?

Agreed, this is an actual change in the code. If this is a fix, separate
patch?

Apart from that, looks much better to me!

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone
  2020-09-07 16:36 ` [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone Vlastimil Babka
  2020-09-10  9:00   ` Oscar Salvador
@ 2020-09-10 10:56   ` David Hildenbrand
  1 sibling, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-10 10:56 UTC (permalink / raw)
  To: Vlastimil Babka, linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 07.09.20 18:36, Vlastimil Babka wrote:
> We currently call pageset_set_high_and_batch() for each possible cpu,
> which repeats the same calculations of high and batch values.
> 
> Instead call it once per zone, and it applies the calculated values
> to all per-cpu pagesets of the zone.
> 
> This also allows removing zone_pageset_init() and __zone_pcp_update() wrappers.
> 
> No functional change.
> 
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/page_alloc.c | 40 +++++++++++++++++-----------------------
>  1 file changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 0b516208afda..f669a251f654 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6236,12 +6236,13 @@ static void setup_pageset(struct per_cpu_pageset *p)
>  	pageset_update(&p->pcp, 0, 1);
>  }
>  
> -static void pageset_set_high_and_batch(struct zone *zone,
> -				       struct per_cpu_pageset *p)
> +static void zone_set_pageset_high_and_batch(struct zone *zone)
>  {
>  	unsigned long new_high;
>  	unsigned long new_batch;
>  	int fraction = READ_ONCE(percpu_pagelist_fraction);
> +	int cpu;
> +	struct per_cpu_pageset *p;

I would use reverse Christmas tree layout. (same applies to patch #1,
you could put both unsigned long variables into a single line)
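
I.e. something like this, longest line first:

	int fraction = READ_ONCE(percpu_pagelist_fraction);
	unsigned long new_high, new_batch;
	struct per_cpu_pageset *p;
	int cpu;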

[...]

LGTM

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] mm, page_alloc(): remove setup_pageset()
  2020-09-07 16:36 ` [RFC 3/5] mm, page_alloc(): remove setup_pageset() Vlastimil Babka
  2020-09-10  9:23   ` Oscar Salvador
@ 2020-09-10 11:00   ` David Hildenbrand
  1 sibling, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-10 11:00 UTC (permalink / raw)
  To: Vlastimil Babka, linux-mm
  Cc: linux-kernel, Michal Hocko, Pavel Tatashin, Oscar Salvador, Joonsoo Kim

On 07.09.20 18:36, Vlastimil Babka wrote:
> We initialize boot-time pagesets with setup_pageset(), which sets high and
> batch values that effectively disable pcplists.
> 
> We can remove this wrapper if we just set these values for all pagesets in
> pageset_init(). Non-boot pagesets then subsequently update them to specific
> values.
> 

subject: s/page_alloc()/page_alloc/ to make it look consistent

> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  mm/page_alloc.c | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f669a251f654..a0cab2c6055e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5902,7 +5902,7 @@ build_all_zonelists_init(void)
>  	 * (a chicken-egg dilemma).
>  	 */
>  	for_each_possible_cpu(cpu)
> -		setup_pageset(&per_cpu(boot_pageset, cpu));
> +		pageset_init(&per_cpu(boot_pageset, cpu));
>  
>  	mminit_verify_zonelist();
>  	cpuset_init_current_mems_allowed();
> @@ -6228,12 +6228,13 @@ static void pageset_init(struct per_cpu_pageset *p)
>  	pcp = &p->pcp;
>  	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
>  		INIT_LIST_HEAD(&pcp->lists[migratetype]);
> -}
>  
> -static void setup_pageset(struct per_cpu_pageset *p)
> -{
> -	pageset_init(p);
> -	pageset_update(&p->pcp, 0, 1);
> +	/*
> +	 * Set batch and high values safe for a boot pageset. Proper pageset's
> +	 * initialization will update them.
> +	 */
> +	pcp->high = 0;
> +	pcp->batch  = 1;
>  }
>  
>  static void zone_set_pageset_high_and_batch(struct zone *zone)
> 

Had the same thought about barriers being useless when the pageset is
not hot.

LGTM

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-10 10:29         ` David Hildenbrand
@ 2020-09-10 11:05           ` Vlastimil Babka
  2020-09-10 12:42             ` David Hildenbrand
  0 siblings, 1 reply; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-10 11:05 UTC (permalink / raw)
  To: David Hildenbrand, Michal Hocko
  Cc: linux-mm, linux-kernel, Pavel Tatashin, Oscar Salvador,
	Joonsoo Kim, Mel Gorman

On 9/10/20 12:29 PM, David Hildenbrand wrote:
> On 09.09.20 13:55, Vlastimil Babka wrote:
>> On 9/9/20 1:36 PM, Michal Hocko wrote:
>>> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>>>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>>>> For callers that pair start_isolate_page_range() with
>>>> undo_isolated_page_range() properly, this is transparent. Currently that's
>>>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>>>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>>>> and unlocking pcp_batch_high_lock.
>>>
>>> I was hoping that it would be possible to have this completely hidden
>>> inside start_isolate_page_range code path.
>> 
>> I hoped so too, but we can't know the moment when all processes that were in the
>> critical part of freeing pages to pcplists have moved on (they might have been
>> rescheduled).
>> We could change free_unref_page() to disable IRQs sooner, before
>> free_unref_page_prepare(), or at least the get_pfnblock_migratetype() part. Then
>> after the single drain, we should be safe, AFAICS?
> 
> At least moving it before getting the migratetype should not be that severe?
> 
>> RT guys might not be happy though, but it's much simpler than this patch. I
>> still like some of the cleanups in 1-4 though tbh :)
> 
> It would certainly make this patch much simpler. Do you have a prototype
> lying around?

Below is the critical part, while writing it I realized that there's also
free_unref_page_list() where it's more ugly.

So free_unref_page() simply moves the "determine migratetype" part under
disabled interrupts. For free_unref_page_list() we optimistically determine them
without disabling interrupts, and with disabled interrupts we check if zone has
isolated pageblocks and thus we should not trust that value anymore. That's the
same pattern as free_pcppages_bulk() uses.

But unfortunately it means an extra page_zone() plus a test for each page on the
list :/

----8<----
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index defefed79cfb..57e2a341c95c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3103,18 +3103,21 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page)
 {
-	int migratetype;
-
 	if (!free_pcp_prepare(page))
 		return false;
 
-	migratetype = get_pfnblock_migratetype(page, pfn);
-	set_pcppage_migratetype(page, migratetype);
 	return true;
 }
 
+static inline void free_unref_page_set_migratetype(struct page *page, unsigned long pfn)
+{
+	int migratetype	= get_pfnblock_migratetype(page, pfn);
+
+	set_pcppage_migratetype(page, migratetype);
+}
+
 static void free_unref_page_commit(struct page *page, unsigned long pfn)
 {
 	struct zone *zone = page_zone(page);
@@ -3156,10 +3159,17 @@ void free_unref_page(struct page *page)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 
-	if (!free_unref_page_prepare(page, pfn))
+	if (!free_unref_page_prepare(page))
 		return;
 
+	/*
+	 * by disabling interrupts before reading pageblock's migratetype,
+	 * we can guarantee that after changing a pageblock to MIGRATE_ISOLATE
+	 * and drain_all_pages(), there's nobody who would read the old
+	 * migratetype and put a page from isoalted pageblock to pcplists
+	 */
 	local_irq_save(flags);
+	free_unref_page_set_migratetype(page, pfn);
 	free_unref_page_commit(page, pfn);
 	local_irq_restore(flags);
 }
@@ -3176,9 +3186,10 @@ void free_unref_page_list(struct list_head *list)
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		pfn = page_to_pfn(page);
-		if (!free_unref_page_prepare(page, pfn))
+		if (!free_unref_page_prepare(page))
 			list_del(&page->lru);
 		set_page_private(page, pfn);
+		free_unref_page_set_migratetype(page, pfn);
 	}
 
 	local_irq_save(flags);
@@ -3187,6 +3198,8 @@ void free_unref_page_list(struct list_head *list)
 
 		set_page_private(page, 0);
 		trace_mm_page_free_batched(page);
+		if (has_isolate_pageblock(page_zone(page)))
+			free_unref_page_set_migratetype(page, pfn);
 		free_unref_page_commit(page, pfn);
 
 		/*



^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone
  2020-09-07 16:36 ` [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone Vlastimil Babka
@ 2020-09-10 11:30   ` Oscar Salvador
  2020-09-18 12:02     ` Vlastimil Babka
  0 siblings, 1 reply; 32+ messages in thread
From: Oscar Salvador @ 2020-09-10 11:30 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On Mon, Sep 07, 2020 at 06:36:27PM +0200, Vlastimil Babka wrote:
   */
> -static void setup_pageset(struct per_cpu_pageset *p);
> +static void pageset_init(struct per_cpu_pageset *p);

this belongs to the respective patches

> -static void zone_set_pageset_high_and_batch(struct zone *zone)
> +static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update)
>  {
>  	unsigned long new_high;
>  	unsigned long new_batch;
> @@ -6256,6 +6256,14 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
>  		new_batch = max(1UL, 1 * new_batch);
>  	}
>  
> +	if (zone->pageset_high != new_high ||
> +	    zone->pageset_batch != new_batch) {
> +		zone->pageset_high = new_high;
> +		zone->pageset_batch = new_batch;
> +	} else if (!force_update) {
> +		return;
> +	}

I am probably missing something obvious, so sorry, but why do we need
force_update here?
AFAICS, we only want to call pageset_update() in case zone->pageset_high/batch
and the newly computed high/batch differ, so if everything is equal, why do we
want to call it anyway?

-- 
Oscar Salvador
SUSE L3

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 5/5] mm, page_alloc: disable pcplists during page isolation
  2020-09-10 11:05           ` Vlastimil Babka
@ 2020-09-10 12:42             ` David Hildenbrand
  0 siblings, 0 replies; 32+ messages in thread
From: David Hildenbrand @ 2020-09-10 12:42 UTC (permalink / raw)
  To: Vlastimil Babka, Michal Hocko
  Cc: linux-mm, linux-kernel, Pavel Tatashin, Oscar Salvador,
	Joonsoo Kim, Mel Gorman

On 10.09.20 13:05, Vlastimil Babka wrote:
> On 9/10/20 12:29 PM, David Hildenbrand wrote:
>> On 09.09.20 13:55, Vlastimil Babka wrote:
>>> On 9/9/20 1:36 PM, Michal Hocko wrote:
>>>> On Wed 09-09-20 12:48:54, Vlastimil Babka wrote:
>>>>> Here's a version that will apply on top of next-20200908. The first 4 patches need no change.
>>>>> For callers that pair start_isolate_page_range() with
>>>>> undo_isolated_page_range() properly, this is transparent. Currently that's
>>>>> alloc_contig_range(). __offline_pages() doesn't call undo_isolated_page_range()
>>>>> in the success case, so it has to be careful to handle restoring pcp->high and batch
>>>>> and unlocking pcp_batch_high_lock.
>>>>
>>>> I was hoping that it would be possible to have this completely hidden
>>>> inside start_isolate_page_range code path.
>>>
>>> I hoped so too, but we can't know the moment when all processes that were in the
>>> critical part of freeing pages to pcplists have moved on (they might have been
>>> rescheduled).
>>> We could change free_unref_page() to disable IRQs sooner, before
>>> free_unref_page_prepare(), or at least the get_pfnblock_migratetype() part. Then
>>> after the single drain, we should be safe, AFAICS?
>>
>> At least moving it before getting the migratetype should not be that severe?
>>
>>> RT guys might not be happy though, but it's much simpler than this patch. I
>>> still like some of the cleanups in 1-4 though tbh :)
>>
>> It would certainly make this patch much simpler. Do you have a prototype
>> lying around?
> 
> Below is the critical part, while writing it I realized that there's also
> free_unref_page_list() where it's more ugly.
> 
> So free_unref_page() simply moves the "determine migratetype" part under
> disabled interrupts. For free_unref_page_list() we optimistically determine them
> without disabling interrupts, and with disabled interrupts we check if zone has
> isolated pageblocks and thus we should not trust that value anymore. That's the
> same pattern as free_pcppages_bulk() uses.
> 
> But unfortunately it means an extra page_zone() plus a test for each page on the
> list :/

That function is mostly called from mm/vmscan.c AFAICS.

The first thing free_unref_page_commit() does is page_zone().
That's easy to optimize if needed.

> 
> ----8<----
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index defefed79cfb..57e2a341c95c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3103,18 +3103,21 @@ void mark_free_pages(struct zone *zone)
>  }
>  #endif /* CONFIG_PM */
>  
> -static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
> +static bool free_unref_page_prepare(struct page *page)
>  {
> -	int migratetype;
> -
>  	if (!free_pcp_prepare(page))
>  		return false;
>  
> -	migratetype = get_pfnblock_migratetype(page, pfn);
> -	set_pcppage_migratetype(page, migratetype);
>  	return true;
>  }
>  
> +static inline void free_unref_page_set_migratetype(struct page *page, unsigned long pfn)
> +{
> +	int migratetype	= get_pfnblock_migratetype(page, pfn);
> +
> +	set_pcppage_migratetype(page, migratetype);
> +}
> +
>  static void free_unref_page_commit(struct page *page, unsigned long pfn)
>  {
>  	struct zone *zone = page_zone(page);
> @@ -3156,10 +3159,17 @@ void free_unref_page(struct page *page)
>  	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(page);
>  
> -	if (!free_unref_page_prepare(page, pfn))
> +	if (!free_unref_page_prepare(page))
>  		return;
>  
> +	/*
> +	 * by disabling interrupts before reading pageblock's migratetype,
> +	 * we can guarantee that after changing a pageblock to MIGRATE_ISOLATE
> +	 * and drain_all_pages(), there's nobody who would read the old
> +	 * migratetype and put a page from isoalted pageblock to pcplists
> +	 */
>  	local_irq_save(flags);
> +	free_unref_page_set_migratetype(page, pfn);
>  	free_unref_page_commit(page, pfn);
>  	local_irq_restore(flags);
>  }
> @@ -3176,9 +3186,10 @@ void free_unref_page_list(struct list_head *list)
>  	/* Prepare pages for freeing */
>  	list_for_each_entry_safe(page, next, list, lru) {
>  		pfn = page_to_pfn(page);
> -		if (!free_unref_page_prepare(page, pfn))
> +		if (!free_unref_page_prepare(page))
>  			list_del(&page->lru);
>  		set_page_private(page, pfn);
> +		free_unref_page_set_migratetype(page, pfn);
>  	}
>  
>  	local_irq_save(flags);
> @@ -3187,6 +3198,8 @@ void free_unref_page_list(struct list_head *list)
>  
>  		set_page_private(page, 0);
>  		trace_mm_page_free_batched(page);
> +		if (has_isolate_pageblock(page_zone(page)))
> +			free_unref_page_set_migratetype(page, pfn);
>  		free_unref_page_commit(page, pfn);
>  

FWIW: you can safely add unlikely() around has_isolate_pageblock(), as is done
for most other users of that function that I was able to spot.
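
I.e. on top of the hunk above:

		if (unlikely(has_isolate_pageblock(page_zone(page))))
			free_unref_page_set_migratetype(page, pfn);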

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 1/5] mm, page_alloc: clean up pageset high and batch update
  2020-09-10  8:31   ` Oscar Salvador
  2020-09-10  8:34     ` Oscar Salvador
  2020-09-10 10:47     ` David Hildenbrand
@ 2020-09-17 16:05     ` Vlastimil Babka
  2 siblings, 0 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-17 16:05 UTC (permalink / raw)
  To: Oscar Salvador
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On 9/10/20 10:31 AM, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:24PM +0200, Vlastimil Babka wrote:
> 
>> -/*
>> - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
>> - * to the value high for the pageset p.
>> - */
>> -static void pageset_set_high(struct per_cpu_pageset *p,
>> -				unsigned long high)
>> -{
>> -	unsigned long batch = max(1UL, high / 4);
>> -	if ((high / 4) > (PAGE_SHIFT * 8))
>> -		batch = PAGE_SHIFT * 8;
>> -
>> -	pageset_update(&p->pcp, high, batch);
>> +	pageset_update(&p->pcp, 0, 1);
>>  }
> 
> Could we restore the comment we had in pageset_set_high, and maybe
> update it to match this new function? I think it would be useful.

Same as David, I didn't find the comment useful at all. But I can try writing a
better one instead.

>>  
>>  static void pageset_set_high_and_batch(struct zone *zone,
>> -				       struct per_cpu_pageset *pcp)
>> +				       struct per_cpu_pageset *p)
>>  {
>> -	if (percpu_pagelist_fraction)
>> -		pageset_set_high(pcp,
>> -			(zone_managed_pages(zone) /
>> -				percpu_pagelist_fraction));
>> -	else
>> -		pageset_set_batch(pcp, zone_batchsize(zone));
>> +	unsigned long new_high;
>> +	unsigned long new_batch;
>> +	int fraction = READ_ONCE(percpu_pagelist_fraction);
> 
> Why the READ_ONCE? In case there is a parallel update, so things do not get
> messed up?

Yeah, I think onlining of a new zone can race with a sysctl update to
percpu_pagelist_fraction, because onlining doesn't take pcp_batch_high_lock...
until patch 5. But you're right, this should be separate.
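
In other words, READ_ONCE() is just there to take one snapshot of the sysctl,
so that high and batch are derived from the same value - roughly (per the
patch, details aside):

	int fraction = READ_ONCE(percpu_pagelist_fraction);

	if (fraction) {
		/* both values computed from the same snapshot */
		new_high = zone_managed_pages(zone) / fraction;
		new_batch = max(1UL, new_high / 4);
	}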

> as I said, I'd appreciate a comment in pageset_set_high_and_batch to be
> restored and updated, otherwise:
> 
> Reviewed-by: Oscar Salvador <osalvador@suse.de> 

Thanks!

> Thanks
> 


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 3/5] mm, page_alloc(): remove setup_pageset()
  2020-09-10  9:23   ` Oscar Salvador
  2020-09-10  9:57     ` Oscar Salvador
@ 2020-09-18  9:37     ` Vlastimil Babka
  1 sibling, 0 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-18  9:37 UTC (permalink / raw)
  To: Oscar Salvador
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On 9/10/20 11:23 AM, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:26PM +0200, Vlastimil Babka wrote:
>> We initialize boot-time pagesets with setup_pageset(), which sets high and
>> batch values that effectively disable pcplists.
>> 
>> We can remove this wrapper if we just set these values for all pagesets in
>> pageset_init(). Non-boot pagesets then subsequently update them to specific
>> values.
>> 
>> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Reviewed-by: Oscar Salvador <osalvador@suse.de>

Thanks!

> Just one question below:
> 
>> -static void setup_pageset(struct per_cpu_pageset *p)
>> -{
>> -	pageset_init(p);
>> -	pageset_update(&p->pcp, 0, 1);
>> +	/*
>> +	 * Set batch and high values safe for a boot pageset. Proper pageset's
>> +	 * initialization will update them.
>> +	 */
>> +	pcp->high = 0;
>> +	pcp->batch  = 1;
> 
> pageset_update was manipulating these values with barriers in between.
> I guess we do not care here because we are not really updating but
> initializing them, right?

Sure. We just initialized all the list heads, so there can be no concurrent
access at this point. But I'll mention it in the comment.
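
For context, pageset_update() currently orders the stores so that a concurrent
free never sees a dangerous high/batch combination - roughly (paraphrasing the
existing code, details may differ):

	/* start with a fail-safe batch value */
	pcp->batch = 1;
	smp_wmb();

	/* update high first, then the real batch */
	pcp->high = high;
	smp_wmb();

	pcp->batch = batch;

With no concurrent access during init, plain stores are enough.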



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone
  2020-09-10 11:30   ` Oscar Salvador
@ 2020-09-18 12:02     ` Vlastimil Babka
  0 siblings, 0 replies; 32+ messages in thread
From: Vlastimil Babka @ 2020-09-18 12:02 UTC (permalink / raw)
  To: Oscar Salvador
  Cc: linux-mm, linux-kernel, Michal Hocko, Pavel Tatashin,
	David Hildenbrand, Joonsoo Kim

On 9/10/20 1:30 PM, Oscar Salvador wrote:
> On Mon, Sep 07, 2020 at 06:36:27PM +0200, Vlastimil Babka wrote:
>    */
>> -static void setup_pageset(struct per_cpu_pageset *p);
>> +static void pageset_init(struct per_cpu_pageset *p);
> 
> this belongs to the respective patches

Right, thanks.

>> -static void zone_set_pageset_high_and_batch(struct zone *zone)
>> +static void zone_set_pageset_high_and_batch(struct zone *zone, bool force_update)
>>  {
>>  	unsigned long new_high;
>>  	unsigned long new_batch;
>> @@ -6256,6 +6256,14 @@ static void zone_set_pageset_high_and_batch(struct zone *zone)
>>  		new_batch = max(1UL, 1 * new_batch);
>>  	}
>>  
>> +	if (zone->pageset_high != new_high ||
>> +	    zone->pageset_batch != new_batch) {
>> +		zone->pageset_high = new_high;
>> +		zone->pageset_batch = new_batch;
>> +	} else if (!force_update) {
>> +		return;
>> +	}
> 
> I am probably missing something obvious, so sorry, but why do we need
> force_update here?
> AFAICS, we only want to call pageset_update() in case zone->pageset_high/batch
> and the newly computed high/batch differ, so if everything is equal, why do we
> want to call it anyway?

My reasoning is that initially we have no guarantee that
zone->pageset_high/batch matches the respective pcp->high/batch. So we could
detect no change in the zone values and return, but leave the pcp values
incoherent. But now I think it could also be achieved in a simpler way, so I'll try.



^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2020-09-18 12:02 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-07 16:36 [RFC 0/5] disable pcplists during page isolation Vlastimil Babka
2020-09-07 16:36 ` [RFC 1/5] mm, page_alloc: clean up pageset high and batch update Vlastimil Babka
2020-09-10  8:31   ` Oscar Salvador
2020-09-10  8:34     ` Oscar Salvador
2020-09-10 10:47     ` David Hildenbrand
2020-09-17 16:05     ` Vlastimil Babka
2020-09-07 16:36 ` [RFC 2/5] mm, page_alloc: calculate pageset high and batch once per zone Vlastimil Babka
2020-09-10  9:00   ` Oscar Salvador
2020-09-10 10:56   ` David Hildenbrand
2020-09-07 16:36 ` [RFC 3/5] mm, page_alloc(): remove setup_pageset() Vlastimil Babka
2020-09-10  9:23   ` Oscar Salvador
2020-09-10  9:57     ` Oscar Salvador
2020-09-18  9:37     ` Vlastimil Babka
2020-09-10 11:00   ` David Hildenbrand
2020-09-07 16:36 ` [RFC 4/5] mm, page_alloc: cache pageset high and batch in struct zone Vlastimil Babka
2020-09-10 11:30   ` Oscar Salvador
2020-09-18 12:02     ` Vlastimil Babka
2020-09-07 16:36 ` [RFC 5/5] mm, page_alloc: disable pcplists during page isolation Vlastimil Babka
2020-09-09 10:48   ` Vlastimil Babka
2020-09-09 11:36     ` Michal Hocko
2020-09-09 11:44       ` David Hildenbrand
2020-09-09 12:57         ` David Hildenbrand
2020-09-09 11:55       ` Vlastimil Babka
2020-09-10 10:29         ` David Hildenbrand
2020-09-10 11:05           ` Vlastimil Babka
2020-09-10 12:42             ` David Hildenbrand
2020-09-09 12:53       ` Pavel Tatashin
2020-09-08 18:29 ` [RFC 0/5] " David Hildenbrand
2020-09-09 10:54   ` Vlastimil Babka
2020-09-09 11:27     ` osalvador
2020-09-09 11:29       ` David Hildenbrand
