linux-kernel.vger.kernel.org archive mirror
* [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4
@ 2017-01-17  9:29 Mel Gorman
  2017-01-17  9:29 ` [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue Mel Gorman
                   ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-17  9:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

For Vlastimil: this version passed a few tests with full debugging enabled
without triggering the additional !in_interrupt() checks. The biggest change
is patch 3, which avoids draining the per-cpu lists from IPI context.

Changelog since v3
o Debugging check in allocation path
o Make it harder to use the free path incorrectly
o Use preempt-safe stats counter
o Do not use IPIs to drain the per-cpu allocator

Changelog since v2
o Add ack's and benchmark data
o Rebase to 4.10-rc3

Changelog since v1
o Remove a scheduler point from the allocation path
o Finalise the bulk allocator and test it

This series is motivated by a conversation led by Jesper Dangaard Brouer at
the last LSF/MM proposing a generic page pool for DMA-coherent pages. Part
of his motivation was the overhead of allocating multiple order-0 pages,
which led some drivers to use high-order allocations and split them. This
is very slow in some cases.

The first two patches in this series restructure the page allocator such
that it is relatively easy to introduce an order-0 bulk page allocator.
A patch exists to do that and has been handed over to Jesper until an
in-kernel user is created. The third patch prevents the per-cpu allocator
being drained from IPI context as that could corrupt the list once
patch four is merged. The final patch alters the per-cpu allocator
to make it exclusive to !irq requests. This cuts allocation/free overhead
by roughly 30%.
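
As an illustration of the interface this prepares for (a sketch only; not
part of this series, and the name and signature are hypothetical), a bulk
order-0 allocator would be along these lines:

  /* Returns the number of pages placed on @page_list */
  unsigned long alloc_pages_bulk(gfp_t gfp_mask, unsigned long nr_pages,
                                 struct list_head *page_list);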

Performance tests from both Jesper and me are included in the patch changelogs.

 mm/page_alloc.c | 284 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 181 insertions(+), 103 deletions(-)

-- 
2.11.0


* [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue
  2017-01-17  9:29 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4 Mel Gorman
@ 2017-01-17  9:29 ` Mel Gorman
  2017-01-17 18:07   ` Jesper Dangaard Brouer
  2017-01-17  9:29 ` [PATCH 2/4] mm, page_alloc: Split alloc_pages_nodemask Mel Gorman
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-17  9:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

buffered_rmqueue removes a page from a given zone and uses the per-cpu
list for order-0 allocations. This is fine, but a hypothetical caller that
wanted multiple order-0 pages would have to disable/re-enable interrupts
multiple times. This patch structures buffered_rmqueue such that it's
relatively easy to build a bulk order-0 page allocator. There is no
functional change.
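
To illustrate why the split matters, here is a rough sketch of how a
hypothetical bulk allocator could use the new helper while disabling IRQs
only once. This is illustrative only and not part of the patch; the
function name is made up and stats accounting is omitted:

  static int rmqueue_pcplist_bulk(struct zone *zone, int migratetype,
              bool cold, int count, struct list_head *alloc_list)
  {
      struct per_cpu_pages *pcp;
      struct list_head *list;
      unsigned long flags;
      int allocated = 0;

      /* One IRQ-disabled section for the whole batch */
      local_irq_save(flags);
      pcp = &this_cpu_ptr(zone->pageset)->pcp;
      list = &pcp->lists[migratetype];
      while (allocated < count) {
          struct page *page = __rmqueue_pcplist(zone, migratetype,
                                                cold, pcp, list);
          if (!page)
              break;
          list_add(&page->lru, alloc_list);
          allocated++;
      }
      local_irq_restore(flags);
      return allocated;
  }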

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
---
 mm/page_alloc.c | 126 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 78 insertions(+), 48 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d604d2596b7b..0e8404e546f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2602,73 +2602,103 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #endif
 }
 
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+			bool cold, struct per_cpu_pages *pcp,
+			struct list_head *list)
+{
+	struct page *page;
+
+	do {
+		if (list_empty(list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, list,
+					migratetype, cold);
+			if (unlikely(list_empty(list)))
+				return NULL;
+		}
+
+		if (cold)
+			page = list_last_entry(list, struct page, lru);
+		else
+			page = list_first_entry(list, struct page, lru);
+
+		list_del(&page->lru);
+		pcp->count--;
+	} while (check_new_pcp(page));
+
+	return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+			struct zone *zone, unsigned int order,
+			gfp_t gfp_flags, int migratetype)
+{
+	struct per_cpu_pages *pcp;
+	struct list_head *list;
+	bool cold = ((gfp_flags & __GFP_COLD) != 0);
+	struct page *page;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	list = &pcp->lists[migratetype];
+	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
+	if (page) {
+		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+		zone_statistics(preferred_zone, zone, gfp_flags);
+	}
+	local_irq_restore(flags);
+	return page;
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
 static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
 			gfp_t gfp_flags, unsigned int alloc_flags,
 			int migratetype)
 {
 	unsigned long flags;
 	struct page *page;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 
 	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
-		struct list_head *list;
-
-		local_irq_save(flags);
-		do {
-			pcp = &this_cpu_ptr(zone->pageset)->pcp;
-			list = &pcp->lists[migratetype];
-			if (list_empty(list)) {
-				pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, list,
-						migratetype, cold);
-				if (unlikely(list_empty(list)))
-					goto failed;
-			}
-
-			if (cold)
-				page = list_last_entry(list, struct page, lru);
-			else
-				page = list_first_entry(list, struct page, lru);
-
-			list_del(&page->lru);
-			pcp->count--;
+		page = rmqueue_pcplist(preferred_zone, zone, order,
+				gfp_flags, migratetype);
+		goto out;
+	}
 
-		} while (check_new_pcp(page));
-	} else {
-		/*
-		 * We most definitely don't want callers attempting to
-		 * allocate greater than order-1 page units with __GFP_NOFAIL.
-		 */
-		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-		spin_lock_irqsave(&zone->lock, flags);
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+	spin_lock_irqsave(&zone->lock, flags);
 
-		do {
-			page = NULL;
-			if (alloc_flags & ALLOC_HARDER) {
-				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-				if (page)
-					trace_mm_page_alloc_zone_locked(page, order, migratetype);
-			}
-			if (!page)
-				page = __rmqueue(zone, order, migratetype);
-		} while (page && check_new_pages(page, order));
-		spin_unlock(&zone->lock);
+	do {
+		page = NULL;
+		if (alloc_flags & ALLOC_HARDER) {
+			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			if (page)
+				trace_mm_page_alloc_zone_locked(page, order, migratetype);
+		}
 		if (!page)
-			goto failed;
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-	}
+			page = __rmqueue(zone, order, migratetype);
+	} while (page && check_new_pages(page, order));
+	spin_unlock(&zone->lock);
+	if (!page)
+		goto failed;
+	__mod_zone_freepage_state(zone, -(1 << order),
+				  get_pcppage_migratetype(page));
 
 	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 
+out:
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 	return page;
 
@@ -2974,7 +3004,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		}
 
 try_this_zone:
-		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
 				gfp_mask, alloc_flags, ac->migratetype);
 		if (page) {
 			prep_new_page(page, order, gfp_mask, alloc_flags);
-- 
2.11.0


* [PATCH 2/4] mm, page_alloc: Split alloc_pages_nodemask
  2017-01-17  9:29 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4 Mel Gorman
  2017-01-17  9:29 ` [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue Mel Gorman
@ 2017-01-17  9:29 ` Mel Gorman
  2017-01-17  9:29 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
  2017-01-17  9:29 ` [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests Mel Gorman
  3 siblings, 0 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-17  9:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

__alloc_pages_nodemask does a number of preparation steps that determine
which zones can be used for the allocation, depending on a variety of
factors. This is fine, but a hypothetical caller that wanted multiple
order-0 pages would have to do the preparation steps multiple times. This
patch structures __alloc_pages_nodemask such that it's relatively easy to
build a bulk order-0 page allocator. There is no functional change.
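
To illustrate the point, a rough sketch of how a hypothetical bulk entry
point could reuse the split-out helpers and do the preparation only once.
This is illustrative only and not part of the patch; the name is made up
and error handling is elided:

  unsigned long __alloc_pages_bulk_nodemask(gfp_t gfp_mask, unsigned int order,
              struct zonelist *zonelist, nodemask_t *nodemask,
              unsigned long nr_pages, struct list_head *page_list)
  {
      struct alloc_context ac = { };
      gfp_t alloc_mask = gfp_mask;
      unsigned int alloc_flags = ALLOC_WMARK_LOW;
      unsigned long allocated = 0;

      /* Expensive preparation done once for the whole batch */
      gfp_mask &= gfp_allowed_mask;
      if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask,
                               &ac, &alloc_mask, &alloc_flags))
          return 0;
      finalise_ac(gfp_mask, order, &ac);

      /* Allocation loop; prep_new_page() and slowpath fallback omitted */
      while (allocated < nr_pages) {
          struct page *page = get_page_from_freelist(alloc_mask, order,
                                          alloc_flags, &ac);
          if (!page)
              break;
          list_add(&page->lru, page_list);
          allocated++;
      }
      return allocated;
  }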

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
---
 mm/page_alloc.c | 81 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e8404e546f5..d15527a20dce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3756,64 +3756,81 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask,
+		struct alloc_context *ac, gfp_t *alloc_mask,
+		unsigned int *alloc_flags)
 {
-	struct page *page;
-	unsigned int cpuset_mems_cookie;
-	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
-	struct alloc_context ac = {
-		.high_zoneidx = gfp_zone(gfp_mask),
-		.zonelist = zonelist,
-		.nodemask = nodemask,
-		.migratetype = gfpflags_to_migratetype(gfp_mask),
-	};
+	ac->high_zoneidx = gfp_zone(gfp_mask);
+	ac->zonelist = zonelist;
+	ac->nodemask = nodemask;
+	ac->migratetype = gfpflags_to_migratetype(gfp_mask);
 
 	if (cpusets_enabled()) {
-		alloc_mask |= __GFP_HARDWALL;
-		alloc_flags |= ALLOC_CPUSET;
-		if (!ac.nodemask)
-			ac.nodemask = &cpuset_current_mems_allowed;
+		*alloc_mask |= __GFP_HARDWALL;
+		*alloc_flags |= ALLOC_CPUSET;
+		if (!ac->nodemask)
+			ac->nodemask = &cpuset_current_mems_allowed;
 	}
 
-	gfp_mask &= gfp_allowed_mask;
-
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
 	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+		return false;
 
 	/*
 	 * Check the zones suitable for the gfp_mask contain at least one
 	 * valid zone. It's possible to have an empty zonelist as a result
 	 * of __GFP_THISNODE and a memoryless node
 	 */
-	if (unlikely(!zonelist->_zonerefs->zone))
-		return NULL;
+	if (unlikely(!ac->zonelist->_zonerefs->zone))
+		return false;
 
-	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
+	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+		*alloc_flags |= ALLOC_CMA;
 
-retry_cpuset:
-	cpuset_mems_cookie = read_mems_allowed_begin();
+	return true;
+}
 
+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+		unsigned int order, struct alloc_context *ac)
+{
 	/* Dirty zone balancing only done in the fast path */
-	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/*
 	 * The preferred zone is used for statistics but crucially it is
 	 * also used as the starting point for the zonelist iterator. It
 	 * may get reset for allocations that ignore memory policies.
 	 */
-	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-					ac.high_zoneidx, ac.nodemask);
+	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+					ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	struct page *page;
+	unsigned int cpuset_mems_cookie;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW;
+	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+	struct alloc_context ac = { };
+
+	gfp_mask &= gfp_allowed_mask;
+	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+		return NULL;
+
+retry_cpuset:
+	cpuset_mems_cookie = read_mems_allowed_begin();
+
+	finalise_ac(gfp_mask, order, &ac);
 	if (!ac.preferred_zoneref) {
 		page = NULL;
 		goto no_zone;
-- 
2.11.0


* [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-17  9:29 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4 Mel Gorman
  2017-01-17  9:29 ` [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue Mel Gorman
  2017-01-17  9:29 ` [PATCH 2/4] mm, page_alloc: Split alloc_pages_nodemask Mel Gorman
@ 2017-01-17  9:29 ` Mel Gorman
  2017-01-20 14:26   ` Vlastimil Babka
  2017-01-24 11:08   ` Vlastimil Babka
  2017-01-17  9:29 ` [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests Mel Gorman
  3 siblings, 2 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-17  9:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

The per-cpu page allocator can be drained immediately via drain_all_pages()
which sends IPIs to every CPU. In the next patch, the per-cpu allocator
will only be used for interrupt-safe allocations which prevents draining
it from IPI context. This patch uses workqueues to drain the per-cpu
lists instead.

This is slower, but no slowdown during intensive reclaim was measured and
the paths that use drain_all_pages() are not that sensitive to performance.
This is particularly true as the path would only be triggered when reclaim
is failing. It also makes some sense to avoid storming a machine with IPIs
when it's under memory pressure. Arguably, it should be further adjusted
so that only one caller at a time is draining pages, but that is beyond the
scope of the current patch.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d15527a20dce..9c3a0fcf8c13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2341,19 +2341,21 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
 
+static void drain_local_pages_wq(struct work_struct *work)
+{
+	drain_local_pages(NULL);
+}
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
  *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
+	struct work_struct __percpu *works;
 	int cpu;
 
 	/*
@@ -2362,6 +2364,16 @@ void drain_all_pages(struct zone *zone)
 	 */
 	static cpumask_t cpus_with_pcps;
 
+	/* Workqueues cannot recurse */
+	if (current->flags & PF_WQ_WORKER)
+		return;
+
+	/*
+	 * As this can be called from reclaim context, do not reenter reclaim.
+	 * An allocation failure can be handled, it's simply slower
+	 */
+	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
+
 	/*
 	 * We don't care about racing with CPU hotplug event
 	 * as offline notification will cause the notified
@@ -2392,8 +2404,24 @@ void drain_all_pages(struct zone *zone)
 		else
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
-	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-								zone, 1);
+
+	if (works) {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct *work = per_cpu_ptr(works, cpu);
+			INIT_WORK(work, drain_local_pages_wq);
+			schedule_work_on(cpu, work);
+		}
+		for_each_cpu(cpu, &cpus_with_pcps)
+			flush_work(per_cpu_ptr(works, cpu));
+	} else {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct work;
+
+			INIT_WORK(&work, drain_local_pages_wq);
+			schedule_work_on(cpu, &work);
+			flush_work(&work);
+		}
+	}
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
2.11.0


* [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests
  2017-01-17  9:29 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4 Mel Gorman
                   ` (2 preceding siblings ...)
  2017-01-17  9:29 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
@ 2017-01-17  9:29 ` Mel Gorman
  2017-01-20 15:02   ` Vlastimil Babka
  3 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-17  9:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

Many workloads that allocate pages are not handling an interrupt at the
time. However, as allocation requests may come from IRQ context, it's
necessary to disable/enable IRQs for every page allocation. This cost is
the bulk of the free path and also a significant percentage of the
allocation path.

This patch alters the locking and checks such that only irq-safe allocation
requests use the per-cpu allocator. All others acquire the irq-safe
zone->lock and allocate from the buddy allocator. It relies on disabling
preemption to safely access the per-cpu structures. It could be slightly
modified to avoid soft IRQs using it but it's not clear it's worthwhile.
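
In sketch form, the order-0 fast path changes roughly as follows
(illustrative only; see the diff below for the real change):

      /* Before: pcplists usable from any context */
      local_irq_save(flags);
      page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
      local_irq_restore(flags);

      /* After: pcplists only for !in_interrupt() callers */
      preempt_disable();
      page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
      preempt_enable_no_resched();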

This modification may slow allocations from IRQ context slightly, but the
main gain from the per-cpu allocator is that it scales better for
allocations from multiple contexts. There is an implicit assumption that
intensive allocations from IRQ contexts on multiple CPUs from a single NUMA
node are rare and that the vast majority of scaling issues are encountered
in !IRQ contexts such as page faulting. It's worth noting that this patch
is not required for a bulk page allocator but it significantly reduces the
overhead.

The following are results from a page allocator micro-benchmark. Only
order-0 is interesting as higher orders do not use the per-cpu allocator.

                                          4.10.0-rc2                 4.10.0-rc2
                                             vanilla               irqsafe-v1r5
Amean    alloc-odr0-1               287.15 (  0.00%)           219.00 ( 23.73%)
Amean    alloc-odr0-2               221.23 (  0.00%)           183.23 ( 17.18%)
Amean    alloc-odr0-4               187.00 (  0.00%)           151.38 ( 19.05%)
Amean    alloc-odr0-8               167.54 (  0.00%)           132.77 ( 20.75%)
Amean    alloc-odr0-16              156.00 (  0.00%)           123.00 ( 21.15%)
Amean    alloc-odr0-32              149.00 (  0.00%)           118.31 ( 20.60%)
Amean    alloc-odr0-64              138.77 (  0.00%)           116.00 ( 16.41%)
Amean    alloc-odr0-128             145.00 (  0.00%)           118.00 ( 18.62%)
Amean    alloc-odr0-256             136.15 (  0.00%)           125.00 (  8.19%)
Amean    alloc-odr0-512             147.92 (  0.00%)           121.77 ( 17.68%)
Amean    alloc-odr0-1024            147.23 (  0.00%)           126.15 ( 14.32%)
Amean    alloc-odr0-2048            155.15 (  0.00%)           129.92 ( 16.26%)
Amean    alloc-odr0-4096            164.00 (  0.00%)           136.77 ( 16.60%)
Amean    alloc-odr0-8192            166.92 (  0.00%)           138.08 ( 17.28%)
Amean    alloc-odr0-16384           159.00 (  0.00%)           138.00 ( 13.21%)
Amean    free-odr0-1                165.00 (  0.00%)            89.00 ( 46.06%)
Amean    free-odr0-2                113.00 (  0.00%)            63.00 ( 44.25%)
Amean    free-odr0-4                 99.00 (  0.00%)            54.00 ( 45.45%)
Amean    free-odr0-8                 88.00 (  0.00%)            47.38 ( 46.15%)
Amean    free-odr0-16                83.00 (  0.00%)            46.00 ( 44.58%)
Amean    free-odr0-32                80.00 (  0.00%)            44.38 ( 44.52%)
Amean    free-odr0-64                72.62 (  0.00%)            43.00 ( 40.78%)
Amean    free-odr0-128               78.00 (  0.00%)            42.00 ( 46.15%)
Amean    free-odr0-256               80.46 (  0.00%)            57.00 ( 29.16%)
Amean    free-odr0-512               96.38 (  0.00%)            64.69 ( 32.88%)
Amean    free-odr0-1024             107.31 (  0.00%)            72.54 ( 32.40%)
Amean    free-odr0-2048             108.92 (  0.00%)            78.08 ( 28.32%)
Amean    free-odr0-4096             113.38 (  0.00%)            82.23 ( 27.48%)
Amean    free-odr0-8192             112.08 (  0.00%)            82.85 ( 26.08%)
Amean    free-odr0-16384            110.38 (  0.00%)            81.92 ( 25.78%)
Amean    total-odr0-1               452.15 (  0.00%)           308.00 ( 31.88%)
Amean    total-odr0-2               334.23 (  0.00%)           246.23 ( 26.33%)
Amean    total-odr0-4               286.00 (  0.00%)           205.38 ( 28.19%)
Amean    total-odr0-8               255.54 (  0.00%)           180.15 ( 29.50%)
Amean    total-odr0-16              239.00 (  0.00%)           169.00 ( 29.29%)
Amean    total-odr0-32              229.00 (  0.00%)           162.69 ( 28.96%)
Amean    total-odr0-64              211.38 (  0.00%)           159.00 ( 24.78%)
Amean    total-odr0-128             223.00 (  0.00%)           160.00 ( 28.25%)
Amean    total-odr0-256             216.62 (  0.00%)           182.00 ( 15.98%)
Amean    total-odr0-512             244.31 (  0.00%)           186.46 ( 23.68%)
Amean    total-odr0-1024            254.54 (  0.00%)           198.69 ( 21.94%)
Amean    total-odr0-2048            264.08 (  0.00%)           208.00 ( 21.24%)
Amean    total-odr0-4096            277.38 (  0.00%)           219.00 ( 21.05%)
Amean    total-odr0-8192            279.00 (  0.00%)           220.92 ( 20.82%)
Amean    total-odr0-16384           269.38 (  0.00%)           219.92 ( 18.36%)

This is the alloc, free and total overhead of allocating order-0 pages in
batches of 1 page up to 16384 pages. Avoiding the IRQ disable/enable
overhead massively reduces the cost. Alloc overhead is reduced by roughly
14-20% in most cases, the free path by 26-46%, and the total reduction
is significant.

Many users require pages from the page allocator to be zeroed, which is
the bulk of the allocation cost. Hence, the impact on a basic page faulting
benchmark is not that significant:

                              4.10.0-rc2            4.10.0-rc2
                                 vanilla          irqsafe-v1r5
Hmean    page_test   656632.98 (  0.00%)   675536.13 (  2.88%)
Hmean    brk_test   3845502.67 (  0.00%)  3867186.94 (  0.56%)
Stddev   page_test    10543.29 (  0.00%)     4104.07 ( 61.07%)
Stddev   brk_test     33472.36 (  0.00%)    15538.39 ( 53.58%)
CoeffVar page_test        1.61 (  0.00%)        0.61 ( 62.15%)
CoeffVar brk_test         0.87 (  0.00%)        0.40 ( 53.84%)
Max      page_test   666513.33 (  0.00%)   678640.00 (  1.82%)
Max      brk_test   3882800.00 (  0.00%)  3887008.66 (  0.11%)

This is from aim9 and the most notable outcome is that fault variability
is reduced by the patch. The headline improvement is small as the overall
fault cost (zeroing, page table insertion, etc.) dominates relative to
disabling/enabling IRQs in the per-cpu allocator.

Similarly, little benefit was seen on networking benchmarks, both over
localhost and between physical server/clients, where other costs dominate.
It's possible that this will only be noticeable on very high-speed networks.

Jesper Dangaard Brouer independently tested
this with a separate microbenchmark from
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Micro-benchmarked with [1] page_bench02:
 modprobe page_bench02 page_order=0 run_flags=$((2#010)) loops=$((10**8)); \
  rmmod page_bench02 ; dmesg --notime | tail -n 4

Compared to baseline: 213 cycles(tsc) 53.417 ns
 - against this     : 184 cycles(tsc) 46.056 ns
 - Saving           : -29 cycles
 - Very close to expected 27 cycles saving [see below [2]]

Micro benchmarking via time_bench_sample[3], we get the cost of these
operations:

 time_bench: Type:for_loop                 Per elem: 0 cycles(tsc) 0.232 ns (step:0)
 time_bench: Type:spin_lock_unlock         Per elem: 33 cycles(tsc) 8.334 ns (step:0)
 time_bench: Type:spin_lock_unlock_irqsave Per elem: 62 cycles(tsc) 15.607 ns (step:0)
 time_bench: Type:irqsave_before_lock      Per elem: 57 cycles(tsc) 14.344 ns (step:0)
 time_bench: Type:spin_lock_unlock_irq     Per elem: 34 cycles(tsc) 8.560 ns (step:0)
 time_bench: Type:simple_irq_disable_before_lock Per elem: 37 cycles(tsc) 9.289 ns (step:0)
 time_bench: Type:local_BH_disable_enable  Per elem: 19 cycles(tsc) 4.920 ns (step:0)
 time_bench: Type:local_IRQ_disable_enable Per elem: 7 cycles(tsc) 1.864 ns (step:0)
 time_bench: Type:local_irq_save_restore   Per elem: 38 cycles(tsc) 9.665 ns (step:0)
 [Mel's patch removes a ^^^^^^^^^^^^^^^^]            ^^^^^^^^^ expected saving - preempt cost
 time_bench: Type:preempt_disable_enable   Per elem: 11 cycles(tsc) 2.794 ns (step:0)
 [adds a preempt  ^^^^^^^^^^^^^^^^^^^^^^]            ^^^^^^^^^ adds this cost
 time_bench: Type:funcion_call_cost        Per elem: 6 cycles(tsc) 1.689 ns (step:0)
 time_bench: Type:func_ptr_call_cost       Per elem: 11 cycles(tsc) 2.767 ns (step:0)
 time_bench: Type:page_alloc_put           Per elem: 211 cycles(tsc) 52.803 ns (step:0)

Thus, expected improvement is: 38-11 = 27 cycles.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 mm/page_alloc.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9c3a0fcf8c13..15b11fc0cd75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1087,10 +1087,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned;
+	unsigned long nr_scanned, flags;
 	bool isolated_pageblocks;
 
-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
 	isolated_pageblocks = has_isolate_pageblock(zone);
 	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
@@ -1139,7 +1139,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1147,8 +1147,8 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned;
-	spin_lock(&zone->lock);
+	unsigned long nr_scanned, flags;
+	spin_lock_irqsave(&zone->lock, flags);
 	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
 		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1158,7 +1158,7 @@ static void free_one_page(struct zone *zone,
 		migratetype = get_pfnblock_migratetype(page, pfn);
 	}
 	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1236,7 +1236,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
@@ -1244,10 +1243,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-	local_irq_save(flags);
-	__count_vm_events(PGFREE, 1 << order);
+	count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
-	local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2219,8 +2216,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			int migratetype, bool cold)
 {
 	int i, alloced = 0;
+	unsigned long flags;
 
-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
@@ -2256,7 +2254,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	 * pages added to the pcp list.
 	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 	return alloced;
 }
 
@@ -2472,16 +2470,20 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
-	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
 	if (!free_pcp_prepare(page))
 		return;
 
+	if (in_interrupt()) {
+		__free_pages_ok(page, 0);
+		return;
+	}
+
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	local_irq_save(flags);
+	preempt_disable();
 	__count_vm_event(PGFREE);
 
 	/*
@@ -2512,7 +2514,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	local_irq_restore(flags);
+	preempt_enable_no_resched();
 }
 
 /*
@@ -2637,6 +2639,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
+	VM_BUG_ON(in_interrupt());
+
 	do {
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -2667,9 +2671,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	preempt_disable();
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
@@ -2677,7 +2680,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone, gfp_flags);
 	}
-	local_irq_restore(flags);
+	preempt_enable_no_resched();
 	return page;
 }
 
@@ -2693,7 +2696,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0)) {
+	if (likely(order == 0) && !in_interrupt()) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;
-- 
2.11.0


* Re: [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue
  2017-01-17  9:29 ` [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue Mel Gorman
@ 2017-01-17 18:07   ` Jesper Dangaard Brouer
       [not found]     ` <2df88f73-a32d-4b71-d4de-3a0ad8831d9a@suse.cz>
  0 siblings, 1 reply; 25+ messages in thread
From: Jesper Dangaard Brouer @ 2017-01-17 18:07 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Linux Kernel, Linux-MM, Vlastimil Babka,
	Hillf Danton, brouer, Michal Hocko


On Tue, 17 Jan 2017 09:29:51 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:

> +/* Lock and remove page from the per-cpu list */
> +static struct page *rmqueue_pcplist(struct zone *preferred_zone,
> +			struct zone *zone, unsigned int order,
> +			gfp_t gfp_flags, int migratetype)
> +{
> +	struct per_cpu_pages *pcp;
> +	struct list_head *list;
> +	bool cold = ((gfp_flags & __GFP_COLD) != 0);
> +	struct page *page;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> +	list = &pcp->lists[migratetype];
> +	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
> +	if (page) {
> +		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
> +		zone_statistics(preferred_zone, zone, gfp_flags);

Word-of-warning: The zone_statistics() call changed number of
parameters in commit 41b6167e8f74 ("mm: get rid of __GFP_OTHER_NODE").
(Not sure what tree you are based on)

> +	}
> +	local_irq_restore(flags);
> +	return page;
> +}


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue
       [not found]     ` <2df88f73-a32d-4b71-d4de-3a0ad8831d9a@suse.cz>
@ 2017-01-17 20:20       ` Mel Gorman
  2017-01-17 21:07         ` Mel Gorman
  0 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-17 20:20 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Jesper Dangaard Brouer, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Michal Hocko

On Tue, Jan 17, 2017 at 07:17:22PM +0100, Vlastimil Babka wrote:
> On 01/17/2017 07:07 PM, Jesper Dangaard Brouer wrote:
> > 
> > On Tue, 17 Jan 2017 09:29:51 +0000 Mel Gorman <mgorman@techsingularity.net> wrote:
> > 
> >> +/* Lock and remove page from the per-cpu list */
> >> +static struct page *rmqueue_pcplist(struct zone *preferred_zone,
> >> +			struct zone *zone, unsigned int order,
> >> +			gfp_t gfp_flags, int migratetype)
> >> +{
> >> +	struct per_cpu_pages *pcp;
> >> +	struct list_head *list;
> >> +	bool cold = ((gfp_flags & __GFP_COLD) != 0);
> >> +	struct page *page;
> >> +	unsigned long flags;
> >> +
> >> +	local_irq_save(flags);
> >> +	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> >> +	list = &pcp->lists[migratetype];
> >> +	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
> >> +	if (page) {
> >> +		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
> >> +		zone_statistics(preferred_zone, zone, gfp_flags);
> > 
> > Word-of-warning: The zone_statistics() call changed number of
> > parameters in commit 41b6167e8f74 ("mm: get rid of __GFP_OTHER_NODE").
> > (Not sure what tree you are based on)
> 

Yes, there's a conflict. The fix is trivial and shouldn't affect the
overall series. Not that it matters because of the next part:

> Yeah and there will likely be more conflicts with fixes wrt the "getting
> oom/stalls for ltp test cpuset01 with latest/4.9 kernel???" thread,
> hopefully tomorrow.
> 

It was on my list to look closer at that thread tomorrow. I only took a
quick look for the first time a few minutes ago and it looks bad. There
is at least a flaw in the retry sequence: if cpusets are disabled during
an allocation that fails, it won't retry. That leaves a small window, if
the last cpuset disappeared, during which an allocation could artificially
fail, but that can't be what's going on here.

It could still be the retry logic because the nodemask is not necessarily
synced up with cpuset_current_mems_allowed. I'll try reproducing this
in the morning. The fix is almost certainly going to conflict with this
series but this series can wait until after that gets resolved and I'll
rebase on top of mmotm.

It's late so I'm fairly tired but assuming I can reproduce this in the
morning, the first thing I'll try is something like this to force a reread
of mems_allowed;

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ebea51cc0135..3fc2b3a8d301 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3774,13 +3774,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		.migratetype = gfpflags_to_migratetype(gfp_mask),
 	};
 
-	if (cpusets_enabled()) {
-		alloc_mask |= __GFP_HARDWALL;
-		alloc_flags |= ALLOC_CPUSET;
-		if (!ac.nodemask)
-			ac.nodemask = &cpuset_current_mems_allowed;
-	}
-
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
@@ -3802,6 +3795,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		alloc_flags |= ALLOC_CMA;
 
 retry_cpuset:
+	if (cpusets_enabled()) {
+		alloc_mask |= __GFP_HARDWALL;
+		alloc_flags |= ALLOC_CPUSET;
+		if (!nodemask)
+			ac.nodemask = &cpuset_current_mems_allowed;
+	}
+
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
 	/* Dirty zone balancing only done in the fast path */

If that doesn't work out then I'll start kicking the problem properly
unless you've beaten me to the correct solution already :)

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue
  2017-01-17 20:20       ` Mel Gorman
@ 2017-01-17 21:07         ` Mel Gorman
  2017-01-17 21:24           ` Vlastimil Babka
  0 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-17 21:07 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Jesper Dangaard Brouer, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Michal Hocko

On Tue, Jan 17, 2017 at 08:20:08PM +0000, Mel Gorman wrote:
> It's late so I'm fairly tired but assuming I can reproduce this in the
> morning, the first thing I'll try is something like this to force a reread
> of mems_allowed;
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index ebea51cc0135..3fc2b3a8d301 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3774,13 +3774,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>  		.migratetype = gfpflags_to_migratetype(gfp_mask),
>  	};
>  
> -	if (cpusets_enabled()) {
> -		alloc_mask |= __GFP_HARDWALL;
> -		alloc_flags |= ALLOC_CPUSET;
> -		if (!ac.nodemask)
> -			ac.nodemask = &cpuset_current_mems_allowed;
> -	}
> -
>  	gfp_mask &= gfp_allowed_mask;
>  
>  	lockdep_trace_alloc(gfp_mask);
> @@ -3802,6 +3795,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
>  		alloc_flags |= ALLOC_CMA;
>  
>  retry_cpuset:
> +	if (cpusets_enabled()) {
> +		alloc_mask |= __GFP_HARDWALL;
> +		alloc_flags |= ALLOC_CPUSET;
> +		if (!nodemask)
> +			ac.nodemask = &cpuset_current_mems_allowed;
> +	}
> +
>  	cpuset_mems_cookie = read_mems_allowed_begin();
>  
>  	/* Dirty zone balancing only done in the fast path */
> 

I later recalled that we looked at this before and didn't think a reinit
was necessary because the location of cpuset_current_mems_allowed doesn't
change so I came back and took another look.  The location doesn't change
but after the first attempt, we reset ac.nodemask to the given nodemask and
don't recheck current_mems_allowed if the cpuset changed. The application
of memory policies versus cpusets is a mess so it'll take time to pick
apart to see if this is even remotely in the right direction.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue
  2017-01-17 21:07         ` Mel Gorman
@ 2017-01-17 21:24           ` Vlastimil Babka
  0 siblings, 0 replies; 25+ messages in thread
From: Vlastimil Babka @ 2017-01-17 21:24 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Jesper Dangaard Brouer, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Michal Hocko

On 01/17/2017 10:07 PM, Mel Gorman wrote:
> On Tue, Jan 17, 2017 at 08:20:08PM +0000, Mel Gorman wrote:
> 
> I later recalled that we looked at this before and didn't think a reinit
> was necessary because the location of cpuset_current_mems_allowed doesn't
> change so I came back and took another look.  The location doesn't change
> but after the first attempt, we reset ac.nodemask to the given nodemask and
> don't recheck current_mems_allowed if the cpuset changed. The application
> of memory policies versus cpusets is a mess so it'll take time to pick
> apart to see if this is even remotely in the right direction.

Yes, I spent most of the last 2 days untangling this, so I'll post at least
some RFC soon.


* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-17  9:29 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
@ 2017-01-20 14:26   ` Vlastimil Babka
  2017-01-20 15:26     ` Mel Gorman
  2017-01-24 11:08   ` Vlastimil Babka
  1 sibling, 1 reply; 25+ messages in thread
From: Vlastimil Babka @ 2017-01-20 14:26 UTC (permalink / raw)
  To: Mel Gorman, Andrew Morton
  Cc: Linux Kernel, Linux-MM, Hillf Danton, Jesper Dangaard Brouer,
	Petr Mladek

On 01/17/2017 10:29 AM, Mel Gorman wrote:
> The per-cpu page allocator can be drained immediately via drain_all_pages()
> which sends IPIs to every CPU. In the next patch, the per-cpu allocator
> will only be used for interrupt-safe allocations which prevents draining
> it from IPI context. This patch uses workqueues to drain the per-cpu
> lists instead.
> 
> This is slower but no slowdown during intensive reclaim was measured and
> the paths that use drain_all_pages() are not that sensitive to performance.
> This is particularly true as the path would only be triggered when reclaim
> is failing. It also makes a some sense to avoid storming a machine with IPIs
> when it's under memory pressure. Arguably, it should be further adjusted
> so that only one caller at a time is draining pages but it's beyond the
> scope of the current patch.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

I'm not a workqueue expert (CC Petr Mladek) but I compared this to
lru_add_drain_all() and have some questions...

> ---
>  mm/page_alloc.c | 42 +++++++++++++++++++++++++++++++++++-------
>  1 file changed, 35 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d15527a20dce..9c3a0fcf8c13 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2341,19 +2341,21 @@ void drain_local_pages(struct zone *zone)
>  		drain_pages(cpu);
>  }
>  
> +static void drain_local_pages_wq(struct work_struct *work)
> +{
> +	drain_local_pages(NULL);
> +}
> +
>  /*
>   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
>   *
>   * When zone parameter is non-NULL, spill just the single zone's pages.
>   *
> - * Note that this code is protected against sending an IPI to an offline
> - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
> - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
> - * nothing keeps CPUs from showing up after we populated the cpumask and
> - * before the call to on_each_cpu_mask().
> + * Note that this can be extremely slow as the draining happens in a workqueue.
>   */
>  void drain_all_pages(struct zone *zone)
>  {
> +	struct work_struct __percpu *works;
>  	int cpu;
>  
>  	/*
> @@ -2362,6 +2364,16 @@ void drain_all_pages(struct zone *zone)
>  	 */
>  	static cpumask_t cpus_with_pcps;
>  
> +	/* Workqueues cannot recurse */
> +	if (current->flags & PF_WQ_WORKER)
> +		return;
> +
> +	/*
> +	 * As this can be called from reclaim context, do not reenter reclaim.
> +	 * An allocation failure can be handled, it's simply slower
> +	 */
> +	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
> +
>  	/*
>  	 * We don't care about racing with CPU hotplug event
>  	 * as offline notification will cause the notified
> @@ -2392,8 +2404,24 @@ void drain_all_pages(struct zone *zone)
>  		else
>  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
>  	}
> -	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
> -								zone, 1);
> +
> +	if (works) {
> +		for_each_cpu(cpu, &cpus_with_pcps) {
> +			struct work_struct *work = per_cpu_ptr(works, cpu);
> +			INIT_WORK(work, drain_local_pages_wq);
> +			schedule_work_on(cpu, work);

This translates to queue_work_on(), which has the comment of "We queue
the work to a specific CPU, the caller must ensure it can't go away.",
so is this safe? lru_add_drain_all() uses get_online_cpus() around this.

schedule_work_on() also uses the generic system_wq, while lru drain has
its own workqueue with WQ_MEM_RECLAIM so it seems that would be useful
here as well?
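
For reference, the hotplug-safe pattern that lru_add_drain_all() uses would
look roughly like this here (just a sketch, not a proposed change):

      get_online_cpus();
      for_each_cpu(cpu, &cpus_with_pcps) {
          struct work_struct *work = per_cpu_ptr(works, cpu);
          INIT_WORK(work, drain_local_pages_wq);
          schedule_work_on(cpu, work);
      }
      for_each_cpu(cpu, &cpus_with_pcps)
          flush_work(per_cpu_ptr(works, cpu));
      put_online_cpus();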

> +		}
> +		for_each_cpu(cpu, &cpus_with_pcps)
> +			flush_work(per_cpu_ptr(works, cpu));
> +	} else {
> +		for_each_cpu(cpu, &cpus_with_pcps) {
> +			struct work_struct work;
> +
> +			INIT_WORK(&work, drain_local_pages_wq);
> +			schedule_work_on(cpu, &work);
> +			flush_work(&work);

Totally out of scope, but I wonder if schedule_on_each_cpu() could use
the same fallback that's here?

> +		}
> +	}
>  }
>  
>  #ifdef CONFIG_HIBERNATION
> 


* Re: [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests
  2017-01-17  9:29 ` [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests Mel Gorman
@ 2017-01-20 15:02   ` Vlastimil Babka
  2017-01-23 11:17     ` Mel Gorman
  0 siblings, 1 reply; 25+ messages in thread
From: Vlastimil Babka @ 2017-01-20 15:02 UTC (permalink / raw)
  To: Mel Gorman, Andrew Morton
  Cc: Linux Kernel, Linux-MM, Hillf Danton, Jesper Dangaard Brouer

On 01/17/2017 10:29 AM, Mel Gorman wrote:

[...]

> @@ -1244,10 +1243,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
>  		return;
>  
>  	migratetype = get_pfnblock_migratetype(page, pfn);
> -	local_irq_save(flags);
> -	__count_vm_events(PGFREE, 1 << order);
> +	count_vm_events(PGFREE, 1 << order);

Maybe this could be avoided by moving the counting into free_one_page()?
Diff suggestion at the end of e-mail.

>  	free_one_page(page_zone(page), page, pfn, order, migratetype);
> -	local_irq_restore(flags);
>  }
>  
>  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
> @@ -2219,8 +2216,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  			int migratetype, bool cold)
>  {
>  	int i, alloced = 0;
> +	unsigned long flags;
>  
> -	spin_lock(&zone->lock);
> +	spin_lock_irqsave(&zone->lock, flags);
>  	for (i = 0; i < count; ++i) {
>  		struct page *page = __rmqueue(zone, order, migratetype);
>  		if (unlikely(page == NULL))
> @@ -2256,7 +2254,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  	 * pages added to the pcp list.
>  	 */
>  	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
> -	spin_unlock(&zone->lock);
> +	spin_unlock_irqrestore(&zone->lock, flags);
>  	return alloced;
>  }
>  
> @@ -2472,16 +2470,20 @@ void free_hot_cold_page(struct page *page, bool cold)
>  {
>  	struct zone *zone = page_zone(page);
>  	struct per_cpu_pages *pcp;
> -	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(page);
>  	int migratetype;
>  
>  	if (!free_pcp_prepare(page))
>  		return;
>  
> +	if (in_interrupt()) {
> +		__free_pages_ok(page, 0);
> +		return;
> +	}

I think this should go *before* free_pcp_prepare() otherwise
free_pages_prepare() gets done twice in interrupt.


diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15b11fc0cd75..8c6f8a790272 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1149,6 +1149,7 @@ static void free_one_page(struct zone *zone,
 {
 	unsigned long nr_scanned, flags;
 	spin_lock_irqsave(&zone->lock, flags);
+	__count_vm_events(PGFREE, 1 << order);
 	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
 	if (nr_scanned)
 		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1243,7 +1244,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-	count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
 }
 
@@ -2484,7 +2484,6 @@ void free_hot_cold_page(struct page *page, bool cold)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
 	preempt_disable();
-	__count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2501,6 +2500,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
+	__count_vm_event(PGFREE);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (!cold)
 		list_add(&page->lru, &pcp->lists[migratetype]);


* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-20 14:26   ` Vlastimil Babka
@ 2017-01-20 15:26     ` Mel Gorman
  2017-01-23 16:29       ` Petr Mladek
  2017-01-23 17:03       ` Tejun Heo
  0 siblings, 2 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-20 15:26 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Linux Kernel, Linux-MM, Hillf Danton,
	Jesper Dangaard Brouer, Petr Mladek

On Fri, Jan 20, 2017 at 03:26:05PM +0100, Vlastimil Babka wrote:
> > @@ -2392,8 +2404,24 @@ void drain_all_pages(struct zone *zone)
> >  		else
> >  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
> >  	}
> > -	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
> > -								zone, 1);
> > +
> > +	if (works) {
> > +		for_each_cpu(cpu, &cpus_with_pcps) {
> > +			struct work_struct *work = per_cpu_ptr(works, cpu);
> > +			INIT_WORK(work, drain_local_pages_wq);
> > +			schedule_work_on(cpu, work);
> 
> This translates to queue_work_on(), which has the comment of "We queue
> the work to a specific CPU, the caller must ensure it can't go away.",
> so is this safe? lru_add_drain_all() uses get_online_cpus() around this.
> 

get_online_cpus() would be required.

> schedule_work_on() also uses the generic system_wq, while lru drain has
> its own workqueue with WQ_MEM_RECLAIM so it seems that would be useful
> here as well?
> 

I would be reluctant to introduce a dedicated queue unless there was a
definite case where an OOM occurred because pages were pinned on per-cpu
lists and couldn't be drained because the buddy allocator was depleted.
As it was, I thought the fallback case was excessively paranoid.

> > +		}
> > +		for_each_cpu(cpu, &cpus_with_pcps)
> > +			flush_work(per_cpu_ptr(works, cpu));
> > +	} else {
> > +		for_each_cpu(cpu, &cpus_with_pcps) {
> > +			struct work_struct work;
> > +
> > +			INIT_WORK(&work, drain_local_pages_wq);
> > +			schedule_work_on(cpu, &work);
> > +			flush_work(&work);
> 
> Totally out of scope, but I wonder if schedule_on_each_cpu() could use
> the same fallback that's here?
> 

I'm not aware of a case where it really has been a problem. I only considered
it here as the likely caller is in a context that is failing allocations.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests
  2017-01-20 15:02   ` Vlastimil Babka
@ 2017-01-23 11:17     ` Mel Gorman
  0 siblings, 0 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-23 11:17 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Linux Kernel, Linux-MM, Hillf Danton,
	Jesper Dangaard Brouer

On Fri, Jan 20, 2017 at 04:02:56PM +0100, Vlastimil Babka wrote:
> On 01/17/2017 10:29 AM, Mel Gorman wrote:
> 
> [...]
> 
> > @@ -1244,10 +1243,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
> >  		return;
> >  
> >  	migratetype = get_pfnblock_migratetype(page, pfn);
> > -	local_irq_save(flags);
> > -	__count_vm_events(PGFREE, 1 << order);
> > +	count_vm_events(PGFREE, 1 << order);
> 
> Maybe this could be avoided by moving the counting into free_one_page()?
> Diff suggestion at the end of e-mail.
> 

Yes, that would work.

> > @@ -2472,16 +2470,20 @@ void free_hot_cold_page(struct page *page, bool cold)
> >  {
> >  	struct zone *zone = page_zone(page);
> >  	struct per_cpu_pages *pcp;
> > -	unsigned long flags;
> >  	unsigned long pfn = page_to_pfn(page);
> >  	int migratetype;
> >  
> >  	if (!free_pcp_prepare(page))
> >  		return;
> >  
> > +	if (in_interrupt()) {
> > +		__free_pages_ok(page, 0);
> > +		return;
> > +	}
> 
> I think this should go *before* free_pcp_prepare() otherwise
> free_pages_prepare() gets done twice in interrupt.
> 

You're right, thanks.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-20 15:26     ` Mel Gorman
@ 2017-01-23 16:29       ` Petr Mladek
  2017-01-23 16:50         ` Mel Gorman
  2017-01-23 17:03       ` Tejun Heo
  1 sibling, 1 reply; 25+ messages in thread
From: Petr Mladek @ 2017-01-23 16:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Tejun Heo

On Fri 2017-01-20 15:26:06, Mel Gorman wrote:
> On Fri, Jan 20, 2017 at 03:26:05PM +0100, Vlastimil Babka wrote:
> > > @@ -2392,8 +2404,24 @@ void drain_all_pages(struct zone *zone)
> > >  		else
> > >  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
> > >  	}
> > > -	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
> > > -								zone, 1);
> > > +
> > > +	if (works) {
> > > +		for_each_cpu(cpu, &cpus_with_pcps) {
> > > +			struct work_struct *work = per_cpu_ptr(works, cpu);
> > > +			INIT_WORK(work, drain_local_pages_wq);
> > > +			schedule_work_on(cpu, work);
> > 
> > This translates to queue_work_on(), which has the comment of "We queue
> > the work to a specific CPU, the caller must ensure it can't go away.",
> > so is this safe? lru_add_drain_all() uses get_online_cpus() around this.
> > 
> 
> get_online_cpus() would be required.
> 
> > schedule_work_on() also uses the generic system_wq, while lru drain has
> > its own workqueue with WQ_MEM_RECLAIM so it seems that would be useful
> > here as well?
> > 
> 
> I would be reluctant to introduce a dedicated queue unless there was a
> definite case where an OOM occurred because pages were pinned on per-cpu
> lists and couldn't be drained because the buddy allocator was depleted.
> As it was, I thought the fallback case was excessively paranoid.

I guess that you know it but it is not clear from the above paragraph.

WQ_MEM_RECLAIM makes sure that there is a rescue worker available.
It is used when all workers are busy (blocked by an allocation
request) and a new worker (kthread) cannot be forked because
the fork would need an allocation as well.

The fallback below solves the situation when the struct work cannot
be allocated. But it does not solve the situation when there is
no worker available to actually process the work. I am not sure if this
is relevant for drain_all_pages().
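
For completeness, a dedicated queue with a rescuer would be set up roughly
like this (a sketch only; the queue name is made up):

      static struct workqueue_struct *pcp_drain_wq;

      /* once, at init time */
      pcp_drain_wq = alloc_workqueue("pcp_drain", WQ_MEM_RECLAIM, 0);

      /* instead of schedule_work_on() in drain_all_pages() */
      queue_work_on(cpu, pcp_drain_wq, work);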

Best Regards,
Petr

> > > +		}
> > > +		for_each_cpu(cpu, &cpus_with_pcps)
> > > +			flush_work(per_cpu_ptr(works, cpu));
> > > +	} else {
> > > +		for_each_cpu(cpu, &cpus_with_pcps) {
> > > +			struct work_struct work;
> > > +
> > > +			INIT_WORK(&work, drain_local_pages_wq);
> > > +			schedule_work_on(cpu, &work);
> > > +			flush_work(&work);
> > 
> > Totally out of scope, but I wonder if schedule_on_each_cpu() could use
> > the same fallback that's here?
> > 
> 
> I'm not aware of a case where it really has been a problem. I only considered
> it here as the likely caller is in a context that is failing allocations.
> 
> -- 
> Mel Gorman
> SUSE Labs


* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 16:29       ` Petr Mladek
@ 2017-01-23 16:50         ` Mel Gorman
  0 siblings, 0 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-23 16:50 UTC (permalink / raw)
  To: Petr Mladek
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Tejun Heo

On Mon, Jan 23, 2017 at 05:29:20PM +0100, Petr Mladek wrote:
> On Fri 2017-01-20 15:26:06, Mel Gorman wrote:
> > On Fri, Jan 20, 2017 at 03:26:05PM +0100, Vlastimil Babka wrote:
> > > > @@ -2392,8 +2404,24 @@ void drain_all_pages(struct zone *zone)
> > > >  		else
> > > >  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
> > > >  	}
> > > > -	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
> > > > -								zone, 1);
> > > > +
> > > > +	if (works) {
> > > > +		for_each_cpu(cpu, &cpus_with_pcps) {
> > > > +			struct work_struct *work = per_cpu_ptr(works, cpu);
> > > > +			INIT_WORK(work, drain_local_pages_wq);
> > > > +			schedule_work_on(cpu, work);
> > > 
> > > This translates to queue_work_on(), which has the comment of "We queue
> > > the work to a specific CPU, the caller must ensure it can't go away.",
> > > so is this safe? lru_add_drain_all() uses get_online_cpus() around this.
> > > 
> > 
> > get_online_cpus() would be required.
> > 
> > > schedule_work_on() also uses the generic system_wq, while lru drain has
> > > its own workqueue with WQ_MEM_RECLAIM so it seems that would be useful
> > > here as well?
> > > 
> > 
> > I would be reluctant to introduce a dedicated queue unless there was a
> > definite case where an OOM occurred because pages were pinned on per-cpu
> > lists and couldn't be drained because the buddy allocator was depleted.
> > As it was, I thought the fallback case was excessively paranoid.
> 
> I guess you know this, but it is not clear from the above paragraph.
> 
> WQ_MEM_RECLAIM makes sure that there is a rescue worker available.
> It is used when all workers are busy (blocked by an allocation
> request) and a new worker (kthread) cannot be forked because
> the fork would need an allocation as well.
> 
> The fallback below solves the situation where a struct work cannot
> be allocated. But it does not solve the situation where there is
> no worker available to actually process the work. I am not sure if this
> is relevant for drain_all_pages().
> 

I'm aware of the situation but, in itself, I still don't think it justifies
a dedicated workqueue. The main call for drain_all_pages under reclaim
pressure is dubious because it's easy to trigger: for example, two contenders
for memory that are doing streaming reads or large amounts of anonymous
faults. Reclaim can be making progress but the two are racing with each
other to keep the watermarks above min and draining frequently. The IPIs
for a fairly normal situation are bad enough and even the workqueue work
isn't particularly welcome.

It would make more sense overall to move the unreserve and drain logic
into the nearly-oom path but it would likely be overkill. I'd only want
to look into that or a dedicated workqueue if there is a case of an OOM
triggered when a large number of CPUs had per-cpu pages available.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-20 15:26     ` Mel Gorman
  2017-01-23 16:29       ` Petr Mladek
@ 2017-01-23 17:03       ` Tejun Heo
  2017-01-23 20:04         ` Mel Gorman
  1 sibling, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2017-01-23 17:03 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

Hello,

On Fri, Jan 20, 2017 at 03:26:06PM +0000, Mel Gorman wrote:
> > This translates to queue_work_on(), which has the comment of "We queue
> > the work to a specific CPU, the caller must ensure it can't go away.",
> > so is this safe? lru_add_drain_all() uses get_online_cpus() around this.
> > 
> 
> get_online_cpus() would be required.

This part of workqueue usage has always been a bit clunky and I should
improve it, but you don't necessarily have to pin the cpus from
queueing to execution.  You can queue without checking whether the CPU
is online and instead synchronize the actual work item execution
against the cpu offline callback so that if the work item gets executed
after the offline callback has finished, it becomes a noop.
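
As a hypothetical sketch (made-up names, not what this patch does), the
work function could check a liveness mask that the offline callback
clears, so a drain queued without pinning hotplug simply degenerates
into a no-op on a CPU that has gone away:

	/* Maintained by the cpu online/offline callbacks (not shown). */
	static cpumask_t pcpu_drain_online;

	static void drain_local_pages_wq(struct work_struct *work)
	{
		int cpu;

		preempt_disable();
		cpu = smp_processor_id();
		/*
		 * If the offline callback has already cleared the bit for
		 * this CPU, the work item becomes a no-op.
		 */
		if (cpumask_test_cpu(cpu, &pcpu_drain_online))
			drain_local_pages(NULL);
		preempt_enable();
	}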

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 17:03       ` Tejun Heo
@ 2017-01-23 20:04         ` Mel Gorman
  2017-01-23 20:55           ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-23 20:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

On Mon, Jan 23, 2017 at 12:03:29PM -0500, Tejun Heo wrote:
> Hello,
> 
> On Fri, Jan 20, 2017 at 03:26:06PM +0000, Mel Gorman wrote:
> > > This translates to queue_work_on(), which has the comment of "We queue
> > > the work to a specific CPU, the caller must ensure it can't go away.",
> > > so is this safe? lru_add_drain_all() uses get_online_cpus() around this.
> > > 
> > 
> > get_online_cpus() would be required.
> 
> This part of workqueue usage has always been a bit clunky and I should
> improve it, but you don't necessarily have to pin the cpus from
> queueing to execution.  You can queue without checking whether the CPU
> is online and instead synchronize the actual work item execution
> against the cpu offline callback so that if the work item gets executed
> after the offline callback has finished, it becomes a noop.
> 

What is the actual mechanism that does that? It's not something that
schedule_on_each_cpu does and one would expect that the core workqueue
implementation would get this sort of detail correct. Or is this a proposal
on how it should be done?

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 20:04         ` Mel Gorman
@ 2017-01-23 20:55           ` Tejun Heo
  2017-01-23 23:04             ` Mel Gorman
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2017-01-23 20:55 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

Hello, Mel.

On Mon, Jan 23, 2017 at 08:04:12PM +0000, Mel Gorman wrote:
> What is the actual mechanism that does that? It's not something that
> schedule_on_each_cpu does and one would expect that the core workqueue
> implementation would get this sort of detail correct. Or is this a proposal
> on how it should be done?

If you use schedule_on_each_cpu(), it's all fine as the thing pins
cpus and waits for all the work items synchronously.  If you wanna do
it asynchronously, right now, you'll have to manually synchronize the
work items against the offline callback.
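
Purely for reference, and assuming drain_local_pages_wq stays a plain
work_func_t, the fully synchronous route would collapse to something
like:

	/* Pins hotplug and waits, but drains every online CPU rather
	 * than only the CPUs in cpus_with_pcps. */
	schedule_on_each_cpu(drain_local_pages_wq);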

In this area, the current workqueue behavior is pretty bad.
Historically, we didn't distinguish affinity-for-optimization from
affinity-for-correctness, so we couldn't really enforce strong
behaviors on it.  We started distinguishing them some releases ago, so
I should revisit it soon.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 20:55           ` Tejun Heo
@ 2017-01-23 23:04             ` Mel Gorman
  2017-01-24 16:07               ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-23 23:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

On Mon, Jan 23, 2017 at 03:55:01PM -0500, Tejun Heo wrote:
> Hello, Mel.
> 
> On Mon, Jan 23, 2017 at 08:04:12PM +0000, Mel Gorman wrote:
> > What is the actual mechanism that does that? It's not something that
> > schedule_on_each_cpu does and one would expect that the core workqueue
> > implementation would get this sort of detail correct. Or is this a proposal
> > on how it should be done?
> 
> If you use schedule_on_each_cpu(), it's all fine as the thing pins
> cpus and waits for all the work items synchronously.  If you wanna do
> it asynchronously, right now, you'll have to manually synchronize the
> work items against the offline callback.
> 

Is the current implementation and what it does wrong in some way? I ask
because synchronising against the offline callback sounds like it would
be a bit of a maintenance mess for relatively little gain.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-17  9:29 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
  2017-01-20 14:26   ` Vlastimil Babka
@ 2017-01-24 11:08   ` Vlastimil Babka
  1 sibling, 0 replies; 25+ messages in thread
From: Vlastimil Babka @ 2017-01-24 11:08 UTC (permalink / raw)
  To: Mel Gorman, Andrew Morton
  Cc: Linux Kernel, Linux-MM, Hillf Danton, Jesper Dangaard Brouer, Tejun Heo

On 01/17/2017 10:29 AM, Mel Gorman wrote:
> The per-cpu page allocator can be drained immediately via drain_all_pages()
> which sends IPIs to every CPU. In the next patch, the per-cpu allocator
> will only be used for interrupt-safe allocations which prevents draining
> it from IPI context. This patch uses workqueues to drain the per-cpu
> lists instead.
> 
> This is slower but no slowdown during intensive reclaim was measured and
> the paths that use drain_all_pages() are not that sensitive to performance.
> This is particularly true as the path would only be triggered when reclaim
> is failing. It also makes some sense to avoid storming a machine with IPIs
> when it's under memory pressure. Arguably, it should be further adjusted
> so that only one caller at a time is draining pages but it's beyond the
> scope of the current patch.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  mm/page_alloc.c | 42 +++++++++++++++++++++++++++++++++++-------
>  1 file changed, 35 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d15527a20dce..9c3a0fcf8c13 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2341,19 +2341,21 @@ void drain_local_pages(struct zone *zone)
>  		drain_pages(cpu);
>  }
>  
> +static void drain_local_pages_wq(struct work_struct *work)
> +{
> +	drain_local_pages(NULL);
> +}
> +
>  /*
>   * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
>   *
>   * When zone parameter is non-NULL, spill just the single zone's pages.
>   *
> - * Note that this code is protected against sending an IPI to an offline
> - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
> - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
> - * nothing keeps CPUs from showing up after we populated the cpumask and
> - * before the call to on_each_cpu_mask().
> + * Note that this can be extremely slow as the draining happens in a workqueue.
>   */
>  void drain_all_pages(struct zone *zone)
>  {
> +	struct work_struct __percpu *works;
>  	int cpu;
>  
>  	/*
> @@ -2362,6 +2364,16 @@ void drain_all_pages(struct zone *zone)
>  	 */
>  	static cpumask_t cpus_with_pcps;
>  
> +	/* Workqueues cannot recurse */
> +	if (current->flags & PF_WQ_WORKER)
> +		return;
> +
> +	/*
> +	 * As this can be called from reclaim context, do not reenter reclaim.
> +	 * An allocation failure can be handled, it's simply slower
> +	 */
> +	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);

BTW I wonder, even with GFP_ATOMIC, is this a good idea to do for a
temporary allocation like this one? pcpu_alloc() seems rather involved
to me and I've glanced at the other usages and they seem much more
long-lived. Maybe it would really be better to have a single static
"works" and serialize the callers as you suggest in the changelog?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 23:04             ` Mel Gorman
@ 2017-01-24 16:07               ` Tejun Heo
  2017-01-24 23:54                 ` Mel Gorman
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2017-01-24 16:07 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

Hello, Mel.

On Mon, Jan 23, 2017 at 11:04:29PM +0000, Mel Gorman wrote:
> On Mon, Jan 23, 2017 at 03:55:01PM -0500, Tejun Heo wrote:
> > Hello, Mel.
> > 
> > On Mon, Jan 23, 2017 at 08:04:12PM +0000, Mel Gorman wrote:
> > > What is the actual mechanism that does that? It's not something that
> > > schedule_on_each_cpu does and one would expect that the core workqueue
> > > implementation would get this sort of detail correct. Or is this a proposal
> > > on how it should be done?
> > 
> > If you use schedule_on_each_cpu(), it's all fine as the thing pins
> > cpus and waits for all the work items synchronously.  If you wanna do
> > it asynchronously, right now, you'll have to manually synchronize the
> > work items against the offline callback.
> > 
> 
> Is the current implementation and what it does wrong in some way? I ask
> because synchronising against the offline callback sounds like it would
> be a bit of a maintenance mess for relatively little gain.

As long as you wrap them with get/put_online_cpus(), the current
implementation should be fine.  If it were up to me, though, I'd rather
use static percpu work_structs and synchronize with a mutex.  The cost
of synchronizing via a mutex isn't high here compared to the overall
operation; the whole thing is synchronous anyway and you won't have to
worry about falling back.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-24 16:07               ` Tejun Heo
@ 2017-01-24 23:54                 ` Mel Gorman
  2017-01-25  2:02                   ` Tejun Heo
  0 siblings, 1 reply; 25+ messages in thread
From: Mel Gorman @ 2017-01-24 23:54 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

On Tue, Jan 24, 2017 at 11:07:22AM -0500, Tejun Heo wrote:
> Hello, Mel.
> 
> On Mon, Jan 23, 2017 at 11:04:29PM +0000, Mel Gorman wrote:
> > On Mon, Jan 23, 2017 at 03:55:01PM -0500, Tejun Heo wrote:
> > > Hello, Mel.
> > > 
> > > On Mon, Jan 23, 2017 at 08:04:12PM +0000, Mel Gorman wrote:
> > > > What is the actual mechanism that does that? It's not something that
> > > > schedule_on_each_cpu does and one would expect that the core workqueue
> > > > implementation would get this sort of detail correct. Or is this a proposal
> > > > on how it should be done?
> > > 
> > > If you use schedule_on_each_cpu(), it's all fine as the thing pins
> > > cpus and waits for all the work items synchronously.  If you wanna do
> > > it asynchronously, right now, you'll have to manually synchronize the
> > > work items against the offline callback.
> > > 
> > 
> > Is the current implementation and what it does wrong in some way? I ask
> > because synchronising against the offline callback sounds like it would
> > be a bit of a maintenance mess for relatively little gain.
> 
> As long as you wrap them with get/put_online_cpus(), the current
> implementation should be fine.  If it were up to me, though, I'd rather
> use static percpu work_structs and synchronize with a mutex.  The cost
> of synchronizing via a mutex isn't high here compared to the overall
> operation; the whole thing is synchronous anyway and you won't have to
> worry about falling back.
> 

The synchronisation is not even required in all cases. Multiple direct
reclaimers synching to do the drain doesn't necessarily make sense, for
example. How does the following look to you?

---8<---
mm, page_alloc: Use static global work_struct for draining per-cpu pages

As suggested by Vlastimil Babka and Tejun Heo, this patch uses static
per-cpu work_structs to co-ordinate the draining of per-cpu pages on the
workqueue. Only one task can drain at a time, but this is better than the
previous scheme that allowed multiple tasks to send IPIs concurrently.

One consideration is whether parallel requests should synchronise against
each other. This patch does not synchronise for a global drain. The common
case for such callers is expected to be multiple parallel direct reclaimers
competing for pages when the watermark is close to min. Draining the
per-cpu list is unlikely to make much progress and serialising the drain
is of dubious merit in that case. Drains are synchronised for callers such
as memory hotplug and CMA that care about the drain being complete when
the function returns.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e87508ffa759..da6be2a5ff7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -92,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
 EXPORT_SYMBOL(latent_entropy);
@@ -2351,7 +2355,6 @@ static void drain_local_pages_wq(struct work_struct *work)
  */
 void drain_all_pages(struct zone *zone)
 {
-	struct work_struct __percpu *works;
 	int cpu;
 
 	/*
@@ -2365,11 +2368,21 @@ void drain_all_pages(struct zone *zone)
 		return;
 
 	/*
+	 * Do not drain if one is already in progress unless it's specific to
+	 * a zone. Such callers are primarily CMA and memory hotplug and need
+	 * the drain to be complete when the call returns.
+	 */
+	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+		if (!zone)
+			return;
+		mutex_lock(&pcpu_drain_mutex);
+	}
+
+	/*
 	 * As this can be called from reclaim context, do not reenter reclaim.
 	 * An allocation failure can be handled, it's simply slower
 	 */
 	get_online_cpus();
-	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
 
 	/*
 	 * We don't care about racing with CPU hotplug event
@@ -2402,24 +2415,16 @@ void drain_all_pages(struct zone *zone)
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
 
-	if (works) {
-		for_each_cpu(cpu, &cpus_with_pcps) {
-			struct work_struct *work = per_cpu_ptr(works, cpu);
-			INIT_WORK(work, drain_local_pages_wq);
-			schedule_work_on(cpu, work);
-		}
-		for_each_cpu(cpu, &cpus_with_pcps)
-			flush_work(per_cpu_ptr(works, cpu));
-	} else {
-		for_each_cpu(cpu, &cpus_with_pcps) {
-			struct work_struct work;
-
-			INIT_WORK(&work, drain_local_pages_wq);
-			schedule_work_on(cpu, &work);
-			flush_work(&work);
-		}
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+		INIT_WORK(work, drain_local_pages_wq);
+		schedule_work_on(cpu, work);
 	}
+	for_each_cpu(cpu, &cpus_with_pcps)
+		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
 	put_online_cpus();
+	mutex_unlock(&pcpu_drain_mutex);
 }
 
 #ifdef CONFIG_HIBERNATION

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-24 23:54                 ` Mel Gorman
@ 2017-01-25  2:02                   ` Tejun Heo
  2017-01-25  8:30                     ` Mel Gorman
  0 siblings, 1 reply; 25+ messages in thread
From: Tejun Heo @ 2017-01-25  2:02 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

Hello,

On Tue, Jan 24, 2017 at 11:54:57PM +0000, Mel Gorman wrote:
> @@ -2402,24 +2415,16 @@ void drain_all_pages(struct zone *zone)
>  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
>  	}
>  
> +	for_each_cpu(cpu, &cpus_with_pcps) {
> +		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
> +		INIT_WORK(work, drain_local_pages_wq);
> +		schedule_work_on(cpu, work);
>  	}
> +	for_each_cpu(cpu, &cpus_with_pcps)
> +		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
> +
>  	put_online_cpus();
> +	mutex_unlock(&pcpu_drain_mutex);

Looks good to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-25  2:02                   ` Tejun Heo
@ 2017-01-25  8:30                     ` Mel Gorman
  0 siblings, 0 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-25  8:30 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vlastimil Babka, Andrew Morton, Linux Kernel, Linux-MM,
	Hillf Danton, Jesper Dangaard Brouer, Petr Mladek

On Tue, Jan 24, 2017 at 09:02:20PM -0500, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jan 24, 2017 at 11:54:57PM +0000, Mel Gorman wrote:
> > @@ -2402,24 +2415,16 @@ void drain_all_pages(struct zone *zone)
> >  			cpumask_clear_cpu(cpu, &cpus_with_pcps);
> >  	}
> >  
> > +	for_each_cpu(cpu, &cpus_with_pcps) {
> > +		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
> > +		INIT_WORK(work, drain_local_pages_wq);
> > +		schedule_work_on(cpu, work);
> >  	}
> > +	for_each_cpu(cpu, &cpus_with_pcps)
> > +		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
> > +
> >  	put_online_cpus();
> > +	mutex_unlock(&pcpu_drain_mutex);
> 
> Looks good to me.
> 

Thanks Tejun.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context
  2017-01-23 15:39 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v5 Mel Gorman
@ 2017-01-23 15:39 ` Mel Gorman
  0 siblings, 0 replies; 25+ messages in thread
From: Mel Gorman @ 2017-01-23 15:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Kernel, Linux-MM, Vlastimil Babka, Hillf Danton,
	Jesper Dangaard Brouer, Mel Gorman

The per-cpu page allocator can be drained immediately via drain_all_pages()
which sends IPIs to every CPU. In the next patch, the per-cpu allocator
will only be used for interrupt-safe allocations which prevents draining
it from IPI context. This patch uses workqueues to drain the per-cpu
lists instead.

This is slower but no slowdown during intensive reclaim was measured and
the paths that use drain_all_pages() are not that sensitive to performance.
This is particularly true as the path would only be triggered when reclaim
is failing. It also makes some sense to avoid storming a machine with IPIs
when it's under memory pressure. Arguably, it should be further adjusted
so that only one caller at a time is draining pages but it's beyond the
scope of the current patch.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd2ded8b416f..1acdfd80031a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2339,19 +2339,21 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
 
+static void drain_local_pages_wq(struct work_struct *work)
+{
+	drain_local_pages(NULL);
+}
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
  *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
+	struct work_struct __percpu *works;
 	int cpu;
 
 	/*
@@ -2360,6 +2362,17 @@ void drain_all_pages(struct zone *zone)
 	 */
 	static cpumask_t cpus_with_pcps;
 
+	/* Workqueues cannot recurse */
+	if (current->flags & PF_WQ_WORKER)
+		return;
+
+	/*
+	 * As this can be called from reclaim context, do not reenter reclaim.
+	 * An allocation failure can be handled, it's simply slower
+	 */
+	get_online_cpus();
+	works = alloc_percpu_gfp(struct work_struct, GFP_ATOMIC);
+
 	/*
 	 * We don't care about racing with CPU hotplug event
 	 * as offline notification will cause the notified
@@ -2390,8 +2403,25 @@ void drain_all_pages(struct zone *zone)
 		else
 			cpumask_clear_cpu(cpu, &cpus_with_pcps);
 	}
-	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-								zone, 1);
+
+	if (works) {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct *work = per_cpu_ptr(works, cpu);
+			INIT_WORK(work, drain_local_pages_wq);
+			schedule_work_on(cpu, work);
+		}
+		for_each_cpu(cpu, &cpus_with_pcps)
+			flush_work(per_cpu_ptr(works, cpu));
+	} else {
+		for_each_cpu(cpu, &cpus_with_pcps) {
+			struct work_struct work;
+
+			INIT_WORK(&work, drain_local_pages_wq);
+			schedule_work_on(cpu, &work);
+			flush_work(&work);
+		}
+	}
+	put_online_cpus();
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2017-01-25  8:31 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-17  9:29 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v4 Mel Gorman
2017-01-17  9:29 ` [PATCH 1/4] mm, page_alloc: Split buffered_rmqueue Mel Gorman
2017-01-17 18:07   ` Jesper Dangaard Brouer
     [not found]     ` <2df88f73-a32d-4b71-d4de-3a0ad8831d9a@suse.cz>
2017-01-17 20:20       ` Mel Gorman
2017-01-17 21:07         ` Mel Gorman
2017-01-17 21:24           ` Vlastimil Babka
2017-01-17  9:29 ` [PATCH 2/4] mm, page_alloc: Split alloc_pages_nodemask Mel Gorman
2017-01-17  9:29 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
2017-01-20 14:26   ` Vlastimil Babka
2017-01-20 15:26     ` Mel Gorman
2017-01-23 16:29       ` Petr Mladek
2017-01-23 16:50         ` Mel Gorman
2017-01-23 17:03       ` Tejun Heo
2017-01-23 20:04         ` Mel Gorman
2017-01-23 20:55           ` Tejun Heo
2017-01-23 23:04             ` Mel Gorman
2017-01-24 16:07               ` Tejun Heo
2017-01-24 23:54                 ` Mel Gorman
2017-01-25  2:02                   ` Tejun Heo
2017-01-25  8:30                     ` Mel Gorman
2017-01-24 11:08   ` Vlastimil Babka
2017-01-17  9:29 ` [PATCH 4/4] mm, page_alloc: Only use per-cpu allocator for irq-safe requests Mel Gorman
2017-01-20 15:02   ` Vlastimil Babka
2017-01-23 11:17     ` Mel Gorman
2017-01-23 15:39 [PATCH 0/4] Use per-cpu allocator for !irq requests and prepare for a bulk allocator v5 Mel Gorman
2017-01-23 15:39 ` [PATCH 3/4] mm, page_alloc: Drain per-cpu pages from workqueue context Mel Gorman
