* [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim
@ 2016-04-06 11:22 Mel Gorman
2016-04-06 11:22 ` [PATCH 23/27] mm, vmscan: Add classzone information to tracepoints Mel Gorman
` (4 more replies)
0 siblings, 5 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
As reclaim is now per-node based, convert zone_reclaim to be node_reclaim.
It is possible that a node will be reclaimed multiple times if it has
multiple zones but this is unavoidable without caching all nodes traversed
so far. The documentation and interface to userspace are the same from
a configuration perspective and behaviour will be similar unless the
node-local allocation requests were also limited to lower zones.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
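For reference, a minimal sketch (abbreviated from the get_page_from_freelist()
hunk below, not a literal quote of it) of the allocator-side call after this
conversion. It illustrates why a node with several eligible zones may end up
calling node_reclaim() more than once for the same pgdat:

    for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
                                    ac->nodemask) {
            /* ... the watermark check for this zone has just failed ... */

            if (node_reclaim_mode == 0 ||
                !zone_allows_reclaim(ac->preferred_zone, zone))
                    continue;

            /*
             * Two zones on the same node both reach this point with the
             * same zone->zone_pgdat, so the node can be reclaimed twice.
             */
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            case NODE_RECLAIM_NOSCAN:
            case NODE_RECLAIM_FULL:
                    continue;
            default:
                    /* recheck the watermark and try this zone */
                    break;
            }
    }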
include/linux/mmzone.h | 18 +++++------
include/linux/swap.h | 9 +++---
include/linux/topology.h | 2 +-
kernel/sysctl.c | 4 +--
mm/huge_memory.c | 4 +--
mm/internal.h | 8 ++---
mm/page_alloc.c | 24 ++++++++++-----
mm/vmscan.c | 77 ++++++++++++++++++++++++------------------------
8 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3668df4a69b9..8c4aa4e98783 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -370,14 +370,6 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
-
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -522,7 +514,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum zone_flags {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
@@ -538,6 +529,7 @@ enum pgdat_flags {
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
+ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -686,6 +678,14 @@ typedef struct pglist_data {
*/
unsigned long totalreserve_pages;
+#ifdef CONFIG_NUMA
+ /*
+ * zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8d82d1f9d268..76f0ba627ff7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -333,13 +333,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
{
return 0;
}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..27148ed6bf6a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1476,8 +1476,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d73f1a566ae..39ab35a92e53 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2197,10 +2197,10 @@ static bool khugepaged_scan_abort(int nid)
int i;
/*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!zone_reclaim_mode)
+ if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/internal.h b/mm/internal.h
index 1f91ce702cc5..5417545fd86e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -426,10 +426,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define ZONE_RECLAIM_NOSCAN -2
-#define ZONE_RECLAIM_FULL -1
-#define ZONE_RECLAIM_SOME 0
-#define ZONE_RECLAIM_SUCCESS 1
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e8e518af7a97..46c6a76cacb6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2698,16 +2698,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -5519,9 +5519,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
@@ -6546,6 +6546,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6553,8 +6554,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
@@ -6562,6 +6566,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6569,8 +6574,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a1653fedd88..38325d331aa3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3571,12 +3571,12 @@ module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
- * Zone reclaim mode
+ * Node reclaim mode
*
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3584,14 +3584,14 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
@@ -3617,7 +3617,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
@@ -3628,14 +3628,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
* pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_UNMAP)
- nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
- nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
- if (!(zone_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3645,23 +3645,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
}
/*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
*/
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
+ int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
- .priority = ZONE_RECLAIM_PRIORITY,
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = zone_idx(zone),
+ .reclaim_idx = classzone_idx,
};
cond_resched();
@@ -3675,13 +3676,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (zone_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
- shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+ shrink_node(pgdat, &sc, classzone_idx);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
@@ -3691,49 +3692,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
return sc.nr_reclaimed >= nr_pages;
}
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
- int node_id;
int ret;
/*
- * Zone reclaim reclaims unmapped file backed pages and
+ * Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
- * thrown out if the zone is overallocated. So we do not reclaim
- * if less than a specified percentage of the zone is used by
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
- if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
- return ZONE_RECLAIM_FULL;
+ if (zone_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(zone->zone_pgdat))
- return ZONE_RECLAIM_FULL;
+ if (!pgdat_reclaimable(pgdat))
+ return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
- return ZONE_RECLAIM_NOSCAN;
+ return NODE_RECLAIM_NOSCAN;
/*
- * Only run zone reclaim on the local zone or on zones that do not
+ * Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone_to_nid(zone);
- if (node_state(node_id, N_CPU) && node_id != numa_node_id())
- return ZONE_RECLAIM_NOSCAN;
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
- return ZONE_RECLAIM_NOSCAN;
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
- ret = __zone_reclaim(zone, gfp_mask, order);
- clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
--
2.6.4
* [PATCH 23/27] mm, vmscan: Add classzone information to tracepoints
2016-04-06 11:22 [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
@ 2016-04-06 11:22 ` Mel Gorman
2016-04-06 11:22 ` [PATCH 24/27] mm, page_alloc: Remove fair zone allocation policy Mel Gorman
` (3 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
This is convenient when tracking down why the skip count is high because it'll
show what classzone kswapd woke up at and what zones are being isolated.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
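Purely as an illustration of the new output (the field values below are made
up; only the format comes from the TP_printk strings in this patch), the
affected trace events gain a zid/classzone field:

    mm_vmscan_kswapd_wake: nid=0 zid=2 order=0
    mm_vmscan_lru_isolate: isolate_mode=0 classzone=2 order=0 nr_requested=32 nr_scanned=32 nr_taken=32 file=1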
include/trace/events/vmscan.h | 28 ++++++++++++++++++----------
mm/vmscan.c | 4 ++--
2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 897f1aa1ee5f..3d242fb8910a 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
TRACE_EVENT(mm_vmscan_kswapd_wake,
- TP_PROTO(int nid, int order),
+ TP_PROTO(int nid, int zid, int order),
- TP_ARGS(nid, order),
+ TP_ARGS(nid, zid, order),
TP_STRUCT__entry(
__field( int, nid )
+ __field( int, zid )
__field( int, order )
),
TP_fast_assign(
__entry->nid = nid;
+ __entry->zid = zid;
__entry->order = order;
),
- TP_printk("nid=%d order=%d", __entry->nid, __entry->order)
+ TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
);
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -266,16 +268,18 @@ TRACE_EVENT(mm_shrink_slab_end,
DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
TP_STRUCT__entry(
+ __field(int, classzone_idx)
__field(int, order)
__field(unsigned long, nr_requested)
__field(unsigned long, nr_scanned)
@@ -285,6 +289,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
),
TP_fast_assign(
+ __entry->classzone_idx = classzone_idx;
__entry->order = order;
__entry->nr_requested = nr_requested;
__entry->nr_scanned = nr_scanned;
@@ -293,8 +298,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
__entry->file = file;
),
- TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
+ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
__entry->isolate_mode,
+ __entry->classzone_idx,
__entry->order,
__entry->nr_requested,
__entry->nr_scanned,
@@ -304,27 +310,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
);
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 38325d331aa3..522378c9849a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1417,7 +1417,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (!list_empty(&pages_skipped))
list_splice(&pages_skipped, src);
*nr_scanned = scan;
- trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+ trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
return nr_taken;
}
@@ -3405,7 +3405,7 @@ static int kswapd(void *p)
* Try reclaim the requested order but if that fails
* then try sleeping on the basis of the order reclaimed.
*/
- trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, order);
if (balance_pgdat(pgdat, order, classzone_idx) < order)
goto kswapd_try_sleep;
--
2.6.4
* [PATCH 24/27] mm, page_alloc: Remove fair zone allocation policy
2016-04-06 11:22 [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
2016-04-06 11:22 ` [PATCH 23/27] mm, vmscan: Add classzone information to tracepoints Mel Gorman
@ 2016-04-06 11:22 ` Mel Gorman
2016-04-06 11:22 ` [PATCH 25/27] mm: page_alloc: Cache the last node whose dirty limit is reached Mel Gorman
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
The fair zone allocation policy interleaves allocation requests between
zones to avoid an age inversion problem whereby new pages are reclaimed
to balance a zone. Reclaim is now node-based so this should no longer be
an issue and the fair zone allocation policy is not free. This patch
removes it.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
include/linux/mmzone.h | 2 --
mm/internal.h | 1 -
mm/page_alloc.c | 76 +-------------------------------------------------
mm/vmstat.c | 1 -
4 files changed, 1 insertion(+), 79 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8c4aa4e98783..258d4a11b062 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -117,7 +117,6 @@ struct zone_padding {
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
- NR_ALLOC_BATCH,
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
@@ -515,7 +514,6 @@ struct zone {
enum zone_flags {
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
- ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
enum pgdat_flags {
diff --git a/mm/internal.h b/mm/internal.h
index 5417545fd86e..8726c5acddc7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -460,7 +460,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
-#define ALLOC_FAIR 0x100 /* fair zone allocation */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 46c6a76cacb6..54cfe26dcc66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2399,11 +2399,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
get_pcppage_migratetype(page));
}
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
- if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
- !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
- set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
@@ -2588,40 +2583,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return local_zone->node == zone->node;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return true;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
- struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
- do {
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
- } while (zone++ != preferred_zone);
-}
-
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -2634,11 +2607,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
- int nr_fair_skipped = 0;
- bool zonelist_rescan;
-
-zonelist_scan:
- zonelist_rescan = false;
/*
* Scan zonelist, looking for a zone with enough free.
@@ -2653,20 +2621,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
!cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(ac->preferred_zone, zone))
- break;
- if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- nr_fair_skipped++;
- continue;
- }
- }
- /*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
@@ -2738,27 +2692,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}
- /*
- * The first pass makes sure allocations are spread fairly within the
- * local node. However, the local node might have free pages left
- * after the fairness batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without fairness, and
- * include remote zones now, before entering the slowpath and waking
- * kswapd: prefer spilling to a remote zone over swapping locally.
- */
- if (alloc_flags & ALLOC_FAIR) {
- alloc_flags &= ~ALLOC_FAIR;
- if (nr_fair_skipped) {
- zonelist_rescan = true;
- reset_alloc_batches(ac->preferred_zone);
- }
- if (nr_online_nodes > 1)
- zonelist_rescan = true;
- }
-
- if (zonelist_rescan)
- goto zonelist_scan;
-
return NULL;
}
@@ -3312,7 +3245,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zoneref *preferred_zoneref;
struct page *page = NULL;
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
@@ -5530,9 +5463,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone_seqlock_init(zone);
zone_pcp_init(zone);
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
if (!size)
continue;
@@ -6377,10 +6307,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
spin_unlock_irqrestore(&zone->lock, flags);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 45ecff0f9f9f..2de1f3790548 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -937,7 +937,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
const char * const vmstat_text[] = {
/* enum zone_stat_item countes */
"nr_free_pages",
- "nr_alloc_batch",
"nr_mlock",
"nr_slab_reclaimable",
"nr_slab_unreclaimable",
--
2.6.4
* [PATCH 25/27] mm: page_alloc: Cache the last node whose dirty limit is reached
2016-04-06 11:22 [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
2016-04-06 11:22 ` [PATCH 23/27] mm, vmscan: Add classzone information to tracepoints Mel Gorman
2016-04-06 11:22 ` [PATCH 24/27] mm, page_alloc: Remove fair zone allocation policy Mel Gorman
@ 2016-04-06 11:22 ` Mel Gorman
2016-04-06 11:22 ` [PATCH 26/27] mm: vmstat: Replace __count_zone_vm_events with a zone id equivalent Mel Gorman
2016-04-06 11:22 ` [PATCH 27/27] mm: vmstat: Account per-zone stalls and pages skipped during reclaim Mel Gorman
4 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
If a page is about to be dirtied then the page allocator attempts to limit
the total number of dirty pages that exists in any given zone. The call
to node_dirty_ok is expensive so this patch records if the last pgdat
examined hit the dirty limits. In some cases, this reduces the number
of calls to node_dirty_ok().
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
mm/page_alloc.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54cfe26dcc66..a6e6184d3e38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2607,6 +2607,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
/*
* Scan zonelist, looking for a zone with enough free.
@@ -2639,8 +2640,15 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
- if (ac->spread_dirty_pages && !node_dirty_ok(zone->zone_pgdat))
- continue;
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
--
2.6.4
* [PATCH 26/27] mm: vmstat: Replace __count_zone_vm_events with a zone id equivalent
2016-04-06 11:22 [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
` (2 preceding siblings ...)
2016-04-06 11:22 ` [PATCH 25/27] mm: page_alloc: Cache the last node whose dirty limit is reached Mel Gorman
@ 2016-04-06 11:22 ` Mel Gorman
2016-04-06 11:22 ` [PATCH 27/27] mm: vmstat: Account per-zone stalls and pages skipped during reclaim Mel Gorman
4 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
This is partially a preparation patch for more vmstat work but it also
has the slight advantage that __count_zid_vm_events is cheaper to
calculate than __count_zone_vm_events().
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
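A rough illustration of why the zid form is cheaper (definitions paraphrased
from include/linux/mmzone.h and include/linux/mm.h of this era, not part of
the patch): zone_idx() has to derive the index from the zone pointer, while
the zone id is already encoded in page->flags for the page being allocated:

    /* zone_idx(): pointer arithmetic through zone->zone_pgdat */
    #define zone_idx(zone)	((zone) - (zone)->zone_pgdat->node_zones)

    /* page_zonenum(): a shift and mask on flags the caller already holds */
    static inline enum zone_type page_zonenum(const struct page *page)
    {
            return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
    }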
include/linux/vmstat.h | 5 ++---
mm/page_alloc.c | 2 +-
2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ea00884ac8a0..810914b63564 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -101,9 +101,8 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_vmacache_event(x) do {} while (0)
#endif
-#define __count_zone_vm_events(item, zone, delta) \
- __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
- zone_idx(zone), delta)
+#define __count_zid_vm_events(item, zid, delta) \
+ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
/*
* Zone and node-based page accounting with per cpu differentials.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6e6184d3e38..ef04dc74e7e9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2399,7 +2399,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
get_pcppage_migratetype(page));
}
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
--
2.6.4
* [PATCH 27/27] mm: vmstat: Account per-zone stalls and pages skipped during reclaim
2016-04-06 11:22 [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
` (3 preceding siblings ...)
2016-04-06 11:22 ` [PATCH 26/27] mm: vmstat: Replace __count_zone_vm_events with a zone id equivalent Mel Gorman
@ 2016-04-06 11:22 ` Mel Gorman
4 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-06 11:22 UTC (permalink / raw)
To: Linux-MM; +Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
The vmstat allocstall was fairly useful in the general sense but
node-based LRUs change that. It's important to know if a stall was for an
address-limited allocation request as this will require skipping pages from
other zones. This patch adds pgstall_* counters to replace allocstall. The
sum of the counters will equal the old allocstall so it can be trivially
recalculated. A high number of address-limited allocation requests may
result in a lot of useless LRU scanning for suitable pages.
As address-limited allocations require pages to be skipped, it's important
to know how much useless LRU scanning took place so this patch adds
pgskip* counters. This yields the following model
1. The number of address-space limited stalls can be accounted for (pgstall)
2. The amount of useless work required to reclaim the data is accounted (pgskip)
3. The total number of scans is available from pgscan_kswapd and pgscan_direct
so from that the ratio of useful to useless scans can be calculated.
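As a worked example of point 3 (the exact /proc/vmstat spellings are the
per-zone expansions of the new counters, e.g. pgskip_normal, and depend on
which zones the kernel was built with):

    useless scan fraction = sum(pgskip_*) / (pgscan_kswapd + pgscan_direct)

so, say, 1000 skipped pages against 100000 total pages scanned would mean
roughly 1% of the LRU scanning was useless work caused by address-limited
allocation requests.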
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
include/linux/vm_event_item.h | 4 +++-
mm/vmscan.c | 15 +++++++++++++--
mm/vmstat.c | 3 ++-
3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8dcb5a813163..0a0503da8c3b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -23,6 +23,8 @@
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
+ FOR_ALL_ZONES(PGSTALL),
+ FOR_ALL_ZONES(PGSCAN_SKIP),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
@@ -37,7 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#endif
PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
- PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ PAGEOUTRUN, PGROTATED,
DROP_PAGECACHE, DROP_SLAB,
#ifdef CONFIG_NUMA_BALANCING
NUMA_PTE_UPDATES,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 522378c9849a..d7d664324442 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1372,6 +1372,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
unsigned long scan;
+ unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
@@ -1386,6 +1387,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
+ nr_skipped[page_zonenum(page)]++;
continue;
}
@@ -1414,8 +1416,17 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* scanning would soon rescan the same pages to skip and put the
* system at risk of premature OOM.
*/
- if (!list_empty(&pages_skipped))
+ if (!list_empty(&pages_skipped)) {
+ int zid;
+
list_splice(&pages_skipped, src);
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ if (!nr_skipped[zid])
+ continue;
+
+ __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+ }
+ }
*nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
@@ -2686,7 +2697,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
delayacct_freepages_start();
if (global_reclaim(sc))
- count_vm_event(ALLOCSTALL);
+ __count_zid_vm_events(PGSTALL, classzone_idx, 1);
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2de1f3790548..ee1fb1242c59 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -992,6 +992,8 @@ const char * const vmstat_text[] = {
"pswpout",
TEXTS_FOR_ZONES("pgalloc")
+ TEXTS_FOR_ZONES("pgstall")
+ TEXTS_FOR_ZONES("pgskip")
"pgfree",
"pgactivate",
@@ -1017,7 +1019,6 @@ const char * const vmstat_text[] = {
"kswapd_low_wmark_hit_quickly",
"kswapd_high_wmark_hit_quickly",
"pageoutrun",
- "allocstall",
"pgrotated",
--
2.6.4
* [PATCH 00/27] Move LRU page reclaim from zones to nodes v5
@ 2016-04-15 9:13 Mel Gorman
2016-04-15 9:13 ` [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
0 siblings, 1 reply; 10+ messages in thread
From: Mel Gorman @ 2016-04-15 9:13 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner,
Jesper Dangaard Brouer, LKML, Mel Gorman
Changelog since v4
o Rebase on top of v3 of page allocator optimisation series
Changelog since v3
o Rebase on top of the page allocator optimisation series
o Remove RFC tag
This is the latest version of a series that moves LRUs from the zones to
the node. It is based upon 4.6-rc3 plus the page allocator optimisation
series. Conceptually, this is simple but there are a lot of details. Some
of the broad motivations for this are:
1. The residency of a page partially depends on what zone the page was
allocated from. This is partially combatted by the fair zone allocation
policy but that is a partial solution that introduces overhead in the
page allocator paths.
2. Currently, reclaim on node 0 behaves slightly different to node 1. For
example, direct reclaim scans in zonelist order and reclaims even if
the zone is over the high watermark regardless of the age of pages
in that LRU. Kswapd on the other hand starts reclaim on the highest
unbalanced zone. A difference in distribution of file/anon pages due
to when they were allocated can result in a difference in aging depending
on which node a process was scheduled on. While the fair zone allocation
policy mitigates some of the problems here, the page reclaim results on a
multi-zone node will always be different to those on a single-zone node.
3. kswapd and the page allocator scan zones in the opposite order to
avoid interfering with each other. In the ideal case this stops kswapd
reclaiming pages that the page allocator handed out very recently, but it
is sensitive to timing. While kswapd is reclaiming from the lower zones it
works well, but during the rebalancing of the highest zone the page
allocator and kswapd interfere with each other. It's worse
if the highest zone is small and difficult to balance.
4. slab shrinkers are node-based which makes it harder to identify the exact
relationship between slab reclaim and LRU reclaim.
The reason we have zone-based reclaim is that we used to have
large highmem zones in common configurations and it was necessary
to quickly find ZONE_NORMAL pages for reclaim. Today, this is much
less of a concern as machines with lots of memory will (or should) use
64-bit kernels. Combinations of 32-bit hardware and 64-bit hardware are
rare. Machines that do use highmem should have lower highmem:lowmem
ratios than we worried about in the past.
Conceptually, moving to node LRUs should be easier to understand. The
page allocator plays fewer tricks to game reclaim and reclaim behaves
similarly on all nodes.
It was tested on a UMA (16 cores single socket) and a NUMA machine (48
cores, 2 sockets). In most cases, only the UMA results are presented as
the NUMA machine takes an excessive amount of time to complete tests.
There may be an obvious difference in the number of
allocations from each zone as the fair zone allocation policy is removed
towards the end of the series. In cases where the working set exceeds memory,
the differences will be small but on small workloads it'll be very obvious.
For example, these are the allocation stats on a workload that is doing small
amounts of dd.
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3
DMA allocs 0 0
DMA32 allocs 1961196 0
Normal allocs 3355799 5247180
Movable allocs 0 0
The key reason why this is not a problem is that kswapd will sleep if any
applicable zone for a classzone is free. If it tried to balance all zones
then there would be excessive reclaim.
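A minimal sketch of that check (the helper name here is illustrative rather
than the one the series uses): kswapd can go back to sleep as soon as one
zone usable for the waking classzone has enough free pages, instead of
balancing every zone on the node:

    static bool some_eligible_zone_balanced(pg_data_t *pgdat, int order,
                                            int classzone_idx)
    {
            int zid;

            for (zid = 0; zid <= classzone_idx; zid++) {
                    struct zone *zone = pgdat->node_zones + zid;

                    if (!populated_zone(zone))
                            continue;

                    /* one applicable zone over its watermark is enough */
                    if (zone_watermark_ok_safe(zone, order,
                                               high_wmark_pages(zone),
                                               classzone_idx))
                            return true;
            }

            return false;
    }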
bonnie
------
This was configured to do an IO test with a working set 2*RAM using the
ext4 filesystem. For both machines, there was no significant performance
difference between them but this is the result for the UMA machine
bonnie
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3r10
Hmean SeqOut Char 53306.32 ( 0.00%) 79027.86 ( 48.25%)
Hmean SeqOut Block 87796.15 ( 0.00%) 87881.69 ( 0.10%)
Hmean SeqOut Rewrite 35996.31 ( 0.00%) 36355.59 ( 1.00%)
Hmean SeqIn Char 38789.17 ( 0.00%) 76356.20 ( 96.85%)
Hmean SeqIn Block 105315.39 ( 0.00%) 105514.07 ( 0.19%)
Hmean Random seeks 329.80 ( 0.00%) 334.36 ( 1.38%)
Hmean SeqCreate ops 4.62 ( 0.00%) 4.62 ( 0.00%)
Hmean SeqCreate read 4.62 ( 0.00%) 4.62 ( 0.00%)
Hmean SeqCreate del 599.29 ( 0.00%) 1580.23 (163.68%)
Hmean RandCreate ops 5.00 ( 0.00%) 5.00 ( 0.00%)
Hmean RandCreate read 5.00 ( 0.00%) 4.62 ( -7.69%)
Hmean RandCreate del 629.51 ( 0.00%) 1634.55 (159.66%)
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3r10
User 2049.02 1078.82
System 294.25 181.00
Elapsed 6960.58 6021.58
Note that the massive gains shown here are possibly an anomaly. It has been noted
that in some cases, bonnie gets an artificial boost due to dumb reclaim luck. There
is no guarantee this result would be reproducible on the same machine let alone
any other machine. That said, the overall VM stats are interesting;
4.5.0-rc3 4.5.0-rc3
mmotm-20160209 nodelru-v2
Swap Ins 14 0
Swap Outs 873 0
DMA allocs 0 0
DMA32 allocs 38259888 36320496
Normal allocs 64762073 66488556
Movable allocs 0 0
Allocation stalls 3584 0
Direct pages scanned 736769 0
Kswapd pages scanned 77818637 78836064
Kswapd pages reclaimed 77782378 78812260
Direct pages reclaimed 736548 0
Kswapd efficiency 99% 99%
Kswapd velocity 11179.907 13092.256
Direct efficiency 99% 100%
Direct velocity 105.849 0.000
The series does not swap the workload and it never stalls on direct reclaim. There
is a slight increase in kswapd scans but it's offset by the elimination of direct
scans and the overall scanning velocity is not noticeably higher. While it's not
reported here, the overall IO stats and CPU usage over time are very similar. kswapd
CPU usage is slightly elevated (0.5% usage to roughly 1.2% usage over time) but
that is acceptable given the lack of direct reclaim.
tiobench
--------
tiobench is a flawed benchmark but it's very important in this case. tiobench
benefited from a bug prior to the fair zone allocation policy that allowed
old pages to be artificially preserved. The visible impact was that performance
exceeded the physical capabilities of the disk. With this patch applied the results are
tiobench Throughput
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3
Hmean PotentialReadSpeed 85.84 ( 0.00%) 86.20 ( 0.42%)
Hmean SeqRead-MB/sec-1 84.48 ( 0.00%) 84.60 ( 0.14%)
Hmean SeqRead-MB/sec-2 75.69 ( 0.00%) 75.44 ( -0.34%)
Hmean SeqRead-MB/sec-4 77.35 ( 0.00%) 77.62 ( 0.35%)
Hmean SeqRead-MB/sec-8 68.29 ( 0.00%) 68.58 ( 0.43%)
Hmean SeqRead-MB/sec-16 62.82 ( 0.00%) 62.72 ( -0.15%)
Hmean RandRead-MB/sec-1 0.93 ( 0.00%) 0.88 ( -4.69%)
Hmean RandRead-MB/sec-2 1.11 ( 0.00%) 1.08 ( -3.20%)
Hmean RandRead-MB/sec-4 1.52 ( 0.00%) 1.48 ( -2.86%)
Hmean RandRead-MB/sec-8 1.70 ( 0.00%) 1.70 ( -0.26%)
Hmean RandRead-MB/sec-16 1.96 ( 0.00%) 1.91 ( -2.49%)
Hmean SeqWrite-MB/sec-1 83.01 ( 0.00%) 83.07 ( 0.07%)
Hmean SeqWrite-MB/sec-2 77.80 ( 0.00%) 78.20 ( 0.52%)
Hmean SeqWrite-MB/sec-4 81.68 ( 0.00%) 81.72 ( 0.05%)
Hmean SeqWrite-MB/sec-8 78.17 ( 0.00%) 78.41 ( 0.31%)
Hmean SeqWrite-MB/sec-16 80.08 ( 0.00%) 80.08 ( 0.01%)
Hmean RandWrite-MB/sec-1 1.17 ( 0.00%) 1.17 ( -0.03%)
Hmean RandWrite-MB/sec-2 1.02 ( 0.00%) 1.06 ( 4.21%)
Hmean RandWrite-MB/sec-4 1.02 ( 0.00%) 1.04 ( 2.32%)
Hmean RandWrite-MB/sec-8 0.95 ( 0.00%) 0.97 ( 1.75%)
Hmean RandWrite-MB/sec-16 0.95 ( 0.00%) 0.96 ( 0.97%)
Note that the performance is almost identical allowing us to conclude that
the correct reclaim behaviour granted by the fair zone allocation policy
is preserved.
stutter
-------
stutter simulates a simple workload. One part uses a lot of anonymous
memory, a second measures mmap latency and a third copies a large file.
The primary metric is checking for mmap latency.
stutter
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3
Min mmap 13.4442 ( 0.00%) 13.6705 ( -1.68%)
1st-qrtle mmap 38.0442 ( 0.00%) 37.7842 ( 0.68%)
2nd-qrtle mmap 78.5109 ( 0.00%) 40.3648 ( 48.59%)
3rd-qrtle mmap 86.7806 ( 0.00%) 46.2499 ( 46.70%)
Max-90% mmap 89.7028 ( 0.00%) 86.5790 ( 3.48%)
Max-93% mmap 90.6776 ( 0.00%) 89.5367 ( 1.26%)
Max-95% mmap 91.1678 ( 0.00%) 90.3138 ( 0.94%)
Max-99% mmap 92.0036 ( 0.00%) 93.2003 ( -1.30%)
Max mmap 167.0073 ( 0.00%) 94.5935 ( 43.36%)
Mean mmap 68.7672 ( 0.00%) 48.9853 ( 28.77%)
Best99%Mean mmap 68.5246 ( 0.00%) 48.5354 ( 29.17%)
Best95%Mean mmap 67.5540 ( 0.00%) 46.7102 ( 30.86%)
Best90%Mean mmap 66.2798 ( 0.00%) 44.3547 ( 33.08%)
Best50%Mean mmap 50.7730 ( 0.00%) 37.1298 ( 26.87%)
Best10%Mean mmap 35.8311 ( 0.00%) 33.6910 ( 5.97%)
Best5%Mean mmap 34.0159 ( 0.00%) 31.4259 ( 7.61%)
Best1%Mean mmap 22.1306 ( 0.00%) 24.8851 (-12.45%)
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3r10
User 1.51 0.97
System 138.03 122.58
Elapsed 2420.90 2394.80
The VM stats in this case were not that interesting and are very roughly comparable.
Page allocator intensive workloads showed few differences as the cost
of the fair zone allocation policy does not dominate from a userspace
perspective but a microbench of just the allocator shows a difference
4.6.0-rc1 4.6.0-rc1
vanilla nodelru-v3
Min total-odr0-1 725.00 ( 0.00%) 697.00 ( 3.86%)
Min total-odr0-2 559.00 ( 0.00%) 527.00 ( 5.72%)
Min total-odr0-4 459.00 ( 0.00%) 436.00 ( 5.01%)
Min total-odr0-8 403.00 ( 0.00%) 391.00 ( 2.98%)
Min total-odr0-16 329.00 ( 0.00%) 366.00 (-11.25%)
Min total-odr0-32 365.00 ( 0.00%) 355.00 ( 2.74%)
Min total-odr0-64 297.00 ( 0.00%) 348.00 (-17.17%)
Min total-odr0-128 752.00 ( 0.00%) 344.00 ( 54.26%)
Min total-odr0-256 385.00 ( 0.00%) 379.00 ( 1.56%)
Min total-odr0-512 899.00 ( 0.00%) 414.00 ( 53.95%)
Min total-odr0-1024 763.00 ( 0.00%) 530.00 ( 30.54%)
Min total-odr0-2048 982.00 ( 0.00%) 469.00 ( 52.24%)
Min total-odr0-4096 928.00 ( 0.00%) 526.00 ( 43.32%)
Min total-odr0-8192 1007.00 ( 0.00%) 768.00 ( 23.73%)
Min total-odr0-16384 375.00 ( 0.00%) 366.00 ( 2.40%)
This series is not without its hazards. There are at least three areas
that I'm concerned with even though I could not reproduce any problems in
those areas.
1. Reclaim/compaction is going to be affected because the amount of reclaim is
no longer targeted at a specific zone. Compaction works on a per-zone basis
so there is no guarantee that reclaiming a few THPs' worth of pages will
have a positive impact on compaction success rates.
2. The Slab/LRU reclaim ratio is affected because the frequency the shrinkers
are called is now different. This may or may not be a problem but if it
is, it'll be because shrinkers are not called enough and some balancing
is required.
3. The anon/file reclaim ratio may be affected. Pages about to be dirtied are
distributed between zones and the fair zone allocation policy used to do
something very similar for anon. The distribution is now different but not
necessarily in any way that matters but it's still worth bearing in mind.
Documentation/cgroup-v1/memcg_test.txt | 4 +-
Documentation/cgroup-v1/memory.txt | 4 +-
arch/s390/appldata/appldata_mem.c | 2 +-
arch/tile/mm/pgtable.c | 18 +-
drivers/base/node.c | 73 +--
drivers/staging/android/lowmemorykiller.c | 12 +-
fs/fs-writeback.c | 4 +-
fs/fuse/file.c | 8 +-
fs/nfs/internal.h | 2 +-
fs/nfs/write.c | 2 +-
fs/proc/meminfo.c | 14 +-
include/linux/backing-dev.h | 2 +-
include/linux/memcontrol.h | 30 +-
include/linux/mm_inline.h | 4 +-
include/linux/mm_types.h | 2 +-
include/linux/mmzone.h | 156 +++---
include/linux/swap.h | 13 +-
include/linux/topology.h | 2 +-
include/linux/vm_event_item.h | 14 +-
include/linux/vmstat.h | 111 +++-
include/linux/writeback.h | 2 +-
include/trace/events/vmscan.h | 40 +-
include/trace/events/writeback.h | 10 +-
kernel/power/snapshot.c | 10 +-
kernel/sysctl.c | 4 +-
mm/backing-dev.c | 14 +-
mm/compaction.c | 24 +-
mm/filemap.c | 14 +-
mm/huge_memory.c | 14 +-
mm/internal.h | 11 +-
mm/memcontrol.c | 235 ++++-----
mm/memory-failure.c | 4 +-
mm/memory_hotplug.c | 7 +-
mm/mempolicy.c | 2 +-
mm/migrate.c | 35 +-
mm/mlock.c | 12 +-
mm/page-writeback.c | 119 ++---
mm/page_alloc.c | 289 +++++-----
mm/page_idle.c | 4 +-
mm/rmap.c | 15 +-
mm/shmem.c | 12 +-
mm/swap.c | 66 +--
mm/swap_state.c | 4 +-
mm/util.c | 4 +-
mm/vmscan.c | 847 ++++++++++++++----------------
mm/vmstat.c | 369 ++++++++++---
mm/workingset.c | 53 +-
47 files changed, 1476 insertions(+), 1221 deletions(-)
--
2.6.4
* [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim
2016-04-15 9:13 [PATCH 00/27] Move LRU page reclaim from zones to nodes v5 Mel Gorman
@ 2016-04-15 9:13 ` Mel Gorman
0 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-04-15 9:13 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner,
Jesper Dangaard Brouer, LKML, Mel Gorman
As reclaim is now per-node based, convert zone_reclaim to be node_reclaim.
It is possible that a node will be reclaimed multiple times if it has
multiple zones but this is unavoidable without caching all nodes traversed
so far. The documentation and interface to userspace are the same from
a configuration perspective and behaviour will be similar unless the
node-local allocation requests were also limited to lower zones.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
include/linux/mmzone.h | 18 +++++------
include/linux/swap.h | 9 +++---
include/linux/topology.h | 2 +-
kernel/sysctl.c | 4 +--
mm/huge_memory.c | 4 +--
mm/internal.h | 8 ++---
mm/page_alloc.c | 24 ++++++++++-----
mm/vmscan.c | 77 ++++++++++++++++++++++++------------------------
8 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8ffe0208d9c7..53dd1a6aa444 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -363,14 +363,6 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
-
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -515,7 +507,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum zone_flags {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
@@ -531,6 +522,7 @@ enum pgdat_flags {
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
+ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -679,6 +671,14 @@ typedef struct pglist_data {
*/
unsigned long totalreserve_pages;
+#ifdef CONFIG_NUMA
+ /*
+ * zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index aa566cec54fb..1eefd583cdfb 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -333,13 +333,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
{
return 0;
}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..27148ed6bf6a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1476,8 +1476,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d73f1a566ae..39ab35a92e53 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2197,10 +2197,10 @@ static bool khugepaged_scan_abort(int nid)
int i;
/*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!zone_reclaim_mode)
+ if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/internal.h b/mm/internal.h
index 5f8914e7af39..ec08fdfc04fe 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -427,10 +427,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define ZONE_RECLAIM_NOSCAN -2
-#define ZONE_RECLAIM_FULL -1
-#define ZONE_RECLAIM_SOME 0
-#define ZONE_RECLAIM_SUCCESS 1
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 01b92a22b53c..fa6534bc4e98 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2983,16 +2983,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -5780,9 +5780,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
@@ -6807,6 +6807,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6814,8 +6815,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
@@ -6823,6 +6827,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6830,8 +6835,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1931e98d2f67..dfd8bdfadfe4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3568,12 +3568,12 @@ module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
- * Zone reclaim mode
+ * Node reclaim mode
*
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3581,14 +3581,14 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
@@ -3614,7 +3614,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
@@ -3625,14 +3625,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
* pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_UNMAP)
- nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
- nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
- if (!(zone_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3642,23 +3642,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
}
/*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
*/
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
+ int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
- .priority = ZONE_RECLAIM_PRIORITY,
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = zone_idx(zone),
+ .reclaim_idx = classzone_idx,
};
cond_resched();
@@ -3672,13 +3673,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (zone_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
- shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+ shrink_node(pgdat, &sc, classzone_idx);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
@@ -3688,49 +3689,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
return sc.nr_reclaimed >= nr_pages;
}
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
- int node_id;
int ret;
/*
- * Zone reclaim reclaims unmapped file backed pages and
+ * Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
- * thrown out if the zone is overallocated. So we do not reclaim
- * if less than a specified percentage of the zone is used by
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
- if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
- return ZONE_RECLAIM_FULL;
+ if (zone_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(zone->zone_pgdat))
- return ZONE_RECLAIM_FULL;
+ if (!pgdat_reclaimable(pgdat))
+ return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
- return ZONE_RECLAIM_NOSCAN;
+ return NODE_RECLAIM_NOSCAN;
/*
- * Only run zone reclaim on the local zone or on zones that do not
+ * Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone_to_nid(zone);
- if (node_state(node_id, N_CPU) && node_id != numa_node_id())
- return ZONE_RECLAIM_NOSCAN;
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
- return ZONE_RECLAIM_NOSCAN;
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
- ret = __zone_reclaim(zone, gfp_mask, order);
- clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
--
2.6.4
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 00/27] Move LRU page reclaim from zones to nodes v6
@ 2016-06-09 18:04 Mel Gorman
2016-06-09 18:04 ` [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
0 siblings, 1 reply; 10+ messages in thread
From: Mel Gorman @ 2016-06-09 18:04 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
This is only lightly tested as I've had stability problems during boot
that have nothing to do with the series. It's based on mmots as of June
6th. Very little has changed with the big exception of "mm, vmscan:
Move LRU lists to node" because it had to adapt to per-zone changes in
should_reclaim_retry and compaction_zonelist_suitable.
Changelog since v5
o Rebase and adjust to changes
Changelog since v4
o Rebase on top of v3 of page allocator optimisation series
Changelog since v3
o Rebase on top of the page allocator optimisation series
o Remove RFC tag
This is the latest version of a series that moves LRUs from the zones to
the node that is based upon 4.6-rc3 plus the page allocator optimisation
series. Conceptually, this is simple but there are a lot of details. Some
of the broad motivations for this are;
1. The residency of a page partially depends on what zone the page was
allocated from. This is partially combatted by the fair zone allocation
policy but that is a partial solution that introduces overhead in the
page allocator paths.
2. Currently, reclaim on node 0 behaves slightly different to node 1. For
example, direct reclaim scans in zonelist order and reclaims even if
the zone is over the high watermark regardless of the age of pages
in that LRU. Kswapd on the other hand starts reclaim on the highest
unbalanced zone. A difference in distribution of file/anon pages due
to when they were allocated can result in a difference in aging. While
the fair zone allocation policy mitigates some of the problems here, the
page reclaim results on a multi-zone node will always be different to a
single-zone node.
3. kswapd and the page allocator scan zones in the opposite order to
avoid interfering with each other but it's sensitive to timing. This
mitigates the page allocator using pages that were allocated very recently
in the ideal case but it's sensitive to timing. When kswapd is allocating
from lower zones then it's great but during the rebalancing of the highest
zone, the page allocator and kswapd interfere with each other. It's worse
if the highest zone is small and difficult to balance.
4. slab shrinkers are node-based which makes it harder to identify the exact
relationship between slab reclaim and LRU reclaim.
The reason we have zone-based reclaim is that we used to have
large highmem zones in common configurations and it was necessary
to quickly find ZONE_NORMAL pages for reclaim. Today, this is much
less of a concern as machines with lots of memory will (or should) use
64-bit kernels. Combinations of 32-bit hardware and 64-bit hardware are
rare. Machines that do use highmem should have lower highmem:lowmem ratios
than we worried about in the past.
Conceptually, moving to node LRUs should be easier to understand. The
page allocator plays fewer tricks to game reclaim and reclaim behaves
similarly on all nodes.
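For anyone jumping straight to patch 22 in this thread: reclaim now operates
on the pgdat rather than the zone, so during the zonelist walk a node backed
by several zones can be handed to node_reclaim() once per zone. The following
is a minimal userspace sketch of that behaviour, using simplified stand-in
structures rather than the real kernel types:
/* Sketch only: models the get_page_from_freelist() walk after this series. */
#include <stdio.h>

#define NODE_RECLAIM_NOSCAN	-2
#define NODE_RECLAIM_FULL	-1
#define NODE_RECLAIM_SOME	0
#define NODE_RECLAIM_SUCCESS	1

struct pglist_data {
	int node_id;
	int reclaim_attempts;	/* how often this node was handed to reclaim */
};

struct zone {
	const char *name;
	struct pglist_data *zone_pgdat;
};

/* Stand-in for node_reclaim(): note it takes the pgdat, not the zone */
static int node_reclaim(struct pglist_data *pgdat)
{
	pgdat->reclaim_attempts++;
	return NODE_RECLAIM_SUCCESS;
}

int main(void)
{
	struct pglist_data node0 = { 0, 0 };
	/* two zones (say DMA32 and Normal) backed by the same node */
	struct zone zonelist[] = {
		{ "Normal", &node0 },
		{ "DMA32",  &node0 },
	};
	unsigned int i;

	/*
	 * Without caching which nodes were already tried, the walk calls
	 * node_reclaim() once per zone of the same node -- the duplication
	 * the changelog of patch 22 describes as unavoidable.
	 */
	for (i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++)
		node_reclaim(zonelist[i].zone_pgdat);

	printf("node %d reclaim attempts: %d\n",
	       node0.node_id, node0.reclaim_attempts);
	return 0;
}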
The series got basic testing this time on a UMA machine. The page allocator
microbenchmark highlights the gain from removing the fair zone allocation
policy
4.7.0-rc2 4.7.0-rc2
mmotm-20160606 nodelru-v6r2
Min total-odr0-1 500.00 ( 0.00%) 475.00 ( 5.00%)
Min total-odr0-2 358.00 ( 0.00%) 343.00 ( 4.19%)
Min total-odr0-4 292.00 ( 0.00%) 279.00 ( 4.45%)
Min total-odr0-8 253.00 ( 0.00%) 242.00 ( 4.35%)
Min total-odr0-16 275.00 ( 0.00%) 226.00 ( 17.82%)
Min total-odr0-32 225.00 ( 0.00%) 215.00 ( 4.44%)
Min total-odr0-64 219.00 ( 0.00%) 210.00 ( 4.11%)
Min total-odr0-128 216.00 ( 0.00%) 207.00 ( 4.17%)
Min total-odr0-256 243.00 ( 0.00%) 246.00 ( -1.23%)
Min total-odr0-512 276.00 ( 0.00%) 265.00 ( 3.99%)
Min total-odr0-1024 290.00 ( 0.00%) 287.00 ( 1.03%)
Min total-odr0-2048 303.00 ( 0.00%) 296.00 ( 2.31%)
Min total-odr0-4096 312.00 ( 0.00%) 310.00 ( 0.64%)
Min total-odr0-8192 320.00 ( 0.00%) 308.00 ( 3.75%)
Min total-odr0-16384 320.00 ( 0.00%) 308.00 ( 3.75%)
Min total-odr1-1 737.00 ( 0.00%) 707.00 ( 4.07%)
Min total-odr1-2 547.00 ( 0.00%) 521.00 ( 4.75%)
Min total-odr1-4 620.00 ( 0.00%) 418.00 ( 32.58%)
Min total-odr1-8 386.00 ( 0.00%) 367.00 ( 4.92%)
Min total-odr1-16 361.00 ( 0.00%) 340.00 ( 5.82%)
Min total-odr1-32 352.00 ( 0.00%) 328.00 ( 6.82%)
Min total-odr1-64 345.00 ( 0.00%) 324.00 ( 6.09%)
Min total-odr1-128 347.00 ( 0.00%) 328.00 ( 5.48%)
Min total-odr1-256 347.00 ( 0.00%) 329.00 ( 5.19%)
Min total-odr1-512 354.00 ( 0.00%) 332.00 ( 6.21%)
Min total-odr1-1024 355.00 ( 0.00%) 337.00 ( 5.07%)
Min total-odr1-2048 358.00 ( 0.00%) 345.00 ( 3.63%)
Min total-odr1-4096 360.00 ( 0.00%) 346.00 ( 3.89%)
Min total-odr1-8192 360.00 ( 0.00%) 347.00 ( 3.61%)
A basic IO benchmark based on varying numbers of dd running in parallel
showed nothing interesting other than differences in what zones were
scanned due to the fair zone allocation policy being removed.
This series is not without its hazards. There are at least three areas
that I'm concerned with even though I could not reproduce any problems in
that area.
1. Reclaim/compaction is going to be affected because the amount of reclaim is
no longer targeted at a specific zone. Compaction works on a per-zone basis
so there is no guarantee that reclaiming a few THPs' worth of pages will
have a positive impact on compaction success rates (see the sketch after
this list).
2. The Slab/LRU reclaim ratio is affected because the frequency the shrinkers
are called is now different. This may or may not be a problem but if it
is, it'll be because shrinkers are not called enough and some balancing
is required.
3. The anon/file reclaim ratio may be affected. Pages about to be dirtied are
distributed between zones and the fair zone allocation policy used to do
something very similar for anon. The distribution is now different, not
necessarily in a way that matters, but it's still worth bearing in mind.
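To put hazard 1 in numbers: __node_reclaim() in patch 22 sets its target to
max(1 << order, SWAP_CLUSTER_MAX) pages for the whole node rather than for a
particular zone. A standalone sketch of that calculation, assuming the usual
SWAP_CLUSTER_MAX of 32 (illustration only, not kernel code):
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed usual value */

/* Mirrors sc.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX) in the patch */
static unsigned long node_reclaim_target(unsigned int order)
{
	unsigned long nr_pages = 1UL << order;	/* minimum pages to stay on node */

	return nr_pages > SWAP_CLUSTER_MAX ? nr_pages : SWAP_CLUSTER_MAX;
}

int main(void)
{
	/* order-0 request: bounded below by SWAP_CLUSTER_MAX */
	printf("order-0 target: %lu pages\n", node_reclaim_target(0));
	/* order-9 (x86-64 THP) request: one THP's worth of base pages */
	printf("order-9 target: %lu pages\n", node_reclaim_target(9));
	return 0;
}
An order-9 request therefore asks for roughly 512 pages from anywhere on the
node, with no say over which zone they come from, which is why compaction
success rates might suffer.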
Documentation/cgroup-v1/memcg_test.txt | 4 +-
Documentation/cgroup-v1/memory.txt | 4 +-
arch/s390/appldata/appldata_mem.c | 2 +-
arch/tile/mm/pgtable.c | 18 +-
drivers/base/node.c | 73 +--
drivers/staging/android/lowmemorykiller.c | 12 +-
fs/fs-writeback.c | 4 +-
fs/fuse/file.c | 8 +-
fs/nfs/internal.h | 2 +-
fs/nfs/write.c | 2 +-
fs/proc/meminfo.c | 14 +-
include/linux/backing-dev.h | 2 +-
include/linux/memcontrol.h | 30 +-
include/linux/mm_inline.h | 2 +-
include/linux/mm_types.h | 2 +-
include/linux/mmzone.h | 157 +++---
include/linux/swap.h | 15 +-
include/linux/topology.h | 2 +-
include/linux/vm_event_item.h | 14 +-
include/linux/vmstat.h | 111 +++-
include/linux/writeback.h | 2 +-
include/trace/events/vmscan.h | 40 +-
include/trace/events/writeback.h | 10 +-
kernel/power/snapshot.c | 10 +-
kernel/sysctl.c | 4 +-
mm/backing-dev.c | 15 +-
mm/compaction.c | 39 +-
mm/filemap.c | 14 +-
mm/huge_memory.c | 33 +-
mm/internal.h | 11 +-
mm/memcontrol.c | 235 ++++-----
mm/memory-failure.c | 4 +-
mm/memory_hotplug.c | 7 +-
mm/mempolicy.c | 2 +-
mm/migrate.c | 35 +-
mm/mlock.c | 12 +-
mm/page-writeback.c | 124 +++--
mm/page_alloc.c | 271 +++++-----
mm/page_idle.c | 4 +-
mm/rmap.c | 15 +-
mm/shmem.c | 12 +-
mm/swap.c | 66 +--
mm/swap_state.c | 4 +-
mm/util.c | 4 +-
mm/vmscan.c | 829 +++++++++++++++---------------
mm/vmstat.c | 374 +++++++++++---
mm/workingset.c | 52 +-
47 files changed, 1489 insertions(+), 1217 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim
2016-06-09 18:04 [PATCH 00/27] Move LRU page reclaim from zones to nodes v6 Mel Gorman
@ 2016-06-09 18:04 ` Mel Gorman
2016-06-17 10:55 ` Vlastimil Babka
0 siblings, 1 reply; 10+ messages in thread
From: Mel Gorman @ 2016-06-09 18:04 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
As reclaim is now per-node based, convert zone_reclaim to be node_reclaim.
It is possible that a node will be reclaimed multiple times if it has
multiple zones but this is unavoidable without caching all nodes traversed
so far. The documentation and interface to userspace is the same from
a configuration perspective and will be similar in behaviour unless
the node-local allocation requests were also limited to lower zones.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
include/linux/mmzone.h | 18 +++++------
include/linux/swap.h | 9 +++---
include/linux/topology.h | 2 +-
kernel/sysctl.c | 4 +--
mm/huge_memory.c | 4 +--
mm/internal.h | 8 ++---
mm/page_alloc.c | 24 ++++++++++-----
mm/vmscan.c | 77 ++++++++++++++++++++++++------------------------
8 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9cc75f88bb8e..d826d203185e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -366,14 +366,6 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
-
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -518,7 +510,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum zone_flags {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
@@ -534,6 +525,7 @@ enum pgdat_flags {
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
+ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -682,6 +674,14 @@ typedef struct pglist_data {
*/
unsigned long totalreserve_pages;
+#ifdef CONFIG_NUMA
+ /*
+ * zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5d1eac6259d2..bae1615e698d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -334,13 +334,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
{
return 0;
}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c368db5cadca..0512b863a441 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2227,10 +2227,10 @@ static bool khugepaged_scan_abort(int nid)
int i;
/*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!zone_reclaim_mode)
+ if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/internal.h b/mm/internal.h
index d5e4e7db141d..5231344a9e52 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -434,10 +434,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define ZONE_RECLAIM_NOSCAN -2
-#define ZONE_RECLAIM_FULL -1
-#define ZONE_RECLAIM_SOME 0
-#define ZONE_RECLAIM_SUCCESS 1
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8faeb0cb21f0..a249128999a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2947,16 +2947,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -5948,9 +5948,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
@@ -6928,6 +6928,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6935,8 +6936,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
@@ -6944,6 +6948,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6951,8 +6956,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b9cff9047ac0..4e6bb656afb3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3534,12 +3534,12 @@ module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
- * Zone reclaim mode
+ * Node reclaim mode
*
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3547,14 +3547,14 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
@@ -3580,7 +3580,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
@@ -3591,14 +3591,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
* pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_UNMAP)
- nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
- nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
- if (!(zone_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3608,23 +3608,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
}
/*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
*/
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
+ int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
- .priority = ZONE_RECLAIM_PRIORITY,
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = zone_idx(zone),
+ .reclaim_idx = classzone_idx,
};
cond_resched();
@@ -3638,13 +3639,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (zone_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
- shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+ shrink_node(pgdat, &sc, classzone_idx);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
@@ -3654,49 +3655,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
return sc.nr_reclaimed >= nr_pages;
}
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
- int node_id;
int ret;
/*
- * Zone reclaim reclaims unmapped file backed pages and
+ * Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
- * thrown out if the zone is overallocated. So we do not reclaim
- * if less than a specified percentage of the zone is used by
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
- if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
- return ZONE_RECLAIM_FULL;
+ if (zone_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(zone->zone_pgdat))
- return ZONE_RECLAIM_FULL;
+ if (!pgdat_reclaimable(pgdat))
+ return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
- return ZONE_RECLAIM_NOSCAN;
+ return NODE_RECLAIM_NOSCAN;
/*
- * Only run zone reclaim on the local zone or on zones that do not
+ * Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone_to_nid(zone);
- if (node_state(node_id, N_CPU) && node_id != numa_node_id())
- return ZONE_RECLAIM_NOSCAN;
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
- return ZONE_RECLAIM_NOSCAN;
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
- ret = __zone_reclaim(zone, gfp_mask, order);
- clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
--
2.6.4
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim
2016-06-09 18:04 ` [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
@ 2016-06-17 10:55 ` Vlastimil Babka
0 siblings, 0 replies; 10+ messages in thread
From: Vlastimil Babka @ 2016-06-17 10:55 UTC (permalink / raw)
To: Mel Gorman, Andrew Morton, Linux-MM; +Cc: Rik van Riel, Johannes Weiner, LKML
On 06/09/2016 08:04 PM, Mel Gorman wrote:
> As reclaim is now per-node based, convert zone_reclaim to be node_reclaim.
> It is possible that a node will be reclaimed multiple times if it has
> multiple zones but this is unavoidable without caching all nodes traversed
> so far. The documentation and interface to userspace is the same from
> a configuration perspective and will will be similar in behaviour unless
> the node-local allocation requests were also limited to lower zones.
>
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
[...]
> @@ -682,6 +674,14 @@ typedef struct pglist_data {
> */
> unsigned long totalreserve_pages;
>
> +#ifdef CONFIG_NUMA
> + /*
> + * zone reclaim becomes active if more unmapped pages exist.
node reclaim
> + */
> + unsigned long min_unmapped_pages;
> + unsigned long min_slab_pages;
> +#endif /* CONFIG_NUMA */
> +
> /* Write-intensive fields used from the page allocator */
> ZONE_PADDING(_pad1_)
> spinlock_t lru_lock;
[...]
> @@ -3580,7 +3580,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
> }
>
> /* Work out how many page cache pages we can reclaim in this reclaim_mode */
> -static unsigned long zone_pagecache_reclaimable(struct zone *zone)
> +static unsigned long zone_pagecache_reclaimable(struct pglist_data *pgdat)
Rename to node_pagecache_reclaimable?
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 00/27] Move LRU page reclaim from zones to nodes v7
@ 2016-06-21 14:15 Mel Gorman
2016-06-21 14:16 ` [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim Mel Gorman
0 siblings, 1 reply; 10+ messages in thread
From: Mel Gorman @ 2016-06-21 14:15 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
(sorry for resend, the previous attempt didn't go through fully for
some reason)
The bulk of the updates are in response to review from Vlastimil Babka
and received a lot more testing than v6.
Changelog since v6
o Correct reclaim_idx when direct reclaiming for memcg
o Also account LRU pages per zone for compaction/reclaim
o Add page_pgdat helper with more efficient lookup
o Init pgdat LRU lock only once
o Slight optimisation to wake_all_kswapds
o Always wake kcompactd when kswapd is going to sleep
o Rebase to mmotm as of June 15th, 2016
Changelog since v5
o Rebase and adjust to changes
Changelog since v4
o Rebase on top of v3 of page allocator optimisation series
Changelog since v3
o Rebase on top of the page allocator optimisation series
o Remove RFC tag
This is the latest version of a series that moves LRUs from the zones to
the node that is based upon 4.6-rc3 plus the page allocator optimisation
series. Conceptually, this is simple but there are a lot of details. Some
of the broad motivations for this are;
1. The residency of a page partially depends on what zone the page was
allocated from. This is partially combatted by the fair zone allocation
policy but that is a partial solution that introduces overhead in the
page allocator paths.
2. Currently, reclaim on node 0 behaves slightly different to node 1. For
example, direct reclaim scans in zonelist order and reclaims even if
the zone is over the high watermark regardless of the age of pages
in that LRU. Kswapd on the other hand starts reclaim on the highest
unbalanced zone. A difference in distribution of file/anon pages due
to when they were allocated can result in a difference in aging. While
the fair zone allocation policy mitigates some of the problems here, the
page reclaim results on a multi-zone node will always be different to a
single-zone node.
3. kswapd and the page allocator scan zones in the opposite order to
avoid interfering with each other but it's sensitive to timing. This
mitigates the page allocator using pages that were allocated very recently
in the ideal case but it's sensitive to timing. When kswapd is allocating
from lower zones then it's great but during the rebalancing of the highest
zone, the page allocator and kswapd interfere with each other. It's worse
if the highest zone is small and difficult to balance.
4. slab shrinkers are node-based which makes it harder to identify the exact
relationship between slab reclaim and LRU reclaim.
The reason we have zone-based reclaim is that we used to have
large highmem zones in common configurations and it was necessary
to quickly find ZONE_NORMAL pages for reclaim. Today, this is much
less of a concern as machines with lots of memory will (or should) use
64-bit kernels. Combinations of 32-bit hardware and 64-bit hardware are
rare. Machines that do use highmem should have lower highmem:lowmem ratios
than we worried about in the past.
Conceptually, moving to node LRUs should be easier to understand. The
page allocator plays fewer tricks to game reclaim and reclaim behaves
similarly on all nodes.
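One concrete detail from patch 22 that illustrates the point: the
min_unmapped_pages and min_slab_pages thresholds move from the zone to the
pgdat and are accumulated as the sum of each zone's contribution (see the
free_area_init_core and sysctl handler hunks). A rough userspace model of
that accumulation, with made-up zone sizes and assumed default ratios:
#include <stdio.h>

static int sysctl_min_unmapped_ratio = 1;	/* default, visible in the patch */
static int sysctl_min_slab_ratio = 5;		/* assumed usual default */

struct pgdat_model {
	unsigned long min_unmapped_pages;
	unsigned long min_slab_pages;
};

int main(void)
{
	/* hypothetical managed pages for two zones (DMA32, Normal) on one node */
	unsigned long managed_pages[] = { 262144, 786432 };
	struct pgdat_model pgdat = { 0, 0 };
	unsigned int i;

	/* mirrors pgdat->min_*_pages += <zone contribution> in the patch */
	for (i = 0; i < sizeof(managed_pages) / sizeof(managed_pages[0]); i++) {
		pgdat.min_unmapped_pages +=
			managed_pages[i] * sysctl_min_unmapped_ratio / 100;
		pgdat.min_slab_pages +=
			managed_pages[i] * sysctl_min_slab_ratio / 100;
	}

	printf("node thresholds: %lu unmapped pages, %lu slab pages\n",
	       pgdat.min_unmapped_pages, pgdat.min_slab_pages);
	return 0;
}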
The series has been tested on a 16 core UMA machine and a 2-socket 48 core
NUMA machine. The UMA results are presented in most cases as the NUMA machine
behaved similarly.
pagealloc
---------
This is a microbenchmark that shows the benefit of removing the fair zone
allocation policy. It was tested up to order-4 but only orders 0 and 1 are
shown as the other orders were comparable.
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Min total-odr0-1 485.00 ( 0.00%) 462.00 ( 4.74%)
Min total-odr0-2 354.00 ( 0.00%) 341.00 ( 3.67%)
Min total-odr0-4 285.00 ( 0.00%) 277.00 ( 2.81%)
Min total-odr0-8 249.00 ( 0.00%) 240.00 ( 3.61%)
Min total-odr0-16 230.00 ( 0.00%) 224.00 ( 2.61%)
Min total-odr0-32 222.00 ( 0.00%) 215.00 ( 3.15%)
Min total-odr0-64 216.00 ( 0.00%) 210.00 ( 2.78%)
Min total-odr0-128 214.00 ( 0.00%) 208.00 ( 2.80%)
Min total-odr0-256 248.00 ( 0.00%) 233.00 ( 6.05%)
Min total-odr0-512 277.00 ( 0.00%) 270.00 ( 2.53%)
Min total-odr0-1024 294.00 ( 0.00%) 284.00 ( 3.40%)
Min total-odr0-2048 308.00 ( 0.00%) 298.00 ( 3.25%)
Min total-odr0-4096 318.00 ( 0.00%) 307.00 ( 3.46%)
Min total-odr0-8192 322.00 ( 0.00%) 308.00 ( 4.35%)
Min total-odr0-16384 324.00 ( 0.00%) 309.00 ( 4.63%)
Min total-odr1-1 729.00 ( 0.00%) 686.00 ( 5.90%)
Min total-odr1-2 533.00 ( 0.00%) 520.00 ( 2.44%)
Min total-odr1-4 434.00 ( 0.00%) 415.00 ( 4.38%)
Min total-odr1-8 390.00 ( 0.00%) 364.00 ( 6.67%)
Min total-odr1-16 359.00 ( 0.00%) 335.00 ( 6.69%)
Min total-odr1-32 356.00 ( 0.00%) 327.00 ( 8.15%)
Min total-odr1-64 356.00 ( 0.00%) 321.00 ( 9.83%)
Min total-odr1-128 356.00 ( 0.00%) 333.00 ( 6.46%)
Min total-odr1-256 354.00 ( 0.00%) 337.00 ( 4.80%)
Min total-odr1-512 366.00 ( 0.00%) 340.00 ( 7.10%)
Min total-odr1-1024 373.00 ( 0.00%) 354.00 ( 5.09%)
Min total-odr1-2048 375.00 ( 0.00%) 354.00 ( 5.60%)
Min total-odr1-4096 374.00 ( 0.00%) 354.00 ( 5.35%)
Min total-odr1-8192 370.00 ( 0.00%) 355.00 ( 4.05%)
This shows a steady improvement throughout. The primary benefit is from
reduced system CPU usage which is obvious from the overall times;
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7
User 174.06 174.58
System 2656.78 2485.21
Elapsed 2885.07 2713.67
The vmstats also showed that the fair zone allocation policy was definitely
removed as can be seen here;
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
DMA32 allocs 28794408561 0
Normal allocs 48431969998 77226313470
Movable allocs 0 0
tiobench on ext4
----------------
tiobench is a benchmark that artificially benefits if old pages remain resident
while new pages get reclaimed. The fair zone allocation policy mitigates this
problem so pages age fairly. While the benchmark has problems, it is important
that tiobench performance remains constant as it implies that page aging
problems that the fair zone allocation policy fixes are not re-introduced.
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Min PotentialReadSpeed 90.24 ( 0.00%) 90.14 ( -0.11%)
Min SeqRead-MB/sec-1 80.63 ( 0.00%) 83.09 ( 3.05%)
Min SeqRead-MB/sec-2 71.91 ( 0.00%) 72.44 ( 0.74%)
Min SeqRead-MB/sec-4 75.20 ( 0.00%) 74.32 ( -1.17%)
Min SeqRead-MB/sec-8 65.30 ( 0.00%) 65.21 ( -0.14%)
Min SeqRead-MB/sec-16 62.62 ( 0.00%) 62.12 ( -0.80%)
Min RandRead-MB/sec-1 0.90 ( 0.00%) 0.94 ( 4.44%)
Min RandRead-MB/sec-2 0.96 ( 0.00%) 0.97 ( 1.04%)
Min RandRead-MB/sec-4 1.43 ( 0.00%) 1.41 ( -1.40%)
Min RandRead-MB/sec-8 1.67 ( 0.00%) 1.72 ( 2.99%)
Min RandRead-MB/sec-16 1.77 ( 0.00%) 1.86 ( 5.08%)
Min SeqWrite-MB/sec-1 78.12 ( 0.00%) 79.78 ( 2.12%)
Min SeqWrite-MB/sec-2 72.74 ( 0.00%) 73.23 ( 0.67%)
Min SeqWrite-MB/sec-4 79.40 ( 0.00%) 78.32 ( -1.36%)
Min SeqWrite-MB/sec-8 73.18 ( 0.00%) 71.40 ( -2.43%)
Min SeqWrite-MB/sec-16 75.82 ( 0.00%) 75.24 ( -0.76%)
Min RandWrite-MB/sec-1 1.18 ( 0.00%) 1.15 ( -2.54%)
Min RandWrite-MB/sec-2 1.05 ( 0.00%) 0.99 ( -5.71%)
Min RandWrite-MB/sec-4 1.00 ( 0.00%) 0.96 ( -4.00%)
Min RandWrite-MB/sec-8 0.91 ( 0.00%) 0.92 ( 1.10%)
Min RandWrite-MB/sec-16 0.92 ( 0.00%) 0.92 ( 0.00%)
This shows that the series has little or no impact on tiobench, which is
desirable. It indicates that the fair zone allocation policy was removed
in a manner that didn't reintroduce one class of page aging bug. There
were only minor differences in overall reclaim activity
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Minor Faults 640992 640721
Major Faults 728 623
Swap Ins 0 0
Swap Outs 0 0
DMA allocs 0 0
DMA32 allocs 46174282 44219717
Normal allocs 77949344 79858024
Movable allocs 0 0
Allocation stalls 38 76
Direct pages scanned 17463 34865
Kswapd pages scanned 93331163 93302388
Kswapd pages reclaimed 93328173 93299677
Direct pages reclaimed 17463 34865
Kswapd efficiency 99% 99%
Kswapd velocity 13729.855 13755.612
Direct efficiency 100% 100%
Direct velocity 2.569 5.140
Percentage direct scans 0% 0%
Page writes by reclaim 0 0
Page writes file 0 0
Page writes anon 0 0
Page reclaim immediate 54 36
kswapd activity was roughly comparable. There were slight differences
in direct reclaim activity but they are negligible in the context of the overall
workload (velocity of 5 pages per second with the patches applied, 2 pages
per second in the baseline kernel).
pgbench read-only large configuration on ext4
---------------------------------------------
pgbench is a database benchmark that can be sensitive to page reclaim
decisions. This also checks if removing the fair zone allocation policy
is safe
pgbench Transactions
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Hmean 1 191.00 ( 0.00%) 193.67 ( 1.40%)
Hmean 5 338.59 ( 0.00%) 336.99 ( -0.47%)
Hmean 12 374.03 ( 0.00%) 386.15 ( 3.24%)
Hmean 21 372.24 ( 0.00%) 372.02 ( -0.06%)
Hmean 30 383.98 ( 0.00%) 370.69 ( -3.46%)
Hmean 32 431.01 ( 0.00%) 438.47 ( 1.73%)
Negligible differences again. As with tiobench, overall reclaim activity
was comparable.
bonnie++ on ext4
----------------
No interesting performance difference, negligible differences on reclaim
stats.
paralleldd on ext4
------------------
This workload uses varying numbers of dd instances to read large amounts of
data from disk.
paralleldd
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Amean Elapsd-1 181.57 ( 0.00%) 179.63 ( 1.07%)
Amean Elapsd-3 188.29 ( 0.00%) 183.68 ( 2.45%)
Amean Elapsd-5 188.02 ( 0.00%) 181.73 ( 3.35%)
Amean Elapsd-7 186.07 ( 0.00%) 184.11 ( 1.05%)
Amean Elapsd-12 188.16 ( 0.00%) 183.51 ( 2.47%)
Amean Elapsd-16 189.03 ( 0.00%) 181.27 ( 4.10%)
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
User 1439.23 1433.37
System 8332.31 8216.01
Elapsed 3619.80 3532.69
There is a slight gain in performance, some of which is from the reduced system
CPU usage. There are minor differences in reclaim activity but nothing significant
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Minor Faults 362486 358215
Major Faults 1143 1113
Swap Ins 26 0
Swap Outs 2920 482
DMA allocs 0 0
DMA32 allocs 31568814 28598887
Normal allocs 46539922 49514444
Movable allocs 0 0
Allocation stalls 0 0
Direct pages scanned 0 0
Kswapd pages scanned 40886878 40849710
Kswapd pages reclaimed 40869923 40835207
Direct pages reclaimed 0 0
Kswapd efficiency 99% 99%
Kswapd velocity 11295.342 11563.344
Direct efficiency 100% 100%
Direct velocity 0.000 0.000
Slabs scanned 131673 126099
Direct inode steals 57 60
Kswapd inode steals 762 18
It basically shows that kswapd was active at roughly the same rate in
both kernels. There was also comparable slab scanning activity and direct
reclaim was avoided in both cases. There appears to be a large difference
in the number of inodes reclaimed but the workload has few active inodes so
the difference is likely a timing artifact. It's interesting to note that the node-lru
did not swap in any pages but given the low swap activity, it's unlikely
to be significant.
stutter
-------
stutter simulates a simple workload. One part uses a lot of anonymous
memory, a second measures mmap latency and a third copies a large file.
The primary metric is checking for mmap latency.
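For reference, the mmap latency reported below is essentially the time taken
by an mmap() call while the rest of the workload keeps memory under pressure;
a rough standalone sketch of the measurement loop (not the mmtests
implementation) is:
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>

#define MAP_SIZE (4UL << 20)	/* 4MB per sample, an arbitrary choice */

static double elapsed_usec(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) * 1e6 + (b.tv_nsec - a.tv_nsec) / 1e3;
}

int main(void)
{
	struct timespec start, end;
	int i;

	for (i = 0; i < 10; i++) {
		char *p;

		clock_gettime(CLOCK_MONOTONIC, &start);
		p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		clock_gettime(CLOCK_MONOTONIC, &end);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		memset(p, 1, MAP_SIZE);		/* fault the mapping in */
		printf("mmap latency: %.4f usec\n", elapsed_usec(start, end));
		munmap(p, MAP_SIZE);
	}
	return 0;
}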
stutter
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Min mmap 16.8422 ( 0.00%) 15.9821 ( 5.11%)
1st-qrtle mmap 57.8709 ( 0.00%) 58.0794 ( -0.36%)
2nd-qrtle mmap 58.4335 ( 0.00%) 59.4286 ( -1.70%)
3rd-qrtle mmap 58.6957 ( 0.00%) 59.6862 ( -1.69%)
Max-90% mmap 58.9388 ( 0.00%) 59.8759 ( -1.59%)
Max-93% mmap 59.0505 ( 0.00%) 59.9333 ( -1.50%)
Max-95% mmap 59.1877 ( 0.00%) 59.9844 ( -1.35%)
Max-99% mmap 60.3237 ( 0.00%) 60.2337 ( 0.15%)
Max mmap 285.6454 ( 0.00%) 135.6006 ( 52.53%)
Mean mmap 57.8366 ( 0.00%) 58.4884 ( -1.13%)
This shows that there is a slight impact on mmap latency but that
the worst-case outlier is much improved. As the problem with this
benchmark used to be that the kernel stalled for minutes, this
difference is negligible.
Some of the vmstats are interesting
4.7.0-rc3 4.7.0-rc3
mmotm-20160615 nodelru-v7r17
Swap Ins 58 42
Swap Outs 0 0
Allocation stalls 16 0
Direct pages scanned 1374 0
Kswapd pages scanned 42454910 41782544
Kswapd pages reclaimed 41571035 41781833
Direct pages reclaimed 1167 0
Kswapd efficiency 97% 99%
Kswapd velocity 14774.479 14223.796
Direct efficiency 84% 100%
Direct velocity 0.478 0.000
Percentage direct scans 0% 0%
Page writes by reclaim 696918 0
Page writes file 696918 0
Page writes anon 0 0
Page reclaim immediate 2940 137
Sector Reads 81644424 81699544
Sector Writes 99193620 98862160
Page rescued immediate 0 0
Slabs scanned 1279838 22640
kswapd and direct reclaim activity are similar but the node LRU series
did not attempt to trigger any page writes from reclaim context.
This series is not without its hazards. There are at least three areas
that I'm concerned with even though I could not reproduce any problems in
that area.
1. Reclaim/compaction is going to be affected because the amount of reclaim is
no longer targeted at a specific zone. Compaction works on a per-zone basis
so there is no guarantee that reclaiming a few THPs' worth of pages will
have a positive impact on compaction success rates.
2. The Slab/LRU reclaim ratio is affected because the frequency the shrinkers
are called is now different. This may or may not be a problem but if it
is, it'll be because shrinkers are not called enough and some balancing
is required.
3. The anon/file reclaim ratio may be affected. Pages about to be dirtied are
distributed between zones and the fair zone allocation policy used to do
something very similar for anon. The distribution is now different, not
necessarily in a way that matters, but it's still worth bearing in mind.
Documentation/cgroup-v1/memcg_test.txt | 4 +-
Documentation/cgroup-v1/memory.txt | 4 +-
arch/s390/appldata/appldata_mem.c | 2 +-
arch/tile/mm/pgtable.c | 18 +-
drivers/base/node.c | 73 +--
drivers/staging/android/lowmemorykiller.c | 12 +-
fs/fs-writeback.c | 4 +-
fs/fuse/file.c | 8 +-
fs/nfs/internal.h | 2 +-
fs/nfs/write.c | 2 +-
fs/proc/meminfo.c | 14 +-
include/linux/backing-dev.h | 2 +-
include/linux/memcontrol.h | 32 +-
include/linux/mm.h | 5 +
include/linux/mm_inline.h | 21 +-
include/linux/mm_types.h | 2 +-
include/linux/mmzone.h | 158 +++---
include/linux/swap.h | 23 +-
include/linux/topology.h | 2 +-
include/linux/vm_event_item.h | 14 +-
include/linux/vmstat.h | 111 +++-
include/linux/writeback.h | 2 +-
include/trace/events/vmscan.h | 63 ++-
include/trace/events/writeback.h | 10 +-
kernel/power/snapshot.c | 10 +-
kernel/sysctl.c | 4 +-
mm/backing-dev.c | 15 +-
mm/compaction.c | 28 +-
mm/filemap.c | 14 +-
mm/huge_memory.c | 33 +-
mm/internal.h | 11 +-
mm/memcontrol.c | 246 ++++----
mm/memory-failure.c | 4 +-
mm/memory_hotplug.c | 7 +-
mm/mempolicy.c | 2 +-
mm/migrate.c | 35 +-
mm/mlock.c | 12 +-
mm/page-writeback.c | 124 ++--
mm/page_alloc.c | 268 ++++-----
mm/page_idle.c | 4 +-
mm/rmap.c | 14 +-
mm/shmem.c | 12 +-
mm/swap.c | 66 +--
mm/swap_state.c | 4 +-
mm/util.c | 4 +-
mm/vmscan.c | 901 +++++++++++++++---------------
mm/vmstat.c | 376 ++++++++++---
mm/workingset.c | 54 +-
48 files changed, 1573 insertions(+), 1263 deletions(-)
--
2.6.4
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 22/27] mm: Convert zone_reclaim to node_reclaim
2016-06-21 14:15 [PATCH 00/27] Move LRU page reclaim from zones to nodes v7 Mel Gorman
@ 2016-06-21 14:16 ` Mel Gorman
0 siblings, 0 replies; 10+ messages in thread
From: Mel Gorman @ 2016-06-21 14:16 UTC (permalink / raw)
To: Andrew Morton, Linux-MM
Cc: Rik van Riel, Vlastimil Babka, Johannes Weiner, LKML, Mel Gorman
As reclaim is now per-node based, convert zone_reclaim to be node_reclaim.
It is possible that a node will be reclaimed multiple times if it has
multiple zones but this is unavoidable without caching all nodes traversed
so far. The documentation and interface to userspace is the same from
a configuration perspective and will be similar in behaviour unless
the node-local allocation requests were also limited to lower zones.
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
include/linux/mmzone.h | 18 +++++------
include/linux/swap.h | 9 +++---
include/linux/topology.h | 2 +-
kernel/sysctl.c | 4 +--
mm/huge_memory.c | 4 +--
mm/internal.h | 8 ++---
mm/page_alloc.c | 24 ++++++++++-----
mm/vmscan.c | 77 ++++++++++++++++++++++++------------------------
8 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9c59b8540cb7..79fb9f6efc55 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -369,14 +369,6 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
-
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -521,7 +513,6 @@ struct zone {
} ____cacheline_internodealigned_in_smp;
enum zone_flags {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
@@ -536,6 +527,7 @@ enum pgdat_flags {
PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
+ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -684,6 +676,14 @@ typedef struct pglist_data {
*/
unsigned long totalreserve_pages;
+#ifdef CONFIG_NUMA
+ /*
+ * zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2a23ddc96edd..b17cc4830fa6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -326,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
{
return 0;
}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35f0dcb1cb4f..53954631a4e1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1508,8 +1508,8 @@ static struct ctl_table vm_table[] = {
#ifdef CONFIG_NUMA
{
.procname = "zone_reclaim_mode",
- .data = &zone_reclaim_mode,
- .maxlen = sizeof(zone_reclaim_mode),
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
.extra1 = &zero,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c218d7aafcde..d5dd6533de32 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2220,10 +2220,10 @@ static bool khugepaged_scan_abort(int nid)
int i;
/*
- * If zone_reclaim_mode is disabled, then no extra effort is made to
+ * If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
- if (!zone_reclaim_mode)
+ if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
diff --git a/mm/internal.h b/mm/internal.h
index 8df888469bdc..4abb2336e127 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -435,10 +435,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-#define ZONE_RECLAIM_NOSCAN -2
-#define ZONE_RECLAIM_FULL -1
-#define ZONE_RECLAIM_SOME 0
-#define ZONE_RECLAIM_SUCCESS 1
+#define NODE_RECLAIM_NOSCAN -2
+#define NODE_RECLAIM_FULL -1
+#define NODE_RECLAIM_SOME 0
+#define NODE_RECLAIM_SUCCESS 1
extern int hwpoison_filter(struct page *p);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc2eaa122770..78e5abc41857 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2947,16 +2947,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -5944,9 +5944,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
@@ -6923,6 +6923,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6930,8 +6931,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
@@ -6939,6 +6943,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6946,8 +6951,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 456f2d209651..4fa1fee5e486 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3574,12 +3574,12 @@ module_init(kswapd_init)
#ifdef CONFIG_NUMA
/*
- * Zone reclaim mode
+ * Node reclaim mode
*
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
* the watermarks.
*/
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
@@ -3587,14 +3587,14 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
* of a node considered for each zone_reclaim. 4 scans 1/16th of
* a zone.
*/
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
/*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
* occur.
*/
int sysctl_min_unmapped_ratio = 1;
@@ -3620,7 +3620,7 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
unsigned long nr_pagecache_reclaimable;
unsigned long delta = 0;
@@ -3631,14 +3631,14 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
* pages like swapcache and node_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_UNMAP)
- nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+ if (node_reclaim_mode & RECLAIM_UNMAP)
+ nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
else
- nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+ nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
/* If we can't clean pages, remove dirty pages from consideration */
- if (!(zone_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+ if (!(node_reclaim_mode & RECLAIM_WRITE))
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3648,23 +3648,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
}
/*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
*/
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
+ int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
.order = order,
- .priority = ZONE_RECLAIM_PRIORITY,
- .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+ .priority = NODE_RECLAIM_PRIORITY,
+ .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
- .reclaim_idx = zone_idx(zone),
+ .reclaim_idx = classzone_idx,
};
cond_resched();
@@ -3678,13 +3679,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
- shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+ shrink_node(pgdat, &sc, classzone_idx);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
@@ -3694,49 +3695,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
return sc.nr_reclaimed >= nr_pages;
}
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
- int node_id;
int ret;
/*
- * Zone reclaim reclaims unmapped file backed pages and
+ * Node reclaim reclaims unmapped file backed pages and
* slab pages if we are over the defined limits.
*
* A small portion of unmapped file backed pages is needed for
* file I/O otherwise pages read by file I/O will be immediately
- * thrown out if the zone is overallocated. So we do not reclaim
- * if less than a specified percentage of the zone is used by
+ * thrown out if the node is overallocated. So we do not reclaim
+ * if less than a specified percentage of the node is used by
* unmapped file backed pages.
*/
- if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
- return ZONE_RECLAIM_FULL;
+ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+ sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(zone->zone_pgdat))
- return ZONE_RECLAIM_FULL;
+ if (!pgdat_reclaimable(pgdat))
+ return NODE_RECLAIM_FULL;
/*
* Do not scan if the allocation should not be delayed.
*/
if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
- return ZONE_RECLAIM_NOSCAN;
+ return NODE_RECLAIM_NOSCAN;
/*
- * Only run zone reclaim on the local zone or on zones that do not
+ * Only run node reclaim on the local node or on nodes that do not
* have associated processors. This will favor the local processor
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone_to_nid(zone);
- if (node_state(node_id, N_CPU) && node_id != numa_node_id())
- return ZONE_RECLAIM_NOSCAN;
+ if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+ return NODE_RECLAIM_NOSCAN;
- if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
- return ZONE_RECLAIM_NOSCAN;
+ if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+ return NODE_RECLAIM_NOSCAN;
- ret = __zone_reclaim(zone, gfp_mask, order);
- clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+ ret = __node_reclaim(pgdat, gfp_mask, order);
+ clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
--
2.6.4
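
For readers following the sysctl hunks above: after this conversion each pgdat accumulates its zones' contributions with "+=", so the per-node field being summed has to be cleared before the per-zone loop runs. Below is a minimal standalone sketch of that clear-then-accumulate pattern; struct pgdat_stub, struct zone_stub and recalc_min_unmapped() are simplified stand-ins invented purely for illustration, not kernel code.

	/* Illustrative sketch only -- simplified stand-ins, not kernel types. */
	#include <stdio.h>

	struct pgdat_stub {
		unsigned long min_unmapped_pages;
		unsigned long min_slab_pages;
	};

	struct zone_stub {
		struct pgdat_stub *zone_pgdat;
		unsigned long managed_pages;
	};

	/* Mirrors the handler above: clear the node totals, then sum per zone. */
	static void recalc_min_unmapped(struct zone_stub *zones, int nr_zones,
					int min_unmapped_ratio)
	{
		for (int i = 0; i < nr_zones; i++)
			zones[i].zone_pgdat->min_unmapped_pages = 0;

		for (int i = 0; i < nr_zones; i++)
			zones[i].zone_pgdat->min_unmapped_pages +=
				(zones[i].managed_pages * min_unmapped_ratio) / 100;
	}

	int main(void)
	{
		struct pgdat_stub pgdat = { 0, 0 };
		struct zone_stub zones[2] = {
			{ &pgdat, 1UL << 18 },	/* e.g. a small lower zone */
			{ &pgdat, 1UL << 20 },	/* e.g. ZONE_NORMAL */
		};

		/* default sysctl_min_unmapped_ratio is 1 (percent) */
		recalc_min_unmapped(zones, 2, 1);
		printf("min_unmapped_pages = %lu\n", pgdat.min_unmapped_pages);
		return 0;
	}

With the default ratio of 1, the node threshold ends up as 1% of the managed pages summed over that node's zones; clearing the wrong field (or not clearing at all) before the "+=" loop would instead let the value grow on every sysctl write.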