* [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-20 13:45 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-20 13:45 UTC (permalink / raw)
To: Andrew Morton
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Wu Fengguang, Chris Mason,
Minchan Kim
This patch allows allocators to pass __GFP_WRITE when they know in
advance that the allocated page will be written to and become dirty
soon. The page allocator will then attempt to distribute those
allocations across zones, such that no single zone will end up full of
dirty, and thus more or less unreclaimable, pages.
The global dirty limits are put in proportion to the respective zone's
amount of dirtyable memory and allocations diverted to other zones
when the limit is reached.
For now, the problem remains for NUMA configurations where the zones
allowed for allocation are in sum not big enough to trigger the global
dirty limits, but a future approach to solve this can reuse the
per-zone dirty limit infrastructure laid out in this patch to have
dirty throttling and the flusher threads consider individual zones.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
include/linux/gfp.h | 4 ++-
include/linux/writeback.h | 1 +
mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
mm/page_alloc.c | 22 ++++++++++++++-
4 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
+#define ___GFP_WRITE 0x1000000u
/*
* GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
* This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
extern unsigned long global_dirty_limit;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9f896db..1fc714c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
static struct prop_descriptor vm_completions;
static struct prop_descriptor vm_dirties;
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long x;
+ /*
+ * To keep a reasonable ratio between dirty memory and lowmem,
+ * highmem is not considered dirtyable on a global level.
+ *
+ * But we allow individual highmem zones to hold a potentially
+ * bigger share of that global amount of dirty pages as long
+ * as they have enough free or reclaimable pages around.
+ */
+ x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
+ x += zone_reclaimable_pages(zone);
+ return x;
+}
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
}
/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * dirty_limits - background-writeback and dirty-throttling thresholds
*
* Calculate the dirty thresholds based on sysctl parameters
* - vm.dirty_background_ratio or vm.dirty_background_bytes
@@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
* The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
* real-time tasks.
*/
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void dirty_limits(struct zone *zone,
+ unsigned long *pbackground,
+ unsigned long *pdirty)
{
+ unsigned long uninitialized_var(zone_memory);
+ unsigned long available_memory;
+ unsigned long global_memory;
unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ unsigned long dirty;
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
+ global_memory = determine_dirtyable_memory();
+ if (zone)
+ available_memory = zone_memory = zone_dirtyable_memory(zone);
+ else
+ available_memory = global_memory;
- if (vm_dirty_bytes)
+ if (vm_dirty_bytes) {
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
+ if (zone)
+ dirty = dirty * zone_memory / global_memory;
+ } else
dirty = (vm_dirty_ratio * available_memory) / 100;
- if (dirty_background_bytes)
+ if (dirty_background_bytes) {
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
+ if (zone)
+ background = background * zone_memory / global_memory;
+ } else
background = (dirty_background_ratio * available_memory) / 100;
if (background >= dirty)
@@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
background += background / 4;
dirty += dirty / 4;
}
+ if (!zone)
+ trace_global_dirty_state(background, dirty);
*pbackground = background;
*pdirty = dirty;
- trace_global_dirty_state(background, dirty);
+}
+
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ dirty_limits(NULL, pbackground, pdirty);
}
/**
@@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ dirty_limits(zone, &background_thresh, &dirty_thresh);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
+}
+
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e8e2ee..3cca043 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1368,6 +1368,7 @@ failed:
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -1667,6 +1668,25 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * This may look like it would increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But once they are full, the
+ * allocations fall back to lower zones anyway, and
+ * then this check actually protects the lower zones
+ * from a flood of dirty page allocations.
+ *
+ * XXX: Allow allocations to potentially exceed the
+ * per-zone dirty limit in the slowpath before going
+ * into reclaim, which is important when NUMA nodes
+ * are not big enough to reach the global limit. The
+ * proper fix on these setups will require awareness
+ * of zones in the dirty-throttling and the flusher
+ * threads.
+ */
+ if (!(alloc_flags & ALLOC_SLOWPATH) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -2111,7 +2131,7 @@ restart:
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ alloc_flags = gfp_to_alloc_flags(gfp_mask) | ALLOC_SLOWPATH;
/*
* Find the true preferred zone if the allocation is unconstrained by
--
1.7.6
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 117+ messages in thread
* [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-20 13:45 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-20 13:45 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
This patch allows allocators to pass __GFP_WRITE when they know in
advance that the allocated page will be written to and become dirty
soon. The page allocator will then attempt to distribute those
allocations across zones, such that no single zone will end up full of
dirty, and thus more or less unreclaimable, pages.
The global dirty limits are put in proportion to the respective zone's
amount of dirtyable memory and allocations diverted to other zones
when the limit is reached.
For now, the problem remains for NUMA configurations where the zones
allowed for allocation are in sum not big enough to trigger the global
dirty limits, but a future approach to solve this can reuse the
per-zone dirty limit infrastructure laid out in this patch to have
dirty throttling and the flusher threads consider individual zones.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
include/linux/gfp.h | 4 ++-
include/linux/writeback.h | 1 +
mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
mm/page_alloc.c | 22 ++++++++++++++-
4 files changed, 80 insertions(+), 13 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
+#define ___GFP_WRITE 0x1000000u
/*
* GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
* This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
extern unsigned long global_dirty_limit;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9f896db..1fc714c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
static struct prop_descriptor vm_completions;
static struct prop_descriptor vm_dirties;
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long x;
+ /*
+ * To keep a reasonable ratio between dirty memory and lowmem,
+ * highmem is not considered dirtyable on a global level.
+ *
+ * But we allow individual highmem zones to hold a potentially
+ * bigger share of that global amount of dirty pages as long
+ * as they have enough free or reclaimable pages around.
+ */
+ x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
+ x += zone_reclaimable_pages(zone);
+ return x;
+}
+
/*
* Work out the current dirty-memory clamping and background writeout
* thresholds.
@@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
}
/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * dirty_limits - background-writeback and dirty-throttling thresholds
*
* Calculate the dirty thresholds based on sysctl parameters
* - vm.dirty_background_ratio or vm.dirty_background_bytes
@@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
* The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
* real-time tasks.
*/
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void dirty_limits(struct zone *zone,
+ unsigned long *pbackground,
+ unsigned long *pdirty)
{
+ unsigned long uninitialized_var(zone_memory);
+ unsigned long available_memory;
+ unsigned long global_memory;
unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ unsigned long dirty;
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
+ global_memory = determine_dirtyable_memory();
+ if (zone)
+ available_memory = zone_memory = zone_dirtyable_memory(zone);
+ else
+ available_memory = global_memory;
- if (vm_dirty_bytes)
+ if (vm_dirty_bytes) {
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
+ if (zone)
+ dirty = dirty * zone_memory / global_memory;
+ } else
dirty = (vm_dirty_ratio * available_memory) / 100;
- if (dirty_background_bytes)
+ if (dirty_background_bytes) {
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
+ if (zone)
+ background = background * zone_memory / global_memory;
+ } else
background = (dirty_background_ratio * available_memory) / 100;
if (background >= dirty)
@@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
background += background / 4;
dirty += dirty / 4;
}
+ if (!zone)
+ trace_global_dirty_state(background, dirty);
*pbackground = background;
*pdirty = dirty;
- trace_global_dirty_state(background, dirty);
+}
+
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ dirty_limits(NULL, pbackground, pdirty);
}
/**
@@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
}
}
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ dirty_limits(zone, &background_thresh, &dirty_thresh);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
+}
+
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e8e2ee..3cca043 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1368,6 +1368,7 @@ failed:
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -1667,6 +1668,25 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * This may look like it would increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But once they are full, the
+ * allocations fall back to lower zones anyway, and
+ * then this check actually protects the lower zones
+ * from a flood of dirty page allocations.
+ *
+ * XXX: Allow allocations to potentially exceed the
+ * per-zone dirty limit in the slowpath before going
+ * into reclaim, which is important when NUMA nodes
+ * are not big enough to reach the global limit. The
+ * proper fix on these setups will require awareness
+ * of zones in the dirty-throttling and the flusher
+ * threads.
+ */
+ if (!(alloc_flags & ALLOC_SLOWPATH) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -2111,7 +2131,7 @@ restart:
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ alloc_flags = gfp_to_alloc_flags(gfp_mask) | ALLOC_SLOWPATH;
/*
* Find the true preferred zone if the allocation is unconstrained by
--
1.7.6
^ permalink raw reply related [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-20 13:45 ` Johannes Weiner
(?)
@ 2011-09-20 18:36 ` Rik van Riel
-1 siblings, 0 replies; 117+ messages in thread
From: Rik van Riel @ 2011-09-20 18:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On 09/20/2011 09:45 AM, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner<jweiner@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
The amount of work done in a __GFP_WRITE allocation looks
a little daunting, but doing that a million times probably
outweighs waiting on the disk even once, so...
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-20 18:36 ` Rik van Riel
0 siblings, 0 replies; 117+ messages in thread
From: Rik van Riel @ 2011-09-20 18:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: linux-ext4, Jan Kara, linux-btrfs, linux-kernel, xfs,
Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason, Minchan Kim
On 09/20/2011 09:45 AM, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner<jweiner@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
The amount of work done in a __GFP_WRITE allocation looks
a little daunting, but doing that a million times probably
outweighs waiting on the disk even once, so...
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-20 18:36 ` Rik van Riel
0 siblings, 0 replies; 117+ messages in thread
From: Rik van Riel @ 2011-09-20 18:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On 09/20/2011 09:45 AM, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner<jweiner@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
The amount of work done in a __GFP_WRITE allocation looks
a little daunting, but doing that a million times probably
outweighs waiting on the disk even once, so...
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-20 13:45 ` Johannes Weiner
(?)
@ 2011-09-21 11:04 ` Shaohua Li
-1 siblings, 0 replies; 117+ messages in thread
From: Shaohua Li @ 2011-09-21 11:04 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner, Wu,
Fengguang, Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
If increasing pressure on the lower zones isn't a problem because the
higher zones will eventually be full, what about a workload without
many writes, where the higher zones never fill up? In such a case,
increasing pressure on the lower zones does not sound good.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 11:04 ` Shaohua Li
0 siblings, 0 replies; 117+ messages in thread
From: Shaohua Li @ 2011-09-21 11:04 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner, Wu,
Fengguang, Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
If increasing pressure on the lower zones isn't a problem because the higher
zones will eventually fill up anyway, what about a workload without many
writes, where the higher zones never fill up? In that case, increasing
pressure on the lower zones does not sound good.
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org">dont@kvack.org</a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 11:04 ` Shaohua Li
0 siblings, 0 replies; 117+ messages in thread
From: Shaohua Li @ 2011-09-21 11:04 UTC (permalink / raw)
To: Johannes Weiner
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu, Fengguang,
Chris Mason, Minchan Kim
On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
> If increasing pressure on the lower zones isn't a problem because the higher
> zones will eventually fill up anyway, what about a workload without many
> writes, where the higher zones never fill up? In that case, increasing
> pressure on the lower zones does not sound good.
Thanks,
Shaohua
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-21 11:04 ` Shaohua Li
(?)
@ 2011-09-21 13:35 ` Johannes Weiner
-1 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-21 13:35 UTC (permalink / raw)
To: Shaohua Li
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner, Wu,
Fengguang, Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Wed, Sep 21, 2011 at 07:04:28PM +0800, Shaohua Li wrote:
> On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
> >
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory and allocations diverted to other zones
> > when the limit is reached.
> >
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > ---
> > include/linux/gfp.h | 4 ++-
> > include/linux/writeback.h | 1 +
> > mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> > mm/page_alloc.c | 22 ++++++++++++++-
> > 4 files changed, 80 insertions(+), 13 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 3a76faf..50efc7e 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -36,6 +36,7 @@ struct vm_area_struct;
> > #endif
> > #define ___GFP_NO_KSWAPD 0x400000u
> > #define ___GFP_OTHER_NODE 0x800000u
> > +#define ___GFP_WRITE 0x1000000u
> >
> > /*
> > * GFP bitmasks..
> > @@ -85,6 +86,7 @@ struct vm_area_struct;
> >
> > #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> > #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> > +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
> >
> > /*
> > * This may seem redundant, but it's a way of annotating false positives vs.
> > @@ -92,7 +94,7 @@ struct vm_area_struct;
> > */
> > #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
> >
> > -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> > +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> > #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> >
> > /* This equals 0, but use constants in case they ever change */
> > diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> > index a5f495f..c96ee0c 100644
> > --- a/include/linux/writeback.h
> > +++ b/include/linux/writeback.h
> > @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> > static inline void laptop_sync_completion(void) { }
> > #endif
> > void throttle_vm_writeout(gfp_t gfp_mask);
> > +bool zone_dirty_ok(struct zone *zone);
> >
> > extern unsigned long global_dirty_limit;
> >
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 9f896db..1fc714c 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> > static struct prop_descriptor vm_completions;
> > static struct prop_descriptor vm_dirties;
> >
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
> > + *
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> > /*
> > * Work out the current dirty-memory clamping and background writeout
> > * thresholds.
> > @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > }
> >
> > /*
> > - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> > + * dirty_limits - background-writeback and dirty-throttling thresholds
> > *
> > * Calculate the dirty thresholds based on sysctl parameters
> > * - vm.dirty_background_ratio or vm.dirty_background_bytes
> > @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> > * real-time tasks.
> > */
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> > @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > background += background / 4;
> > dirty += dirty / 4;
> > }
> > + if (!zone)
> > + trace_global_dirty_state(background, dirty);
> > *pbackground = background;
> > *pdirty = dirty;
> > - trace_global_dirty_state(background, dirty);
> > +}
> > +
> > +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +{
> > + dirty_limits(NULL, pbackground, pdirty);
> > }
> >
> > /**
> > @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > }
> > }
> >
> > +bool zone_dirty_ok(struct zone *zone)
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
> > +
> > /*
> > * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> > */
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 7e8e2ee..3cca043 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1368,6 +1368,7 @@ failed:
> > #define ALLOC_HARDER 0x10 /* try to alloc harder */
> > #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> > #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> > +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
> >
> > #ifdef CONFIG_FAIL_PAGE_ALLOC
> >
> > @@ -1667,6 +1668,25 @@ zonelist_scan:
> > if ((alloc_flags & ALLOC_CPUSET) &&
> > !cpuset_zone_allowed_softwall(zone, gfp_mask))
> > continue;
> > + /*
> > + * This may look like it would increase pressure on
> > + * lower zones by failing allocations in higher zones
> > + * before they are full. But once they are full, the
> > + * allocations fall back to lower zones anyway, and
> > + * then this check actually protects the lower zones
> > + * from a flood of dirty page allocations.
> If increasing pressure on the lower zones isn't a problem because the higher
> zones will eventually fill up anyway, what about a workload without many
> writes, where the higher zones never fill up? In that case, increasing
> pressure on the lower zones does not sound good.
While there is a shift of dirty pages possible in workloads that were
able to completely fit those pages in the highest zone, the extent of
that shift is limited, which should prevent it from becoming a
practical burden for the lower zones.
Because the number of dirtyable pages does include neither a zone's
lowmem reserves, nor the watermarks, nor kernel allocations, a lower
zone does not receive a bigger share than it can afford when the
allocations are diverted from the higher zones.
To put it into perspective: with these patches there could be an
increased allocation latency for a workload with a writer of fixed
size fitting into the Normal zone and an allocator that suddenly
requires more than 3G (~ DMA32 size minus the 20% allowable dirty
pages) DMA32 memory. That sounds a bit artificial, to me at least.
And without these patches, we encounter exactly those allocation
latencies on a regular basis when writing files larger than memory.
Hannes
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org">dont@kvack.org</a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 13:35 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-21 13:35 UTC (permalink / raw)
To: Shaohua Li
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu, Fengguang,
Chris Mason, Minchan Kim
On Wed, Sep 21, 2011 at 07:04:28PM +0800, Shaohua Li wrote:
> On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
> >
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory and allocations diverted to other zones
> > when the limit is reached.
> >
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > ---
> > include/linux/gfp.h | 4 ++-
> > include/linux/writeback.h | 1 +
> > mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> > mm/page_alloc.c | 22 ++++++++++++++-
> > 4 files changed, 80 insertions(+), 13 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 3a76faf..50efc7e 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -36,6 +36,7 @@ struct vm_area_struct;
> > #endif
> > #define ___GFP_NO_KSWAPD 0x400000u
> > #define ___GFP_OTHER_NODE 0x800000u
> > +#define ___GFP_WRITE 0x1000000u
> >
> > /*
> > * GFP bitmasks..
> > @@ -85,6 +86,7 @@ struct vm_area_struct;
> >
> > #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> > #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> > +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
> >
> > /*
> > * This may seem redundant, but it's a way of annotating false positives vs.
> > @@ -92,7 +94,7 @@ struct vm_area_struct;
> > */
> > #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
> >
> > -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> > +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> > #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> >
> > /* This equals 0, but use constants in case they ever change */
> > diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> > index a5f495f..c96ee0c 100644
> > --- a/include/linux/writeback.h
> > +++ b/include/linux/writeback.h
> > @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> > static inline void laptop_sync_completion(void) { }
> > #endif
> > void throttle_vm_writeout(gfp_t gfp_mask);
> > +bool zone_dirty_ok(struct zone *zone);
> >
> > extern unsigned long global_dirty_limit;
> >
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 9f896db..1fc714c 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> > static struct prop_descriptor vm_completions;
> > static struct prop_descriptor vm_dirties;
> >
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
> > + *
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> > /*
> > * Work out the current dirty-memory clamping and background writeout
> > * thresholds.
> > @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > }
> >
> > /*
> > - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> > + * dirty_limits - background-writeback and dirty-throttling thresholds
> > *
> > * Calculate the dirty thresholds based on sysctl parameters
> > * - vm.dirty_background_ratio or vm.dirty_background_bytes
> > @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> > * real-time tasks.
> > */
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> > @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > background += background / 4;
> > dirty += dirty / 4;
> > }
> > + if (!zone)
> > + trace_global_dirty_state(background, dirty);
> > *pbackground = background;
> > *pdirty = dirty;
> > - trace_global_dirty_state(background, dirty);
> > +}
> > +
> > +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +{
> > + dirty_limits(NULL, pbackground, pdirty);
> > }
> >
> > /**
> > @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > }
> > }
> >
> > +bool zone_dirty_ok(struct zone *zone)
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
> > +
> > /*
> > * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> > */
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 7e8e2ee..3cca043 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1368,6 +1368,7 @@ failed:
> > #define ALLOC_HARDER 0x10 /* try to alloc harder */
> > #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> > #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> > +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
> >
> > #ifdef CONFIG_FAIL_PAGE_ALLOC
> >
> > @@ -1667,6 +1668,25 @@ zonelist_scan:
> > if ((alloc_flags & ALLOC_CPUSET) &&
> > !cpuset_zone_allowed_softwall(zone, gfp_mask))
> > continue;
> > + /*
> > + * This may look like it would increase pressure on
> > + * lower zones by failing allocations in higher zones
> > + * before they are full. But once they are full, the
> > + * allocations fall back to lower zones anyway, and
> > + * then this check actually protects the lower zones
> > + * from a flood of dirty page allocations.
> if increasing pressure on lower zones isn't a problem since higher zones
> will eventually be full, how about a workload without too many writes,
> so higher zones will not be full. In such case, increasing low zone
> pressure sounds not good.
While there is a shift of dirty pages possible in workloads that were
able to completely fit those pages in the highest zone, the extent of
that shift is limited, which should prevent it from becoming a
practical burden for the lower zones.
Because the number of dirtyable pages includes neither a zone's
lowmem reserves, nor the watermarks, nor kernel allocations, a lower
zone does not receive a bigger share than it can afford when the
allocations are diverted from the higher zones.
To put it into perspective: with these patches there could be an
increased allocation latency for a workload with a writer of fixed
size fitting into the Normal zone and an allocator that suddenly
requires more than 3G (~ DMA32 size minus the 20% allowable dirty
pages) of DMA32 memory. That sounds a bit artificial, to me at least.
And without these patches, we encounter exactly those allocation
latencies on a regular basis when writing files larger than memory.
Hannes
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 13:35 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-21 13:35 UTC (permalink / raw)
To: Shaohua Li
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner, Wu,
Fengguang, Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Wed, Sep 21, 2011 at 07:04:28PM +0800, Shaohua Li wrote:
> On Tue, 2011-09-20 at 21:45 +0800, Johannes Weiner wrote:
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
> >
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory and allocations diverted to other zones
> > when the limit is reached.
> >
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> > ---
> > include/linux/gfp.h | 4 ++-
> > include/linux/writeback.h | 1 +
> > mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> > mm/page_alloc.c | 22 ++++++++++++++-
> > 4 files changed, 80 insertions(+), 13 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index 3a76faf..50efc7e 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -36,6 +36,7 @@ struct vm_area_struct;
> > #endif
> > #define ___GFP_NO_KSWAPD 0x400000u
> > #define ___GFP_OTHER_NODE 0x800000u
> > +#define ___GFP_WRITE 0x1000000u
> >
> > /*
> > * GFP bitmasks..
> > @@ -85,6 +86,7 @@ struct vm_area_struct;
> >
> > #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> > #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> > +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
> >
> > /*
> > * This may seem redundant, but it's a way of annotating false positives vs.
> > @@ -92,7 +94,7 @@ struct vm_area_struct;
> > */
> > #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
> >
> > -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> > +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> > #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> >
> > /* This equals 0, but use constants in case they ever change */
> > diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> > index a5f495f..c96ee0c 100644
> > --- a/include/linux/writeback.h
> > +++ b/include/linux/writeback.h
> > @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> > static inline void laptop_sync_completion(void) { }
> > #endif
> > void throttle_vm_writeout(gfp_t gfp_mask);
> > +bool zone_dirty_ok(struct zone *zone);
> >
> > extern unsigned long global_dirty_limit;
> >
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 9f896db..1fc714c 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> > static struct prop_descriptor vm_completions;
> > static struct prop_descriptor vm_dirties;
> >
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
> > + *
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> > /*
> > * Work out the current dirty-memory clamping and background writeout
> > * thresholds.
> > @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > }
> >
> > /*
> > - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> > + * dirty_limits - background-writeback and dirty-throttling thresholds
> > *
> > * Calculate the dirty thresholds based on sysctl parameters
> > * - vm.dirty_background_ratio or vm.dirty_background_bytes
> > @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> > * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> > * real-time tasks.
> > */
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> > @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > background += background / 4;
> > dirty += dirty / 4;
> > }
> > + if (!zone)
> > + trace_global_dirty_state(background, dirty);
> > *pbackground = background;
> > *pdirty = dirty;
> > - trace_global_dirty_state(background, dirty);
> > +}
> > +
> > +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +{
> > + dirty_limits(NULL, pbackground, pdirty);
> > }
> >
> > /**
> > @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> > }
> > }
> >
> > +bool zone_dirty_ok(struct zone *zone)
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
> > +
> > /*
> > * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> > */
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 7e8e2ee..3cca043 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1368,6 +1368,7 @@ failed:
> > #define ALLOC_HARDER 0x10 /* try to alloc harder */
> > #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> > #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> > +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
> >
> > #ifdef CONFIG_FAIL_PAGE_ALLOC
> >
> > @@ -1667,6 +1668,25 @@ zonelist_scan:
> > if ((alloc_flags & ALLOC_CPUSET) &&
> > !cpuset_zone_allowed_softwall(zone, gfp_mask))
> > continue;
> > + /*
> > + * This may look like it would increase pressure on
> > + * lower zones by failing allocations in higher zones
> > + * before they are full. But once they are full, the
> > + * allocations fall back to lower zones anyway, and
> > + * then this check actually protects the lower zones
> > + * from a flood of dirty page allocations.
> if increasing pressure on lower zones isn't a problem since higher zones
> will eventually be full, how about a workload without too many writes,
> so higher zones will not be full. In such case, increasing low zone
> pressure sounds not good.
While there is a shift of dirty pages possible in workloads that were
able to completely fit those pages in the highest zone, the extent of
that shift is limited, which should prevent it from becoming a
practical burden for the lower zones.
Because the number of dirtyable pages includes neither a zone's
lowmem reserves, nor the watermarks, nor kernel allocations, a lower
zone does not receive a bigger share than it can afford when the
allocations are diverted from the higher zones.
To put it into perspective: with these patches there could be an
increased allocation latency for a workload with a writer of fixed
size fitting into the Normal zone and an allocator that suddenly
requires more than 3G (~ DMA32 size minus the 20% allowable dirty
pages) of DMA32 memory. That sounds a bit artificial, to me at least.
And without these patches, we encounter exactly those allocation
latencies on a regular basis when writing files larger than memory.
Hannes
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-20 13:45 ` Johannes Weiner
(?)
@ 2011-09-21 14:30 ` Mel Gorman
-1 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-21 14:30 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Tue, Sep 20, 2011 at 03:45:13PM +0200, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
I know this came up the last time but an explanation why lowmem
pressure is not expected to be a problem should be in the changelog.
"At first glance, it would appear that there is a lowmem pressure risk
but it is not the case. Highmem is not considered dirtyable memory.
Hence, if highmem is very large, the global amount of dirty memory
will fit in the highmem zone without falling back to the lower zones
and causing lowmem pressure. If highmem is small then the amount of
pages that 'spill over' to lower zones is limited and not likely to
significantly increase the risk of lowmem pressure due to things like
pagetable page allocations for example. In other words, the timing of
when lowmem pressure happens changes but overall the pressure is roughly
the same".
or something.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
While I think this particular point is important, I don't think it
should be a show stopper for the series.
I'm going to steal Andrew's line here as well - you explain what you are
doing in the patch leader but not why.
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
> + *
> + * XXX: Allow allocations to potentially exceed the
> + * per-zone dirty limit in the slowpath before going
> + * into reclaim, which is important when NUMA nodes
> + * are not big enough to reach the global limit. The
> + * proper fix on these setups will require awareness
> + * of zones in the dirty-throttling and the flusher
> + * threads.
> + */
Here would be a good place to explain why we sometimes allow
__GFP_WRITE pages to fall back to lower zones. As it is, the reader
is required to remember that this affects LRU ordering and when/if
reclaim tries to write back the page.
> + if (!(alloc_flags & ALLOC_SLOWPATH) &&
> + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
> + goto this_zone_full;
>
> BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
> if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
> @@ -2111,7 +2131,7 @@ restart:
> * reclaim. Now things get more complex, so set up alloc_flags according
> * to how we want to proceed.
> */
> - alloc_flags = gfp_to_alloc_flags(gfp_mask);
> + alloc_flags = gfp_to_alloc_flags(gfp_mask) | ALLOC_SLOWPATH;
>
Instead of adding ALLOC_SLOWPATH, check for ALLOC_WMARK_LOW which is
only set in the fast path.
> /*
> * Find the true preferred zone if the allocation is unconstrained by
Functionally, I did not find a problem with the patch.
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 14:30 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-21 14:30 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel
On Tue, Sep 20, 2011 at 03:45:13PM +0200, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
I know this came up the last time but an explanation why lowmem
pressure is not expected to be a problem should be in the changelog.
"At first glance, it would appear that there is a lowmem pressure risk
but it is not the case. Highmem is not considered dirtyable memory.
Hence, if highmem is very large, the global amount of dirty memory
will fit in the highmem zone without falling back to the lower zones
and causing lowmem pressure. If highmem is small then the amount of
pages that 'spill over' to lower zones is limited and not likely to
significantly increase the risk of lowmem pressure due to things like
pagetable page allocations for example. In other words, the timing of
when lowmem pressure happens changes but overall the pressure is roughly
the same".
or something.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
While I think this particular point is important, I don't think it
should be a show stopper for the series.
I'm going to steal Andrew's line here as well - you explain what you are
doing in the patch leader but not why.
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
> + *
> + * XXX: Allow allocations to potentially exceed the
> + * per-zone dirty limit in the slowpath before going
> + * into reclaim, which is important when NUMA nodes
> + * are not big enough to reach the global limit. The
> + * proper fix on these setups will require awareness
> + * of zones in the dirty-throttling and the flusher
> + * threads.
> + */
Here would be a good place to explain why we sometimes allow
__GFP_WRITE pages to fall back to lower zones. As it is, the reader
is required to remember that this affects LRU ordering and when/if
reclaim tries to write back the page.
> + if (!(alloc_flags & ALLOC_SLOWPATH) &&
> + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
> + goto this_zone_full;
>
> BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
> if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
> @@ -2111,7 +2131,7 @@ restart:
> * reclaim. Now things get more complex, so set up alloc_flags according
> * to how we want to proceed.
> */
> - alloc_flags = gfp_to_alloc_flags(gfp_mask);
> + alloc_flags = gfp_to_alloc_flags(gfp_mask) | ALLOC_SLOWPATH;
>
Instead of adding ALLOC_SLOWPATH, check for ALLOC_WMARK_LOW which is
only set in the fast path.
> /*
> * Find the true preferred zone if the allocation is unconstrained by
Functionally, I did not find a problem with the patch.
--
Mel Gorman
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 14:30 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-21 14:30 UTC (permalink / raw)
To: Johannes Weiner
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Minchan Kim,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason
On Tue, Sep 20, 2011 at 03:45:13PM +0200, Johannes Weiner wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
>
I know this came up the last time but an explanation why lowmem
pressure is not expected to be a problem should be in the changelog.
"At first glance, it would appear that there is a lowmem pressure risk
but it is not the case. Highmem is not considered dirtyable memory.
Hence, if highmem is very large, the global amount of dirty memory
will fit in the highmem zone without falling back to the lower zones
and causing lowmem pressure. If highmem is small then the amount of
pages that 'spill over' to lower zones is limited and not likely to
significantly increase the risk of lowmem pressure due to things like
pagetable page allocations for example. In other words, the timing of
when lowmem pressure happens changes but overall the pressure is roughly
the same".
or something.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory and allocations diverted to other zones
> when the limit is reached.
>
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
While I think this particular point is important, I don't think it
should be a show stopper for the series.
I'm going to steal Andrew's line here as well - you explain what you are
doing in the patch leader but not why.
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
> ---
> include/linux/gfp.h | 4 ++-
> include/linux/writeback.h | 1 +
> mm/page-writeback.c | 66 +++++++++++++++++++++++++++++++++++++-------
> mm/page_alloc.c | 22 ++++++++++++++-
> 4 files changed, 80 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 3a76faf..50efc7e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
> @@ -85,6 +86,7 @@ struct vm_area_struct;
>
> #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
> #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
> +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
>
> /*
> * This may seem redundant, but it's a way of annotating false positives vs.
> @@ -92,7 +94,7 @@ struct vm_area_struct;
> */
> #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
>
> -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
> +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
> #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
>
> /* This equals 0, but use constants in case they ever change */
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index a5f495f..c96ee0c 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
> static inline void laptop_sync_completion(void) { }
> #endif
> void throttle_vm_writeout(gfp_t gfp_mask);
> +bool zone_dirty_ok(struct zone *zone);
>
> extern unsigned long global_dirty_limit;
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 9f896db..1fc714c 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -142,6 +142,22 @@ unsigned long global_dirty_limit;
> static struct prop_descriptor vm_completions;
> static struct prop_descriptor vm_dirties;
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
> + *
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
> /*
> * Work out the current dirty-memory clamping and background writeout
> * thresholds.
> @@ -417,7 +433,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> }
>
> /*
> - * global_dirty_limits - background-writeback and dirty-throttling thresholds
> + * dirty_limits - background-writeback and dirty-throttling thresholds
> *
> * Calculate the dirty thresholds based on sysctl parameters
> * - vm.dirty_background_ratio or vm.dirty_background_bytes
> @@ -425,24 +441,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
> * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> * real-time tasks.
> */
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
> @@ -452,9 +479,15 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> background += background / 4;
> dirty += dirty / 4;
> }
> + if (!zone)
> + trace_global_dirty_state(background, dirty);
> *pbackground = background;
> *pdirty = dirty;
> - trace_global_dirty_state(background, dirty);
> +}
> +
> +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +{
> + dirty_limits(NULL, pbackground, pdirty);
> }
>
> /**
> @@ -875,6 +908,17 @@ void throttle_vm_writeout(gfp_t gfp_mask)
> }
> }
>
> +bool zone_dirty_ok(struct zone *zone)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
> +
> /*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7e8e2ee..3cca043 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1368,6 +1368,7 @@ failed:
> #define ALLOC_HARDER 0x10 /* try to alloc harder */
> #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
> #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
> +#define ALLOC_SLOWPATH 0x80 /* allocator retrying */
>
> #ifdef CONFIG_FAIL_PAGE_ALLOC
>
> @@ -1667,6 +1668,25 @@ zonelist_scan:
> if ((alloc_flags & ALLOC_CPUSET) &&
> !cpuset_zone_allowed_softwall(zone, gfp_mask))
> continue;
> + /*
> + * This may look like it would increase pressure on
> + * lower zones by failing allocations in higher zones
> + * before they are full. But once they are full, the
> + * allocations fall back to lower zones anyway, and
> + * then this check actually protects the lower zones
> + * from a flood of dirty page allocations.
> + *
> + * XXX: Allow allocations to potentially exceed the
> + * per-zone dirty limit in the slowpath before going
> + * into reclaim, which is important when NUMA nodes
> + * are not big enough to reach the global limit. The
> + * proper fix on these setups will require awareness
> + * of zones in the dirty-throttling and the flusher
> + * threads.
> + */
Here would be a good place to explain why we sometimes allow
__GFP_WRITE pages to fall back to lower zones. As it is, the reader
is required to remember that this affects LRU ordering and when/if
reclaim tries to write back the page.
> + if (!(alloc_flags & ALLOC_SLOWPATH) &&
> + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
> + goto this_zone_full;
>
> BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
> if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
> @@ -2111,7 +2131,7 @@ restart:
> * reclaim. Now things get more complex, so set up alloc_flags according
> * to how we want to proceed.
> */
> - alloc_flags = gfp_to_alloc_flags(gfp_mask);
> + alloc_flags = gfp_to_alloc_flags(gfp_mask) | ALLOC_SLOWPATH;
>
Instead of adding ALLOC_SLOWPATH, check for ALLOC_WMARK_LOW which is
only set in the fast path.
> /*
> * Find the true preferred zone if the allocation is unconstrained by
Functionally, I did not find a problem with the patch.
--
Mel Gorman
SUSE Labs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-20 13:45 ` Johannes Weiner
(?)
@ 2011-09-21 23:02 ` Andrew Morton
-1 siblings, 0 replies; 117+ messages in thread
From: Andrew Morton @ 2011-09-21 23:02 UTC (permalink / raw)
To: Johannes Weiner
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Tue, 20 Sep 2011 15:45:13 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
Across all zones, or across the zones within the node or what? Some
more description of how all this plays with NUMA is needed, please.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory
I don't know what this means. How can a global limit be controlled by
what is happening within each single zone? Please describe this design
concept fully.
> and allocations diverted to other zones
> when the limit is reached.
hm.
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> ...
>
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
>
> ...
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
Appears to return the number of pages in a particular zone which are
considered "dirtyable". Some discussion of how this decision is made
would be illuminating.
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
Whereabouts in the kernel is this policy implemented?
determine_dirtyable_memory()? It does (or can) consider highmem
pages? Comment seems wrong?
Should we rename determine_dirtyable_memory() to
global_dirtyable_memory(), to get some sense of its relationship with
zone_dirtyable_memory()?
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
>
> ...
>
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
flips the function between global_dirty_limits and zone_dirty_limits?
Would it be better if we actually had separate global_dirty_limits()
and zone_dirty_limits() rather than a magical mode?
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
>
> ...
>
> +bool zone_dirty_ok(struct zone *zone)
Full description of the return value, please.
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
We never needed to calculate &background_thresh. I wonder if that
matters.
>
> ...
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 23:02 ` Andrew Morton
0 siblings, 0 replies; 117+ messages in thread
From: Andrew Morton @ 2011-09-21 23:02 UTC (permalink / raw)
To: Johannes Weiner
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason, Minchan Kim
On Tue, 20 Sep 2011 15:45:13 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
Across all zones, or across the zones within the node or what? Some
more description of how all this plays with NUMA is needed, please.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory
I don't know what this means. How can a global limit be controlled by
what is happening within each single zone? Please describe this design
concept fully.
> and allocations diverted to other zones
> when the limit is reached.
hm.
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> ...
>
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
>
> ...
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
Appears to return the number of pages in a particular zone which are
considered "dirtyable". Some discussion of how this decision is made
would be illuminating.
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
Whereabouts in the kernel is this policy implemented?
determine_dirtyable_memory()? It does (or can) consider highmem
pages? Comment seems wrong?
Should we rename determine_dirtyable_memory() to
global_dirtyable_memory(), to get some sense of its relationship with
zone_dirtyable_memory()?
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
>
> ...
>
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
flips the function between global_dirty_limits and zone_dirty_limits?
Would it be better if we actually had separate global_dirty_limits()
and zone_dirty_limits() rather than a magical mode?
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
>
> ...
>
> +bool zone_dirty_ok(struct zone *zone)
Full description of the return value, please.
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
We never needed to calculate &background_thresh. I wonder if that
matters.
>
> ...
>
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-21 23:02 ` Andrew Morton
0 siblings, 0 replies; 117+ messages in thread
From: Andrew Morton @ 2011-09-21 23:02 UTC (permalink / raw)
To: Johannes Weiner
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Tue, 20 Sep 2011 15:45:13 +0200
Johannes Weiner <jweiner@redhat.com> wrote:
> This patch allows allocators to pass __GFP_WRITE when they know in
> advance that the allocated page will be written to and become dirty
> soon. The page allocator will then attempt to distribute those
> allocations across zones, such that no single zone will end up full of
> dirty, and thus more or less, unreclaimable pages.
Across all zones, or across the zones within the node or what? Some
more description of how all this plays with NUMA is needed, please.
> The global dirty limits are put in proportion to the respective zone's
> amount of dirtyable memory
I don't know what this means. How can a global limit be controlled by
what is happening within each single zone? Please describe this design
concept fully.
> and allocations diverted to other zones
> when the limit is reached.
hm.
> For now, the problem remains for NUMA configurations where the zones
> allowed for allocation are in sum not big enough to trigger the global
> dirty limits, but a future approach to solve this can reuse the
> per-zone dirty limit infrastructure laid out in this patch to have
> dirty throttling and the flusher threads consider individual zones.
>
> ...
>
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -36,6 +36,7 @@ struct vm_area_struct;
> #endif
> #define ___GFP_NO_KSWAPD 0x400000u
> #define ___GFP_OTHER_NODE 0x800000u
> +#define ___GFP_WRITE 0x1000000u
>
> /*
> * GFP bitmasks..
>
> ...
>
> +static unsigned long zone_dirtyable_memory(struct zone *zone)
Appears to return the number of pages in a particular zone which are
considered "dirtyable". Some discussion of how this decision is made
would be illuminating.
> +{
> + unsigned long x;
> + /*
> + * To keep a reasonable ratio between dirty memory and lowmem,
> + * highmem is not considered dirtyable on a global level.
Whereabouts in the kernel is this policy implemented?
determine_dirtyable_memory()? It does (or can) consider highmem
pages? Comment seems wrong?
Should we rename determine_dirtyable_memory() to
global_dirtyable_memory(), to get some sense of its relationship with
zone_dirtyable_memory()?
> + * But we allow individual highmem zones to hold a potentially
> + * bigger share of that global amount of dirty pages as long
> + * as they have enough free or reclaimable pages around.
> + */
> + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> + x += zone_reclaimable_pages(zone);
> + return x;
> +}
> +
>
> ...
>
> -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> +static void dirty_limits(struct zone *zone,
> + unsigned long *pbackground,
> + unsigned long *pdirty)
> {
> + unsigned long uninitialized_var(zone_memory);
> + unsigned long available_memory;
> + unsigned long global_memory;
> unsigned long background;
> - unsigned long dirty;
> - unsigned long uninitialized_var(available_memory);
> struct task_struct *tsk;
> + unsigned long dirty;
>
> - if (!vm_dirty_bytes || !dirty_background_bytes)
> - available_memory = determine_dirtyable_memory();
> + global_memory = determine_dirtyable_memory();
> + if (zone)
> + available_memory = zone_memory = zone_dirtyable_memory(zone);
> + else
> + available_memory = global_memory;
>
> - if (vm_dirty_bytes)
> + if (vm_dirty_bytes) {
> dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> - else
> + if (zone)
So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
flips the function between global_dirty_limits and zone_dirty_limits?
Would it be better if we actually had separate global_dirty_limits()
and zone_dirty_limits() rather than a magical mode?
> + dirty = dirty * zone_memory / global_memory;
> + } else
> dirty = (vm_dirty_ratio * available_memory) / 100;
>
> - if (dirty_background_bytes)
> + if (dirty_background_bytes) {
> background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> - else
> + if (zone)
> + background = background * zone_memory / global_memory;
> + } else
> background = (dirty_background_ratio * available_memory) / 100;
>
> if (background >= dirty)
>
> ...
>
> +bool zone_dirty_ok(struct zone *zone)
Full description of the return value, please.
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + dirty_limits(zone, &background_thresh, &dirty_thresh);
> +
> + return zone_page_state(zone, NR_FILE_DIRTY) +
> + zone_page_state(zone, NR_UNSTABLE_NFS) +
> + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> +}
We never needed to calculate &background_thresh. I wonder if that
matters.
>
> ...
>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
2011-09-21 23:02 ` Andrew Morton
(?)
@ 2011-09-22 8:52 ` Johannes Weiner
-1 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-22 8:52 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> On Tue, 20 Sep 2011 15:45:13 +0200
> Johannes Weiner <jweiner@redhat.com> wrote:
>
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
>
> Across all zones, or across the zones within the node or what? Some
> more description of how all this plays with NUMA is needed, please.
Across the zones the allocator considers for allocation, which on NUMA
is determined by the allocating task's NUMA memory policy.
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory
>
> I don't know what this means. How can a global limit be controlled by
> what is happening within each single zone? Please describe this design
> concept fully.
Yikes, it's mein English.
A zone's dirty limit is to the zone's contribution of dirtyable memory
what the global dirty limit is to the global amount of dirtyable
memory.
As a result, the sum of the dirty limits of all existing zones equals
the global dirty limit, such that no single zone receives more than
its fair share of the globally allowable dirty pages.
When the allocator tries to allocate from the list of allowable zones,
it skips those that have reached their maximum share of dirty pages.
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
>
> Appears to return the number of pages in a particular zone which are
> considered "dirtyable". Some discussion of how this decision is made
> would be illuminating.
Is the proportional relationship between zones and the global level a
satisfactory explanation?
Because I am looking for a central place to explain all this.
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
>
> Whereabouts in the kernel is this policy implemented?
> determine_dirtyable_memory()? It does (or can) consider highmem
> pages? Comment seems wrong?
Yes, in determine_dirtyable_memory().
It is possible to configure an unreasonable ratio between dirty memory
and lowmem with the vm_highmem_is_dirtyable sysctl. The point is that
even though highmem is subtracted from the effective amount of global
dirtyable memory again (which is strictly a big-picture measure), we
only care about the individual zone here and so highmem can very much
always hold dirty pages up to its dirty limit.
> Should we rename determine_dirtyable_memory() to
> global_dirtyable_memory(), to get some sense of its relationship with
> zone_dirtyable_memory()?
Sounds good.
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> >
> > ...
> >
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
>
> So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
> flips the function between global_dirty_limits and zone_dirty_limits?
Yes.
> Would it be better if we actually had separate global_dirty_limits()
> and zone_dirty_limits() rather than a magical mode?
I did that the first time around, but Mel raised the valid point that
this will be bad for maintainability.
The global dirty limit and the per-zone dirty limit are not only
incidentally calculated the same way, they are intentionally similar
in the geometrical sense (modulo workarounds for not having fp
arithmetic), so it would be good to keep this stuff together.
But the same applies to determine_dirtyable_memory() and
zone_dirtyable_memory(), so they should be done the same way and I
don't care too much which that would be.
If noone complains, I would structure the code such that
global_dirtyable_memory() and zone_dirtyable_memory(), as well as
global_dirty_limits() and zone_dirty_limits() are separate functions
next to each other with a big fat comment above that block explaining
the per-zone dirty limits and the proportional relationship to the
global parameters.
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> >
> > ...
> >
> > +bool zone_dirty_ok(struct zone *zone)
>
> Full description of the return value, please.
Returns false when the zone has reached its maximum share of the
global allowed dirty pages, true otherwise.
>
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
>
We never needed to calculate &background_thresh. I wonder if that
matters.
I didn't think dirty_limits() could take another branch, but if I
split up the function I will drop it. It's not rocket science and can
be easily added on demand.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-22 8:52 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-22 8:52 UTC (permalink / raw)
To: Andrew Morton
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason, Minchan Kim
On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> On Tue, 20 Sep 2011 15:45:13 +0200
> Johannes Weiner <jweiner@redhat.com> wrote:
>
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
>
> Across all zones, or across the zones within the node or what? Some
> more description of how all this plays with NUMA is needed, please.
Across the zones the allocator considers for allocation, which on NUMA
is determined by the allocating task's NUMA memory policy.
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory
>
> I don't know what this means. How can a global limit be controlled by
> what is happening within each single zone? Please describe this design
> concept fully.
Yikes, it's mein English.
A zone's dirty limit is to the zone's contribution of dirtyable memory
what the global dirty limit is to the global amount of dirtyable
memory.
As a result, the sum of the dirty limits of all existing zones equals
the global dirty limit, such that no single zone receives more than
its fair share of the globally allowable dirty pages.
When the allocator tries to allocate from the list of allowable zones,
it skips those that have reached their maximum share of dirty pages.
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
>
> Appears to return the number of pages in a particular zone which are
> considered "dirtyable". Some discussion of how this decision is made
> would be illuminating.
Is the proportional relationship between zones and the global level a
satisfactory explanation?
Because I am looking for a central place to explain all this.
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
>
> Whereabouts in the kernel is this policy implemented?
> determine_dirtyable_memory()? It does (or can) consider highmem
> pages? Comment seems wrong?
Yes, in determine_dirtyable_memory().
It is possible to configure an unreasonable ratio between dirty memory
and lowmem with the vm_highmem_is_dirtyable sysctl. The point is that
even though highmem is subtracted from the effective amount of global
dirtyable memory again (which is strictly a big-picture measure), we
only care about the individual zone here and so highmem can very much
always hold dirty pages up to its dirty limit.
> Should we rename determine_dirtyable_memory() to
> global_dirtyable_memory(), to get some sense of its relationship with
> zone_dirtyable_memory()?
Sounds good.
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> >
> > ...
> >
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
>
> So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
> flips the function between global_dirty_limits and zone_dirty_limits?
Yes.
> Would it be better if we actually had separate global_dirty_limits()
> and zone_dirty_limits() rather than a magical mode?
I did that the first time around, but Mel raised the valid point that
this will be bad for maintainability.
The global dirty limit and the per-zone dirty limit are not only
incidentally calculated the same way, they are intentionally similar
in the geometrical sense (modulo workarounds for not having fp
arithmetic), so it would be good to keep this stuff together.
But the same applies to determine_dirtyable_memory() and
zone_dirtyable_memory(), so they should be done the same way and I
don't care too much which that would be.
If no one complains, I would structure the code such that
global_dirtyable_memory() and zone_dirtyable_memory(), as well as
global_dirty_limits() and zone_dirty_limits() are separate functions
next to each other with a big fat comment above that block explaining
the per-zone dirty limits and the proportional relationship to the
global parameters.
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> >
> > ...
> >
> > +bool zone_dirty_ok(struct zone *zone)
>
> Full description of the return value, please.
Returns false when the zone has reached its maximum share of the
globally allowed dirty pages, true otherwise.
>
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
>
> We never needed to calculate &background_thresh. I wonder if that
> matters.
I didn't think dirty_limits() could take another branch, but if I
split up the function I will drop it. It's not rocket science and can
be easily added on demand.
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/4] mm: writeback: distribute write pages across allowable zones
@ 2011-09-22 8:52 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-22 8:52 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> On Tue, 20 Sep 2011 15:45:13 +0200
> Johannes Weiner <jweiner@redhat.com> wrote:
>
> > This patch allows allocators to pass __GFP_WRITE when they know in
> > advance that the allocated page will be written to and become dirty
> > soon. The page allocator will then attempt to distribute those
> > allocations across zones, such that no single zone will end up full of
> > dirty, and thus more or less, unreclaimable pages.
>
> Across all zones, or across the zones within the node or what? Some
> more description of how all this plays with NUMA is needed, please.
Across the zones the allocator considers for allocation, which on NUMA
is determined by the allocating task's NUMA memory policy.
> > The global dirty limits are put in proportion to the respective zone's
> > amount of dirtyable memory
>
> I don't know what this means. How can a global limit be controlled by
> what is happening within each single zone? Please describe this design
> concept fully.
Yikes, it's mein English.
A zone's dirty limit is to the zone's contribution of dirtyable memory
what the global dirty limit is to the global amount of dirtyable
memory.
As a result, the sum of the dirty limits of all existing zones equals
the global dirty limit, such that no single zone receives more than
its fair share of the globally allowable dirty pages.
When the allocator tries to allocate from the list of allowable zones,
it skips those that have reached their maximum share of dirty pages.
> > For now, the problem remains for NUMA configurations where the zones
> > allowed for allocation are in sum not big enough to trigger the global
> > dirty limits, but a future approach to solve this can reuse the
> > per-zone dirty limit infrastructure laid out in this patch to have
> > dirty throttling and the flusher threads consider individual zones.
> > +static unsigned long zone_dirtyable_memory(struct zone *zone)
>
> Appears to return the number of pages in a particular zone which are
> considered "dirtyable". Some discussion of how this decision is made
> would be illuminating.
Is the proportional relationship between zones and the global level a
satisfactory explanation?
Because I am looking for a central place to explain all this.
> > +{
> > + unsigned long x;
> > + /*
> > + * To keep a reasonable ratio between dirty memory and lowmem,
> > + * highmem is not considered dirtyable on a global level.
>
> Whereabouts in the kernel is this policy implemented?
> determine_dirtyable_memory()? It does (or can) consider highmem
> pages? Comment seems wrong?
Yes, in determine_dirtyable_memory().
It is possible to configure an unreasonable ratio between dirty memory
and lowmem with the vm_highmem_is_dirtyable sysctl. The point is that
even though highmem is subtracted from the effective amount of global
dirtyable memory again (which is strictly a big-picture measure), we
only care about the individual zone here and so highmem can very much
always hold dirty pages up to its dirty limit.
> Should we rename determine_dirtyable_memory() to
> global_dirtyable_memory(), to get some sense of its relationship with
> zone_dirtyable_memory()?
Sounds good.
> > + * But we allow individual highmem zones to hold a potentially
> > + * bigger share of that global amount of dirty pages as long
> > + * as they have enough free or reclaimable pages around.
> > + */
> > + x = zone_page_state(zone, NR_FREE_PAGES) - zone->totalreserve_pages;
> > + x += zone_reclaimable_pages(zone);
> > + return x;
> > +}
> > +
> >
> > ...
> >
> > -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> > +static void dirty_limits(struct zone *zone,
> > + unsigned long *pbackground,
> > + unsigned long *pdirty)
> > {
> > + unsigned long uninitialized_var(zone_memory);
> > + unsigned long available_memory;
> > + unsigned long global_memory;
> > unsigned long background;
> > - unsigned long dirty;
> > - unsigned long uninitialized_var(available_memory);
> > struct task_struct *tsk;
> > + unsigned long dirty;
> >
> > - if (!vm_dirty_bytes || !dirty_background_bytes)
> > - available_memory = determine_dirtyable_memory();
> > + global_memory = determine_dirtyable_memory();
> > + if (zone)
> > + available_memory = zone_memory = zone_dirtyable_memory(zone);
> > + else
> > + available_memory = global_memory;
> >
> > - if (vm_dirty_bytes)
> > + if (vm_dirty_bytes) {
> > dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
>
> So passing zone==NULL alters dirty_limits()'s behaviour. Seems that it
> flips the function between global_dirty_limits and zone_dirty_limits?
Yes.
> Would it be better if we actually had separate global_dirty_limits()
> and zone_dirty_limits() rather than a magical mode?
I did that the first time around, but Mel raised the valid point that
this will be bad for maintainability.
The global dirty limit and the per-zone dirty limit are not only
incidentally calculated the same way, they are intentionally similar
in the geometrical sense (modulo workarounds for not having fp
arithmetic), so it would be good to keep this stuff together.
But the same applies to determine_dirtyable_memory() and
zone_dirtyable_memory(), so they should be done the same way and I
don't care too much which that would be.
If no one complains, I would structure the code such that
global_dirtyable_memory() and zone_dirtyable_memory(), as well as
global_dirty_limits() and zone_dirty_limits() are separate functions
next to each other with a big fat comment above that block explaining
the per-zone dirty limits and the proportional relationship to the
global parameters.
> > + dirty = dirty * zone_memory / global_memory;
> > + } else
> > dirty = (vm_dirty_ratio * available_memory) / 100;
> >
> > - if (dirty_background_bytes)
> > + if (dirty_background_bytes) {
> > background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
> > - else
> > + if (zone)
> > + background = background * zone_memory / global_memory;
> > + } else
> > background = (dirty_background_ratio * available_memory) / 100;
> >
> > if (background >= dirty)
> >
> > ...
> >
> > +bool zone_dirty_ok(struct zone *zone)
>
> Full description of the return value, please.
Returns false when the zone has reached its maximum share of the
globally allowed dirty pages, true otherwise.
>
> > +{
> > + unsigned long background_thresh, dirty_thresh;
> > +
> > + dirty_limits(zone, &background_thresh, &dirty_thresh);
> > +
> > + return zone_page_state(zone, NR_FILE_DIRTY) +
> > + zone_page_state(zone, NR_UNSTABLE_NFS) +
> > + zone_page_state(zone, NR_WRITEBACK) <= dirty_thresh;
> > +}
>
> We never needed to calculate &background_thresh. I wonder if that
> matters.
I didn't think dirty_limits() could take another branch, but if I
split up the function I will drop it. It's not rocket science and can
be easily added on demand.
^ permalink raw reply [flat|nested] 117+ messages in thread
* [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
2011-09-22 8:52 ` Johannes Weiner
(?)
@ 2011-09-23 14:41 ` Johannes Weiner
-1 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:41 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > Should we rename determine_dirtyable_memory() to
> > global_dirtyable_memory(), to get some sense of its relationship with
> > zone_dirtyable_memory()?
>
> Sounds good.
---
The next patch will introduce per-zone dirty limiting functions in
addition to the traditional global dirty limiting.
Rename determine_dirtyable_memory() to global_dirtyable_memory()
before adding the zone-specific version, and fix up its documentation.
Also, move the functions to determine the dirtyable memory and the
function to calculate the dirty limit based on that together so that
their relationship is more apparent and that they can be commented on
as a group.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
mm/page-writeback.c | 92 +++++++++++++++++++++++++-------------------------
1 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c8acf8a..78604a6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -186,12 +186,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
}
/**
- * determine_dirtyable_memory - amount of memory that may be used
+ * global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
+ * Returns the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
*/
-static unsigned long determine_dirtyable_memory(void)
+static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
@@ -205,6 +205,47 @@ static unsigned long determine_dirtyable_memory(void)
}
/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ unsigned long background;
+ unsigned long dirty;
+ unsigned long uninitialized_var(available_memory);
+ struct task_struct *tsk;
+
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = global_dirtyable_memory();
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
+
+ if (background >= dirty)
+ background = dirty / 2;
+ tsk = current;
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ background += background / 4;
+ dirty += dirty / 4;
+ }
+ *pbackground = background;
+ *pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
+}
+
+/*
* couple the period to the dirty_ratio:
*
* period/2 ~ roundup_pow_of_two(dirty limit)
@@ -216,7 +257,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
@@ -416,47 +457,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
return max(thresh, global_dirty_limit);
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
- unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
- struct task_struct *tsk;
-
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
-
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
- dirty = (vm_dirty_ratio * available_memory) / 100;
-
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
- background = (dirty_background_ratio * available_memory) / 100;
-
- if (background >= dirty)
- background = dirty / 2;
- tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
- }
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
-}
-
/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
* @bdi: the backing_dev_info to query
--
1.7.6.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 117+ messages in thread
* [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-23 14:41 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:41 UTC (permalink / raw)
To: Andrew Morton
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason, Minchan Kim
On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > Should we rename determine_dirtyable_memory() to
> > global_dirtyable_memory(), to get some sense of its relationship with
> > zone_dirtyable_memory()?
>
> Sounds good.
---
The next patch will introduce per-zone dirty limiting functions in
addition to the traditional global dirty limiting.
Rename determine_dirtyable_memory() to global_dirtyable_memory()
before adding the zone-specific version, and fix up its documentation.
Also, move the functions to determine the dirtyable memory and the
function to calculate the dirty limit based on that together so that
their relationship is more apparent and that they can be commented on
as a group.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
mm/page-writeback.c | 92 +++++++++++++++++++++++++-------------------------
1 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c8acf8a..78604a6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -186,12 +186,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
}
/**
- * determine_dirtyable_memory - amount of memory that may be used
+ * global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
+ * Returns the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
*/
-static unsigned long determine_dirtyable_memory(void)
+static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
@@ -205,6 +205,47 @@ static unsigned long determine_dirtyable_memory(void)
}
/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ unsigned long background;
+ unsigned long dirty;
+ unsigned long uninitialized_var(available_memory);
+ struct task_struct *tsk;
+
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = global_dirtyable_memory();
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
+
+ if (background >= dirty)
+ background = dirty / 2;
+ tsk = current;
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ background += background / 4;
+ dirty += dirty / 4;
+ }
+ *pbackground = background;
+ *pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
+}
+
+/*
* couple the period to the dirty_ratio:
*
* period/2 ~ roundup_pow_of_two(dirty limit)
@@ -216,7 +257,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
@@ -416,47 +457,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
return max(thresh, global_dirty_limit);
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
- unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
- struct task_struct *tsk;
-
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
-
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
- dirty = (vm_dirty_ratio * available_memory) / 100;
-
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
- background = (dirty_background_ratio * available_memory) / 100;
-
- if (background >= dirty)
- background = dirty / 2;
- tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
- }
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
-}
-
/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
* @bdi: the backing_dev_info to query
--
1.7.6.2
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 117+ messages in thread
* [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-23 14:41 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:41 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > Should we rename determine_dirtyable_memory() to
> > global_dirtyable_memory(), to get some sense of its relationship with
> > zone_dirtyable_memory()?
>
> Sounds good.
---
The next patch will introduce per-zone dirty limiting functions in
addition to the traditional global dirty limiting.
Rename determine_dirtyable_memory() to global_dirtyable_memory()
before adding the zone-specific version, and fix up its documentation.
Also, move the functions to determine the dirtyable memory and the
function to calculate the dirty limit based on that together so that
their relationship is more apparent and that they can be commented on
as a group.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
mm/page-writeback.c | 92 +++++++++++++++++++++++++-------------------------
1 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c8acf8a..78604a6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -186,12 +186,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
}
/**
- * determine_dirtyable_memory - amount of memory that may be used
+ * global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the numebr of pages that can currently be freed and used
- * by the kernel for direct mappings.
+ * Returns the global number of pages potentially available for dirty
+ * page cache. This is the base value for the global dirty limits.
*/
-static unsigned long determine_dirtyable_memory(void)
+static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
@@ -205,6 +205,47 @@ static unsigned long determine_dirtyable_memory(void)
}
/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ unsigned long background;
+ unsigned long dirty;
+ unsigned long uninitialized_var(available_memory);
+ struct task_struct *tsk;
+
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = global_dirtyable_memory();
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
+
+ if (dirty_background_bytes)
+ background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ else
+ background = (dirty_background_ratio * available_memory) / 100;
+
+ if (background >= dirty)
+ background = dirty / 2;
+ tsk = current;
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ background += background / 4;
+ dirty += dirty / 4;
+ }
+ *pbackground = background;
+ *pdirty = dirty;
+ trace_global_dirty_state(background, dirty);
+}
+
+/*
* couple the period to the dirty_ratio:
*
* period/2 ~ roundup_pow_of_two(dirty limit)
@@ -216,7 +257,7 @@ static int calc_period_shift(void)
if (vm_dirty_bytes)
dirty_total = vm_dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
+ dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
100;
return 2 + ilog2(dirty_total - 1);
}
@@ -416,47 +457,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
return max(thresh, global_dirty_limit);
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
- *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
-{
- unsigned long background;
- unsigned long dirty;
- unsigned long uninitialized_var(available_memory);
- struct task_struct *tsk;
-
- if (!vm_dirty_bytes || !dirty_background_bytes)
- available_memory = determine_dirtyable_memory();
-
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else
- dirty = (vm_dirty_ratio * available_memory) / 100;
-
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
- else
- background = (dirty_background_ratio * available_memory) / 100;
-
- if (background >= dirty)
- background = dirty / 2;
- tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
- }
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
-}
-
/**
* bdi_dirty_limit - @bdi's share of dirty throttling threshold
* @bdi: the backing_dev_info to query
--
1.7.6.2
^ permalink raw reply related [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
2011-09-23 14:41 ` Johannes Weiner
(?)
@ 2011-09-28 5:57 ` Minchan Kim
-1 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:57 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-28 5:57 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:57 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Mel Gorman, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-28 5:57 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:57 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
2011-09-23 14:41 ` Johannes Weiner
(?)
@ 2011-09-28 9:27 ` Mel Gorman
-1 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:27 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mel@suse.de>
--
Mel Gorman
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-28 9:27 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:27 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Minchan Kim, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mel@suse.de>
--
Mel Gorman
SUSE Labs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 1/2/4] mm: writeback: cleanups in preparation for per-zone dirty limits
@ 2011-09-28 9:27 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:27 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:41:07PM +0200, Johannes Weiner wrote:
> On Thu, Sep 22, 2011 at 10:52:42AM +0200, Johannes Weiner wrote:
> > On Wed, Sep 21, 2011 at 04:02:26PM -0700, Andrew Morton wrote:
> > > Should we rename determine_dirtyable_memory() to
> > > global_dirtyable_memory(), to get some sense of its relationship with
> > > zone_dirtyable_memory()?
> >
> > Sounds good.
>
> ---
>
> The next patch will introduce per-zone dirty limiting functions in
> addition to the traditional global dirty limiting.
>
> Rename determine_dirtyable_memory() to global_dirtyable_memory()
> before adding the zone-specific version, and fix up its documentation.
>
> Also, move the functions to determine the dirtyable memory and the
> function to calculate the dirty limit based on that together so that
> their relationship is more apparent and that they can be commented on
> as a group.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mel@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 117+ messages in thread
* [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
2011-09-22 8:52 ` Johannes Weiner
(?)
@ 2011-09-23 14:42 ` Johannes Weiner
-1 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:42 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
The maximum number of dirty pages that exist in the system at any time
is determined by a number of pages considered dirtyable and a
user-configured percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all
the zones in the system minus their lowmem reserves and high
watermarks, so that the system can retain a healthy number of free
pages without having to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does
not care about the global state but rather the state of individual
memory zones. And right now there is nothing that prevents one zone
from filling up with dirty pages while other zones are spared, which
frequently leads to situations where kswapd, in order to restore the
watermark of free pages, does indeed have to write pages from that
zone's LRU list. This can interfere so badly with IO from the flusher
threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
requests from reclaim already, taking away the VM's only possibility
to keep such a zone balanced, aside from hoping the flushers will soon
clean pages from that zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory
what the global limit is to the global amount of dirtyable memory, and
try to make sure that no single zone receives more than its fair share
of the globally allowed dirty pages in the first place. As the number
of pages considered dirtyable exclude the zones' lowmem reserves and
high watermarks, the maximum number of dirty pages in a zone is such
that the zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to
pass __GFP_WRITE when they know in advance that the page will be
written to and become dirty soon. The page allocator will then
attempt to allocate from the first zone of the zonelist - which on
NUMA is determined by the task's NUMA memory policy - that has not
exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads
that previously could fit their dirty pages completely in the higher
zone may be forced to allocate from lower zones, but the amount of
pages that 'spill over' are limited themselves by the lower zones'
dirty constraints, and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for
NUMA configurations where the zones allowed for allocation are in sum
not big enough to trigger the global dirty limits, wake up the flusher
threads and remedy the situation. Because of this, an allocation that
could not succeed on any of the considered zones is allowed to ignore
the dirty limits before going into direct reclaim or even failing the
allocation, until a future patch changes the global dirty throttling
and flusher thread activation so that they take individual zone states
into account.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
include/linux/gfp.h | 4 ++-
include/linux/writeback.h | 1 +
mm/page-writeback.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 29 ++++++++++++++++
4 files changed, 116 insertions(+), 1 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
+#define ___GFP_WRITE 0x1000000u
/*
* GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
* This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
extern unsigned long global_dirty_limit;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 78604a6..f60fd57 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -159,6 +159,25 @@ static struct prop_descriptor vm_dirties;
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -245,6 +264,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
trace_global_dirty_state(background, dirty);
}
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ /*
+ * The effective global number of dirtyable pages may exclude
+ * highmem as a big-picture measure to keep the ratio between
+ * dirty memory and lowmem reasonable.
+ *
+ * But this function is purely about the individual zone and a
+ * highmem zone can hold its share of dirty pages, so we don't
+ * care about vm_highmem_is_dirtyable here.
+ */
+ return zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone) -
+ zone->dirty_balance_reserve;
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+ unsigned long zone_memory = zone_dirtyable_memory(zone);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ zone_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * zone_memory / 100;
+
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long limit = zone_dirty_limit(zone);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
+
/*
* couple the period to the dirty_ratio:
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8cba89..afaf59e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1675,6 +1675,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
--
1.7.6.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 117+ messages in thread
* [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-23 14:42 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:42 UTC (permalink / raw)
To: Andrew Morton
Cc: Rik van Riel, linux-ext4, Jan Kara, linux-btrfs, linux-kernel,
xfs, Christoph Hellwig, linux-mm, Andreas Dilger, Mel Gorman,
linux-fsdevel, Theodore Ts'o, Andrew Morton, Wu Fengguang,
Chris Mason, Minchan Kim
The maximum number of dirty pages that exist in the system at any time
is determined by a number of pages considered dirtyable and a
user-configured percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all
the zones in the system minus their lowmem reserves and high
watermarks, so that the system can retain a healthy number of free
pages without having to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does
not care about the global state but rather the state of individual
memory zones. And right now there is nothing that prevents one zone
from filling up with dirty pages while other zones are spared, which
frequently leads to situations where kswapd, in order to restore the
watermark of free pages, does indeed have to write pages from that
zone's LRU list. This can interfere so badly with IO from the flusher
threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
requests from reclaim already, taking away the VM's only possibility
to keep such a zone balanced, aside from hoping the flushers will soon
clean pages from that zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory
what the global limit is to the global amount of dirtyable memory, and
try to make sure that no single zone receives more than its fair share
of the globally allowed dirty pages in the first place. As the number
of pages considered dirtyable exclude the zones' lowmem reserves and
high watermarks, the maximum number of dirty pages in a zone is such
that the zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to
pass __GFP_WRITE when they know in advance that the page will be
written to and become dirty soon. The page allocator will then
attempt to allocate from the first zone of the zonelist - which on
NUMA is determined by the task's NUMA memory policy - that has not
exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads
that previously could fit their dirty pages completely in the higher
zone may be forced to allocate from lower zones, but the amount of
pages that 'spill over' are limited themselves by the lower zones'
dirty constraints, and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for
NUMA configurations where the zones allowed for allocation are in sum
not big enough to trigger the global dirty limits, wake up the flusher
threads and remedy the situation. Because of this, an allocation that
could not succeed on any of the considered zones is allowed to ignore
the dirty limits before going into direct reclaim or even failing the
allocation, until a future patch changes the global dirty throttling
and flusher thread activation so that they take individual zone states
into account.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
include/linux/gfp.h | 4 ++-
include/linux/writeback.h | 1 +
mm/page-writeback.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 29 ++++++++++++++++
4 files changed, 116 insertions(+), 1 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
+#define ___GFP_WRITE 0x1000000u
/*
* GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
* This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
extern unsigned long global_dirty_limit;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 78604a6..f60fd57 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -159,6 +159,25 @@ static struct prop_descriptor vm_dirties;
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -245,6 +264,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
trace_global_dirty_state(background, dirty);
}
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ /*
+ * The effective global number of dirtyable pages may exclude
+ * highmem as a big-picture measure to keep the ratio between
+ * dirty memory and lowmem reasonable.
+ *
+ * But this function is purely about the individual zone and a
+ * highmem zone can hold its share of dirty pages, so we don't
+ * care about vm_highmem_is_dirtyable here.
+ */
+ return zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone) -
+ zone->dirty_balance_reserve;
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+ unsigned long zone_memory = zone_dirtyable_memory(zone);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ zone_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * zone_memory / 100;
+
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long limit = zone_dirty_limit(zone);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
+
/*
* couple the period to the dirty_ratio:
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8cba89..afaf59e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1675,6 +1675,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
--
1.7.6.2
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply related [flat|nested] 117+ messages in thread
* [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-23 14:42 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-23 14:42 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
The maximum number of dirty pages that exist in the system at any time
is determined by a number of pages considered dirtyable and a
user-configured percentage of those, or an absolute number in bytes.
This number of dirtyable pages is the sum of memory provided by all
the zones in the system minus their lowmem reserves and high
watermarks, so that the system can retain a healthy number of free
pages without having to reclaim dirty pages.
But there is a flaw in that we have a zoned page allocator which does
not care about the global state but rather the state of individual
memory zones. And right now there is nothing that prevents one zone
from filling up with dirty pages while other zones are spared, which
frequently leads to situations where kswapd, in order to restore the
watermark of free pages, does indeed have to write pages from that
zone's LRU list. This can interfere so badly with IO from the flusher
threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
requests from reclaim already, taking away the VM's only possibility
to keep such a zone balanced, aside from hoping the flushers will soon
clean pages from that zone.
Enter per-zone dirty limits. They are to a zone's dirtyable memory
what the global limit is to the global amount of dirtyable memory, and
try to make sure that no single zone receives more than its fair share
of the globally allowed dirty pages in the first place. As the number
of pages considered dirtyable exclude the zones' lowmem reserves and
high watermarks, the maximum number of dirty pages in a zone is such
that the zone can always be balanced without requiring page cleaning.
As this is a placement decision in the page allocator and pages are
dirtied only after the allocation, this patch allows allocators to
pass __GFP_WRITE when they know in advance that the page will be
written to and become dirty soon. The page allocator will then
attempt to allocate from the first zone of the zonelist - which on
NUMA is determined by the task's NUMA memory policy - that has not
exceeded its dirty limit.
At first glance, it would appear that the diversion to lower zones can
increase pressure on them, but this is not the case. With a full high
zone, allocations will be diverted to lower zones eventually, so it is
more of a shift in timing of the lower zone allocations. Workloads
that previously could fit their dirty pages completely in the higher
zone may be forced to allocate from lower zones, but the amount of
pages that 'spill over' are limited themselves by the lower zones'
dirty constraints, and thus unlikely to become a problem.
For now, the problem of unfair dirty page distribution remains for
NUMA configurations where the zones allowed for allocation are in sum
not big enough to trigger the global dirty limits, wake up the flusher
threads and remedy the situation. Because of this, an allocation that
could not succeed on any of the considered zones is allowed to ignore
the dirty limits before going into direct reclaim or even failing the
allocation, until a future patch changes the global dirty throttling
and flusher thread activation so that they take individual zone states
into account.
Signed-off-by: Johannes Weiner <jweiner@redhat.com>
---
include/linux/gfp.h | 4 ++-
include/linux/writeback.h | 1 +
mm/page-writeback.c | 83 +++++++++++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 29 ++++++++++++++++
4 files changed, 116 insertions(+), 1 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 3a76faf..50efc7e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,6 +36,7 @@ struct vm_area_struct;
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
+#define ___GFP_WRITE 0x1000000u
/*
* GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
+#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
* This may seem redundant, but it's a way of annotating false positives vs.
@@ -92,7 +94,7 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a5f495f..c96ee0c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -104,6 +104,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
+bool zone_dirty_ok(struct zone *zone);
extern unsigned long global_dirty_limit;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 78604a6..f60fd57 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -159,6 +159,25 @@ static struct prop_descriptor vm_dirties;
* We make sure that the background writeout level is below the adjusted
* clamping level.
*/
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied. Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -245,6 +264,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
trace_global_dirty_state(background, dirty);
}
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ /*
+ * The effective global number of dirtyable pages may exclude
+ * highmem as a big-picture measure to keep the ratio between
+ * dirty memory and lowmem reasonable.
+ *
+ * But this function is purely about the individual zone and a
+ * highmem zone can hold its share of dirty pages, so we don't
+ * care about vm_highmem_is_dirtyable here.
+ */
+ return zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone) -
+ zone->dirty_balance_reserve;
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */
+static unsigned long zone_dirty_limit(struct zone *zone)
+{
+ unsigned long zone_memory = zone_dirtyable_memory(zone);
+ struct task_struct *tsk = current;
+ unsigned long dirty;
+
+ if (vm_dirty_bytes)
+ dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
+ zone_memory / global_dirtyable_memory();
+ else
+ dirty = vm_dirty_ratio * zone_memory / 100;
+
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ dirty += dirty / 4;
+
+ return dirty;
+}
+
+/**
+ * zone_dirty_ok - tells whether a zone is within its dirty limits
+ * @zone: the zone to check
+ *
+ * Returns %true when the dirty pages in @zone are within the zone's
+ * dirty limit, %false if the limit is exceeded.
+ */
+bool zone_dirty_ok(struct zone *zone)
+{
+ unsigned long limit = zone_dirty_limit(zone);
+
+ return zone_page_state(zone, NR_FILE_DIRTY) +
+ zone_page_state(zone, NR_UNSTABLE_NFS) +
+ zone_page_state(zone, NR_WRITEBACK) <= limit;
+}
+
/*
* couple the period to the dirty_ratio:
*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8cba89..afaf59e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1675,6 +1675,35 @@ zonelist_scan:
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
+ /*
+ * When allocating a page cache page for writing, we
+ * want to get it from a zone that is within its dirty
+ * limit, such that no single zone holds more than its
+ * proportional share of globally allowed dirty pages.
+ * The dirty limits take into account the zone's
+ * lowmem reserves and high watermark so that kswapd
+ * should be able to balance it without having to
+ * write pages from its LRU list.
+ *
+ * This may look like it could increase pressure on
+ * lower zones by failing allocations in higher zones
+ * before they are full. But the pages that do spill
+ * over are limited as the lower zones are protected
+ * by this very same mechanism. It should not become
+ * a practical burden to them.
+ *
+ * XXX: For now, allow allocations to potentially
+ * exceed the per-zone dirty limit in the slowpath
+ * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * which is important when on a NUMA setup the allowed
+ * zones are together not big enough to reach the
+ * global limit. The proper fix for these situations
+ * will require awareness of zones in the
+ * dirty-throttling and the flusher threads.
+ */
+ if ((alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+ goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
--
1.7.6.2
^ permalink raw reply related [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
2011-09-23 14:42 ` Johannes Weiner
(?)
@ 2011-09-28 5:56 ` Minchan Kim
-1 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:56 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
It's an explanation of the old approach.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
It's an explanation of the new approach.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
It's an explanation of the old approach, again!
Shouldn't we move the above phrase about the new approach below?
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
That's a good justification.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Otherwise, looks good to me.
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 5:56 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:56 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
It's an explanation of the old approach.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
It's an explanation of the new approach.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
It's an explanation of the old approach, again!
Shouldn't we move the above phrase about the new approach below?
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
That's a good justification.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Otherwise, looks good to me.
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 5:56 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 5:56 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Mel Gorman, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
It's an explanation of the old approach.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
It's an explanation of the new approach.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
It's an explanation of the old approach, again!
Shouldn't we move the above phrase about the new approach below?
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
That's a good justification.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Otherwise, looks good to me.
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
--
Kind regards,
Minchan Kim
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
2011-09-28 5:56 ` Minchan Kim
(?)
@ 2011-09-28 7:11 ` Johannes Weiner
-1 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-28 7:11 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > The maximum number of dirty pages that exist in the system at any time
> > is determined by a number of pages considered dirtyable and a
> > user-configured percentage of those, or an absolute number in bytes.
>
> It's explanation of old approach.
What do you mean? This does not change with this patch. We still
have a number of dirtyable pages and a limit that is applied
relatively to this number.
> > This number of dirtyable pages is the sum of memory provided by all
> > the zones in the system minus their lowmem reserves and high
> > watermarks, so that the system can retain a healthy number of free
> > pages without having to reclaim dirty pages.
>
> It's a explanation of new approach.
Same here, this aspect is also not changed with this patch!
> > But there is a flaw in that we have a zoned page allocator which does
> > not care about the global state but rather the state of individual
> > memory zones. And right now there is nothing that prevents one zone
> > from filling up with dirty pages while other zones are spared, which
> > frequently leads to situations where kswapd, in order to restore the
> > watermark of free pages, does indeed have to write pages from that
> > zone's LRU list. This can interfere so badly with IO from the flusher
> > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > requests from reclaim already, taking away the VM's only possibility
> > to keep such a zone balanced, aside from hoping the flushers will soon
> > clean pages from that zone.
>
> It's a explanation of old approach, again!
> Shoudn't we move above phrase of new approach into below?
Everything above describes the current behaviour (at the point of this
patch, so respecting lowmem_reserve e.g. is part of the current
behaviour by now) and its problems. And below follows a description
of how the patch tries to fix it.
> > Enter per-zone dirty limits. They are to a zone's dirtyable memory
> > what the global limit is to the global amount of dirtyable memory, and
> > try to make sure that no single zone receives more than its fair share
> > of the globally allowed dirty pages in the first place. As the number
> > of pages considered dirtyable exclude the zones' lowmem reserves and
> > high watermarks, the maximum number of dirty pages in a zone is such
> > that the zone can always be balanced without requiring page cleaning.
> >
> > As this is a placement decision in the page allocator and pages are
> > dirtied only after the allocation, this patch allows allocators to
> > pass __GFP_WRITE when they know in advance that the page will be
> > written to and become dirty soon. The page allocator will then
> > attempt to allocate from the first zone of the zonelist - which on
> > NUMA is determined by the task's NUMA memory policy - that has not
> > exceeded its dirty limit.
> >
> > At first glance, it would appear that the diversion to lower zones can
> > increase pressure on them, but this is not the case. With a full high
> > zone, allocations will be diverted to lower zones eventually, so it is
> > more of a shift in timing of the lower zone allocations. Workloads
> > that previously could fit their dirty pages completely in the higher
> > zone may be forced to allocate from lower zones, but the amount of
> > pages that 'spill over' are limited themselves by the lower zones'
> > dirty constraints, and thus unlikely to become a problem.
>
> That's a good justification.
>
> > For now, the problem of unfair dirty page distribution remains for
> > NUMA configurations where the zones allowed for allocation are in sum
> > not big enough to trigger the global dirty limits, wake up the flusher
> > threads and remedy the situation. Because of this, an allocation that
> > could not succeed on any of the considered zones is allowed to ignore
> > the dirty limits before going into direct reclaim or even failing the
> > allocation, until a future patch changes the global dirty throttling
> > and flusher thread activation so that they take individual zone states
> > into account.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
>
> Otherwise, looks good to me.
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Thanks!
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 7:11 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-28 7:11 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Mel Gorman, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > The maximum number of dirty pages that exist in the system at any time
> > is determined by a number of pages considered dirtyable and a
> > user-configured percentage of those, or an absolute number in bytes.
>
> It's explanation of old approach.
What do you mean? This does not change with this patch. We still
have a number of dirtyable pages and a limit that is applied
relatively to this number.
> > This number of dirtyable pages is the sum of memory provided by all
> > the zones in the system minus their lowmem reserves and high
> > watermarks, so that the system can retain a healthy number of free
> > pages without having to reclaim dirty pages.
>
> It's a explanation of new approach.
Same here, this aspect is also not changed with this patch!
> > But there is a flaw in that we have a zoned page allocator which does
> > not care about the global state but rather the state of individual
> > memory zones. And right now there is nothing that prevents one zone
> > from filling up with dirty pages while other zones are spared, which
> > frequently leads to situations where kswapd, in order to restore the
> > watermark of free pages, does indeed have to write pages from that
> > zone's LRU list. This can interfere so badly with IO from the flusher
> > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > requests from reclaim already, taking away the VM's only possibility
> > to keep such a zone balanced, aside from hoping the flushers will soon
> > clean pages from that zone.
>
> It's a explanation of old approach, again!
> Shoudn't we move above phrase of new approach into below?
Everything above describes the current behaviour (at the point of this
patch, so respecting lowmem_reserve e.g. is part of the current
behaviour by now) and its problems. And below follows a description
of how the patch tries to fix it.
> > Enter per-zone dirty limits. They are to a zone's dirtyable memory
> > what the global limit is to the global amount of dirtyable memory, and
> > try to make sure that no single zone receives more than its fair share
> > of the globally allowed dirty pages in the first place. As the number
> > of pages considered dirtyable exclude the zones' lowmem reserves and
> > high watermarks, the maximum number of dirty pages in a zone is such
> > that the zone can always be balanced without requiring page cleaning.
> >
> > As this is a placement decision in the page allocator and pages are
> > dirtied only after the allocation, this patch allows allocators to
> > pass __GFP_WRITE when they know in advance that the page will be
> > written to and become dirty soon. The page allocator will then
> > attempt to allocate from the first zone of the zonelist - which on
> > NUMA is determined by the task's NUMA memory policy - that has not
> > exceeded its dirty limit.
> >
> > At first glance, it would appear that the diversion to lower zones can
> > increase pressure on them, but this is not the case. With a full high
> > zone, allocations will be diverted to lower zones eventually, so it is
> > more of a shift in timing of the lower zone allocations. Workloads
> > that previously could fit their dirty pages completely in the higher
> > zone may be forced to allocate from lower zones, but the amount of
> > pages that 'spill over' are limited themselves by the lower zones'
> > dirty constraints, and thus unlikely to become a problem.
>
> That's a good justification.
>
> > For now, the problem of unfair dirty page distribution remains for
> > NUMA configurations where the zones allowed for allocation are in sum
> > not big enough to trigger the global dirty limits, wake up the flusher
> > threads and remedy the situation. Because of this, an allocation that
> > could not succeed on any of the considered zones is allowed to ignore
> > the dirty limits before going into direct reclaim or even failing the
> > allocation, until a future patch changes the global dirty throttling
> > and flusher thread activation so that they take individual zone states
> > into account.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
>
> Otherwise, looks good to me.
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Thanks!
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 7:11 ` Johannes Weiner
0 siblings, 0 replies; 117+ messages in thread
From: Johannes Weiner @ 2011-09-28 7:11 UTC (permalink / raw)
To: Minchan Kim
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > The maximum number of dirty pages that exist in the system at any time
> > is determined by a number of pages considered dirtyable and a
> > user-configured percentage of those, or an absolute number in bytes.
>
> It's an explanation of the old approach.
What do you mean? This does not change with this patch. We still
have a number of dirtyable pages and a limit that is applied
relatively to this number.
> > This number of dirtyable pages is the sum of memory provided by all
> > the zones in the system minus their lowmem reserves and high
> > watermarks, so that the system can retain a healthy number of free
> > pages without having to reclaim dirty pages.
>
> It's an explanation of the new approach.
Same here, this aspect is also not changed with this patch!
> > But there is a flaw in that we have a zoned page allocator which does
> > not care about the global state but rather the state of individual
> > memory zones. And right now there is nothing that prevents one zone
> > from filling up with dirty pages while other zones are spared, which
> > frequently leads to situations where kswapd, in order to restore the
> > watermark of free pages, does indeed have to write pages from that
> > zone's LRU list. This can interfere so badly with IO from the flusher
> > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > requests from reclaim already, taking away the VM's only possibility
> > to keep such a zone balanced, aside from hoping the flushers will soon
> > clean pages from that zone.
>
> It's an explanation of the old approach, again!
> Shouldn't we move the above phrase about the new approach down below?
Everything above describes the current behaviour (at the point of this
patch, so respecting lowmem_reserve e.g. is part of the current
behaviour by now) and its problems. And below follows a description
of how the patch tries to fix it.
> > Enter per-zone dirty limits. They are to a zone's dirtyable memory
> > what the global limit is to the global amount of dirtyable memory, and
> > try to make sure that no single zone receives more than its fair share
> > of the globally allowed dirty pages in the first place. As the number
> > of pages considered dirtyable exclude the zones' lowmem reserves and
> > high watermarks, the maximum number of dirty pages in a zone is such
> > that the zone can always be balanced without requiring page cleaning.
> >
> > As this is a placement decision in the page allocator and pages are
> > dirtied only after the allocation, this patch allows allocators to
> > pass __GFP_WRITE when they know in advance that the page will be
> > written to and become dirty soon. The page allocator will then
> > attempt to allocate from the first zone of the zonelist - which on
> > NUMA is determined by the task's NUMA memory policy - that has not
> > exceeded its dirty limit.
> >
> > At first glance, it would appear that the diversion to lower zones can
> > increase pressure on them, but this is not the case. With a full high
> > zone, allocations will be diverted to lower zones eventually, so it is
> > more of a shift in timing of the lower zone allocations. Workloads
> > that previously could fit their dirty pages completely in the higher
> > zone may be forced to allocate from lower zones, but the amount of
> > pages that 'spill over' are limited themselves by the lower zones'
> > dirty constraints, and thus unlikely to become a problem.
>
> That's a good justification.
>
> > For now, the problem of unfair dirty page distribution remains for
> > NUMA configurations where the zones allowed for allocation are in sum
> > not big enough to trigger the global dirty limits, wake up the flusher
> > threads and remedy the situation. Because of this, an allocation that
> > could not succeed on any of the considered zones is allowed to ignore
> > the dirty limits before going into direct reclaim or even failing the
> > allocation, until a future patch changes the global dirty throttling
> > and flusher thread activation so that they take individual zone states
> > into account.
> >
> > Signed-off-by: Johannes Weiner <jweiner@redhat.com>
>
> Otherwise, looks good to me.
> Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Thanks!
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
2011-09-28 7:11 ` Johannes Weiner
(?)
@ 2011-09-28 18:09 ` Minchan Kim
-1 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 18:09 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 28, 2011 at 09:11:54AM +0200, Johannes Weiner wrote:
> On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> > On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > > The maximum number of dirty pages that exist in the system at any time
> > > is determined by a number of pages considered dirtyable and a
> > > user-configured percentage of those, or an absolute number in bytes.
> >
> > It's an explanation of the old approach.
>
> What do you mean? This does not change with this patch. We still
> have a number of dirtyable pages and a limit that is applied
> relatively to this number.
>
> > > This number of dirtyable pages is the sum of memory provided by all
> > > the zones in the system minus their lowmem reserves and high
> > > watermarks, so that the system can retain a healthy number of free
> > > pages without having to reclaim dirty pages.
> >
> > It's an explanation of the new approach.
>
> Same here, this aspect is also not changed with this patch!
>
> > > But there is a flaw in that we have a zoned page allocator which does
> > > not care about the global state but rather the state of individual
> > > memory zones. And right now there is nothing that prevents one zone
> > > from filling up with dirty pages while other zones are spared, which
> > > frequently leads to situations where kswapd, in order to restore the
> > > watermark of free pages, does indeed have to write pages from that
> > > zone's LRU list. This can interfere so badly with IO from the flusher
> > > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > > requests from reclaim already, taking away the VM's only possibility
> > > to keep such a zone balanced, aside from hoping the flushers will soon
> > > clean pages from that zone.
> >
> > It's an explanation of the old approach, again!
> > Shouldn't we move the above phrase about the new approach down below?
>
> Everything above describes the current behaviour (at the point of this
> patch, so respecting lowmem_reserve e.g. is part of the current
> behaviour by now) and its problems. And below follows a description
> of how the patch tries to fix it.
It seems that it's not a good choice to use "old" and "new" terms.
Hannes, please ignore, it's not a biggie.
--
Kind regards,
Minchan Kim
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 18:09 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 18:09 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Mel Gorman, Christoph Hellwig, Dave Chinner,
Wu Fengguang, Jan Kara, Rik van Riel, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Wed, Sep 28, 2011 at 09:11:54AM +0200, Johannes Weiner wrote:
> On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> > On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > > The maximum number of dirty pages that exist in the system at any time
> > > is determined by a number of pages considered dirtyable and a
> > > user-configured percentage of those, or an absolute number in bytes.
> >
> > It's an explanation of the old approach.
>
> What do you mean? This does not change with this patch. We still
> have a number of dirtyable pages and a limit that is applied
> relatively to this number.
>
> > > This number of dirtyable pages is the sum of memory provided by all
> > > the zones in the system minus their lowmem reserves and high
> > > watermarks, so that the system can retain a healthy number of free
> > > pages without having to reclaim dirty pages.
> >
> > It's an explanation of the new approach.
>
> Same here, this aspect is also not changed with this patch!
>
> > > But there is a flaw in that we have a zoned page allocator which does
> > > not care about the global state but rather the state of individual
> > > memory zones. And right now there is nothing that prevents one zone
> > > from filling up with dirty pages while other zones are spared, which
> > > frequently leads to situations where kswapd, in order to restore the
> > > watermark of free pages, does indeed have to write pages from that
> > > zone's LRU list. This can interfere so badly with IO from the flusher
> > > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > > requests from reclaim already, taking away the VM's only possibility
> > > to keep such a zone balanced, aside from hoping the flushers will soon
> > > clean pages from that zone.
> >
> > It's an explanation of the old approach, again!
> > Shouldn't we move the above phrase about the new approach down below?
>
> Everything above describes the current behaviour (at the point of this
> patch, so respecting lowmem_reserve e.g. is part of the current
> behaviour by now) and its problems. And below follows a description
> of how the patch tries to fix it.
It seems that it's not a good choice to use "old" and "new" terms.
Hannes, please ignore, it's not a biggie.
--
Kind regards,
Minchan Kim
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 18:09 ` Minchan Kim
0 siblings, 0 replies; 117+ messages in thread
From: Minchan Kim @ 2011-09-28 18:09 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Mel Gorman, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Wed, Sep 28, 2011 at 09:11:54AM +0200, Johannes Weiner wrote:
> On Wed, Sep 28, 2011 at 02:56:40PM +0900, Minchan Kim wrote:
> > On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> > > The maximum number of dirty pages that exist in the system at any time
> > > is determined by a number of pages considered dirtyable and a
> > > user-configured percentage of those, or an absolute number in bytes.
> >
> > It's an explanation of the old approach.
>
> What do you mean? This does not change with this patch. We still
> have a number of dirtyable pages and a limit that is applied
> relatively to this number.
>
> > > This number of dirtyable pages is the sum of memory provided by all
> > > the zones in the system minus their lowmem reserves and high
> > > watermarks, so that the system can retain a healthy number of free
> > > pages without having to reclaim dirty pages.
> >
> > It's an explanation of the new approach.
>
> Same here, this aspect is also not changed with this patch!
>
> > > But there is a flaw in that we have a zoned page allocator which does
> > > not care about the global state but rather the state of individual
> > > memory zones. And right now there is nothing that prevents one zone
> > > from filling up with dirty pages while other zones are spared, which
> > > frequently leads to situations where kswapd, in order to restore the
> > > watermark of free pages, does indeed have to write pages from that
> > > zone's LRU list. This can interfere so badly with IO from the flusher
> > > threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> > > requests from reclaim already, taking away the VM's only possibility
> > > to keep such a zone balanced, aside from hoping the flushers will soon
> > > clean pages from that zone.
> >
> > It's an explanation of the old approach, again!
> > Shouldn't we move the above phrase about the new approach down below?
>
> Everything above describes the current behaviour (at the point of this
> patch, so respecting lowmem_reserve e.g. is part of the current
> behaviour by now) and its problems. And below follows a description
> of how the patch tries to fix it.
It seems that it's not a good choice to use "old" and "new" terms.
Hannes, please ignore, it's not a biggie.
--
Kind regards,
Minchan Kim
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
2011-09-23 14:42 ` Johannes Weiner
(?)
@ 2011-09-28 9:36 ` Mel Gorman
-1 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 9:36 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Christoph Hellwig, Dave Chinner, Wu Fengguang,
Jan Kara, Rik van Riel, Minchan Kim, Chris Mason,
Theodore Ts'o, Andreas Dilger, xfs, linux-btrfs, linux-ext4,
linux-mm, linux-fsdevel, linux-kernel, Andrew Morton
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 117+ messages in thread
* Re: [patch 2/2/4] mm: try to distribute dirty pages fairly across zones
@ 2011-09-28 9:36 ` Mel Gorman
0 siblings, 0 replies; 117+ messages in thread
From: Mel Gorman @ 2011-09-28 9:36 UTC (permalink / raw)
To: Johannes Weiner
Cc: Andrew Morton, Rik van Riel, linux-ext4, Jan Kara, linux-btrfs,
linux-kernel, xfs, Christoph Hellwig, linux-mm, Andreas Dilger,
Minchan Kim, linux-fsdevel, Theodore Ts'o, Andrew Morton,
Wu Fengguang, Chris Mason
On Fri, Sep 23, 2011 at 04:42:48PM +0200, Johannes Weiner wrote:
> The maximum number of dirty pages that exist in the system at any time
> is determined by a number of pages considered dirtyable and a
> user-configured percentage of those, or an absolute number in bytes.
>
> This number of dirtyable pages is the sum of memory provided by all
> the zones in the system minus their lowmem reserves and high
> watermarks, so that the system can retain a healthy number of free
> pages without having to reclaim dirty pages.
>
> But there is a flaw in that we have a zoned page allocator which does
> not care about the global state but rather the state of individual
> memory zones. And right now there is nothing that prevents one zone
> from filling up with dirty pages while other zones are spared, which
> frequently leads to situations where kswapd, in order to restore the
> watermark of free pages, does indeed have to write pages from that
> zone's LRU list. This can interfere so badly with IO from the flusher
> threads that major filesystems (btrfs, xfs, ext4) mostly ignore write
> requests from reclaim already, taking away the VM's only possibility
> to keep such a zone balanced, aside from hoping the flushers will soon
> clean pages from that zone.
>
> Enter per-zone dirty limits. They are to a zone's dirtyable memory
> what the global limit is to the global amount of dirtyable memory, and
> try to make sure that no single zone receives more than its fair share
> of the globally allowed dirty pages in the first place. As the number
> of pages considered dirtyable exclude the zones' lowmem reserves and
> high watermarks, the maximum number of dirty pages in a zone is such
> that the zone can always be balanced without requiring page cleaning.
>
> As this is a placement decision in the page allocator and pages are
> dirtied only after the allocation, this patch allows allocators to
> pass __GFP_WRITE when they know in advance that the page will be
> written to and become dirty soon. The page allocator will then
> attempt to allocate from the first zone of the zonelist - which on
> NUMA is determined by the task's NUMA memory policy - that has not
> exceeded its dirty limit.
>
> At first glance, it would appear that the diversion to lower zones can
> increase pressure on them, but this is not the case. With a full high
> zone, allocations will be diverted to lower zones eventually, so it is
> more of a shift in timing of the lower zone allocations. Workloads
> that previously could fit their dirty pages completely in the higher
> zone may be forced to allocate from lower zones, but the amount of
> pages that 'spill over' are limited themselves by the lower zones'
> dirty constraints, and thus unlikely to become a problem.
>
> For now, the problem of unfair dirty page distribution remains for
> NUMA configurations where the zones allowed for allocation are in sum
> not big enough to trigger the global dirty limits, wake up the flusher
> threads and remedy the situation. Because of this, an allocation that
> could not succeed on any of the considered zones is allowed to ignore
> the dirty limits before going into direct reclaim or even failing the
> allocation, until a future patch changes the global dirty throttling
> and flusher thread activation so that they take individual zone states
> into account.
>
> Signed-off-by: Johannes Weiner <jweiner@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
--
Mel Gorman
SUSE Labs
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
^ permalink raw reply [flat|nested] 117+ messages in thread