From: Mel Gorman <mel@csn.ul.ie>
To: Mel Gorman <mel@csn.ul.ie>,
	Linux Memory Management List <linux-mm@kvack.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Christoph Lameter <cl@linux-foundation.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Nick Piggin <npiggin@suse.de>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Lin Ming <ming.m.lin@intel.com>,
	Zhang Yanmin <yanmin_zhang@linux.intel.com>
Subject: [PATCH 06/20] Break up the allocator entry point into fast and slow paths
Date: Sun, 22 Feb 2009 23:17:15 +0000
Message-ID: <1235344649-18265-7-git-send-email-mel@csn.ul.ie>
In-Reply-To: <1235344649-18265-1-git-send-email-mel@csn.ul.ie>

The core of the page allocator is one giant function that allocates a lot of
state on the stack and performs calculations that may not be needed for every
allocation. This patch breaks the allocation path up into separate fast and
slow paths.
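
For illustration only, here is a minimal, self-contained C sketch of the
control flow this change introduces: a thin entry point makes one cheap
fast-path attempt and only falls into a separate slow path on failure. The
type and function names in the sketch are invented and are not the kernel's;
the real split, __alloc_pages_nodemask() calling __alloc_pages_slowpath(),
is in the diff below.

  #include <stdio.h>

  struct page { int order; };   /* toy stand-in, not the kernel's struct page */

  /* Fast path: a single cheap attempt, no extra bookkeeping. */
  static struct page *fast_alloc(int order)
  {
          (void)order;          /* a real fast path would check the free lists */
          return NULL;          /* pretend the free lists are exhausted */
  }

  /* Slow path: reclaim, retries and OOM handling would live here. */
  static struct page *slow_alloc(int order)
  {
          static struct page reclaimed;

          reclaimed.order = order;
          return &reclaimed;
  }

  /* Entry point: keep the common case short, branch out only on failure. */
  static struct page *alloc_example(int order)
  {
          struct page *page = fast_alloc(order);

          if (!page)
                  page = slow_alloc(order);
          return page;
  }

  int main(void)
  {
          struct page *page = alloc_example(0);

          if (page)
                  printf("order-%d page allocated\n", page->order);
          return 0;
  }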

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
---
 mm/page_alloc.c |  345 ++++++++++++++++++++++++++++++++++---------------------
 1 files changed, 216 insertions(+), 129 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 99fd538..503d692 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1463,45 +1463,170 @@ try_next_zone:
 	return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
+int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long pages_reclaimed)
+{
+	/* Do not loop if specifically requested */
+	if (gfp_mask & __GFP_NORETRY)
+		return 0;
+
+	/*
+	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+	 * means __GFP_NOFAIL, but that may not be true in other
+	 * implementations.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return 1;
+
+	/*
+	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+	 * specified, then we retry until we no longer reclaim any pages
+	 * (above), or we've reclaimed an order of pages at least as
+	 * large as the allocation's order. In both cases, if the
+	 * allocation still fails, we stop retrying.
+	 */
+	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+		return 1;
+
+	/*
+	 * Don't let big-order allocations loop unless the caller
+	 * explicitly requests that. 
+	 */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
+
+	return 0;
+}
+
 struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
 {
-	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zoneref *z;
-	struct zone *zone;
 	struct page *page;
+
+	/* Acquire the OOM killer lock for the zones in zonelist */
+	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+		schedule_timeout_uninterruptible(1);
+		return NULL;
+	}
+
+	/*
+	 * Go through the zonelist yet one more time, keep very high watermark
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure.
+	 */
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+		order, zonelist, high_zoneidx,
+		ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+	if (page)
+		goto out;
+
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* Exhausted what can be done so it's blamo time */
+	out_of_memory(zonelist, gfp_mask, order);
+
+out:
+	clear_zonelist_oom(zonelist, gfp_mask);
+	return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int do_retry;
-	int alloc_flags;
-	unsigned long did_some_progress;
-	unsigned long pages_reclaimed = 0;
 
-	might_sleep_if(wait);
+	cond_resched();
 
-	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 
-	/* the list of zones suitable for gfp_mask */
-	z = zonelist->_zonerefs;
-	if (unlikely(!z->zone)) {
-		/*
-		 * Happens if we have an empty zonelist as a result of
-		 * GFP_THISNODE being used on a memoryless node
-		 */
-		return NULL;
-	}
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	cpuset_update_task_memory_state();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
 
-restart:
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
-	if (page)
-		goto got_pg;
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (order != 0)
+		drain_all_pages();
+
+	if (likely(*did_some_progress))
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
+	return page;
+}
+
+static inline int is_allocation_high_priority(struct task_struct *p,
+							gfp_t gfp_mask)
+{
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt())
+		if (!(gfp_mask & __GFP_NOMEMALLOC))
+			return 1;
+	return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	do {
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+		if (!page && gfp_mask & __GFP_NOFAIL)
+			congestion_wait(WRITE, HZ/50);
+	} while (!page && (gfp_mask & __GFP_NOFAIL));
+
+	return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+
+static struct page * noinline
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct page *page = NULL;
+	int alloc_flags;
+	unsigned long pages_reclaimed = 0;
+	unsigned long did_some_progress;
+	struct task_struct *p = current;
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1514,8 +1639,7 @@ restart:
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+	wake_all_kswapd(order, zonelist, high_zoneidx);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1535,6 +1659,7 @@ restart:
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;
 
+restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
@@ -1548,118 +1673,47 @@ restart:
 	if (page)
 		goto got_pg;
 
-	/* This allocation should allow future memory freeing. */
-
-rebalance:
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
-		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, nodemask, order,
-				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
-			if (page)
-				goto got_pg;
-			if (gfp_mask & __GFP_NOFAIL) {
-				congestion_wait(WRITE, HZ/50);
-				goto nofail_alloc;
-			}
-		}
-		goto nopage;
-	}
+	/* Allocate without watermarks if the context allows */
+	if (is_allocation_high_priority(p, gfp_mask))
+		page = __alloc_pages_high_priority(gfp_mask, order,
+			zonelist, high_zoneidx, nodemask);
+	if (page)
+		goto got_pg;
 
 	/* Atomic allocations - we can't balance anything */
 	if (!wait)
 		goto nopage;
 
-	cond_resched();
+	/* Try direct reclaim and then allocating */
+	page = __alloc_pages_direct_reclaim(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, &did_some_progress);
+	if (page)
+		goto got_pg;
 
-	/* We now go into synchronous reclaim */
-	cpuset_memory_pressure_bump();
 	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
+	 * If we failed to make any progress reclaiming, then we are
+	 * running out of options and have to consider going OOM
 	 */
-	cpuset_update_task_memory_state();
-	p->flags |= PF_MEMALLOC;
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
-
-	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
-
-	p->reclaim_state = NULL;
-	p->flags &= ~PF_MEMALLOC;
-
-	cond_resched();
-
-	if (order != 0)
-		drain_all_pages();
+	if (!did_some_progress) {
+		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+			page = __alloc_pages_may_oom(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask);
+			if (page)
+				goto got_pg;
 
-	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx, alloc_flags);
-		if (page)
-			goto got_pg;
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist, gfp_mask)) {
-			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
-
-		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
-		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-			order, zonelist, high_zoneidx,
-			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto got_pg;
-		}
-
-		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto nopage;
-		}
-
-		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist, gfp_mask);
-		goto restart;
 	}
 
-	/*
-	 * Don't let big-order allocations loop unless the caller explicitly
-	 * requests that.  Wait for some write requests to complete then retry.
-	 *
-	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-	 * means __GFP_NOFAIL, but that may not be true in other
-	 * implementations.
-	 *
-	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-	 * specified, then we retry until we no longer reclaim any pages
-	 * (above), or we've reclaimed an order of pages at least as
-	 * large as the allocation's order. In both cases, if the
-	 * allocation still fails, we stop retrying.
-	 */
+	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
-	do_retry = 0;
-	if (!(gfp_mask & __GFP_NORETRY)) {
-		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-			do_retry = 1;
-		} else {
-			if (gfp_mask & __GFP_REPEAT &&
-				pages_reclaimed < (1 << order))
-					do_retry = 1;
-		}
-		if (gfp_mask & __GFP_NOFAIL)
-			do_retry = 1;
-	}
-	if (do_retry) {
+	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+		/* Wait for some write requests to complete then retry */
 		congestion_wait(WRITE, HZ/50);
-		goto rebalance;
+		goto restart;
 	}
 
 nopage:
@@ -1672,6 +1726,39 @@ nopage:
 	}
 got_pg:
 	return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct page *page;
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (unlikely(!page))
+		page = __alloc_pages_slowpath(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
+
+	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 
-- 
1.5.6.5

