All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm, x86, pat: Improve scaling of pat_pagerange_is_ram()
@ 2012-05-14 20:26 John Dykstra
  2012-05-16 22:46 ` Suresh Siddha
  0 siblings, 1 reply; 6+ messages in thread
From: John Dykstra @ 2012-05-14 20:26 UTC (permalink / raw)
  To: mingo; +Cc: suresh.b.siddha, linux-kernel

Function pat_pagerange_is_ram() scales poorly to large address ranges,
because it probes the resource tree for each page.  On a 2.6 GHz
Opteron, this function consumes 34 ms. for a 1 GB range.  It is called
twice during untrack_pfn_vma(), slowing process cleanup and handicapping
the OOM killer.

This replacement based on walk_system_ram_range() consumes less than 1
ms. under the same conditions.

Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc. 
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/mm/pat.c      |   55 ++++++++++++++++++++++++++++++-----------------
 include/linux/ioport.h |    2 +
 kernel/resource.c      |    2 +-
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b..c119afb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -160,29 +160,44 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 
 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 {
-	int ram_page = 0, not_rampage = 0;
-	unsigned long page_nr;
+	struct resource res;
+	resource_size_t pg_end, after_ram;
+	int ram = 0, not_ram = 0;
 
-	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
-	     ++page_nr) {
-		/*
-		 * For legacy reasons, physical address range in the legacy ISA
-		 * region is tracked as non-RAM. This will allow users of
-		 * /dev/mem to map portions of legacy ISA region, even when
-		 * some of those portions are listed(or not even listed) with
-		 * different e820 types(RAM/reserved/..)
-		 */
-		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
-		    page_is_ram(page_nr))
-			ram_page = 1;
-		else
-			not_rampage = 1;
+	res.start = start & PHYSICAL_PAGE_MASK;
 
-		if (ram_page == not_rampage)
+	/*
+	 * For legacy reasons, physical address range in the legacy ISA
+	 * region is tracked as non-RAM. This will allow users of
+	 * /dev/mem to map portions of legacy ISA region, even when
+	 * some of those portions are listed(or not even listed) with
+	 * different e820 types(RAM/reserved/..)
+	 */
+	if (res.start < ISA_END_ADDRESS) {
+		not_ram = 1;
+		res.start = ISA_END_ADDRESS;
+	}
+
+	pg_end = (end + PAGE_SIZE - 1) & PHYSICAL_PAGE_MASK;
+	res.end = pg_end;
+	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	after_ram = res.start;
+	while ((res.start < res.end) &&
+		(find_next_system_ram(&res, "System RAM") >= 0)) {
+		if (res.start > after_ram)
+			not_ram = 1;
+		if (res.end > res.start)
+			ram = 1;
+
+		if (ram && not_ram)
 			return -1;
+
+		after_ram = res.end + 1;
+		res.start = res.end + 1;
+		res.end = pg_end;
 	}
 
-	return ram_page;
+	return ram;
 }
 
 /*
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index e885ba2..273a725 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -223,5 +223,7 @@ extern int
 walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *));
 
+extern int find_next_system_ram(struct resource *res, char *name);
+
 #endif /* __ASSEMBLY__ */
 #endif	/* _LINUX_IOPORT_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 7e8ea66..dd8f553 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -283,7 +283,7 @@ EXPORT_SYMBOL(release_resource);
  * the caller must specify res->start, res->end, res->flags and "name".
  * If found, returns 0, res is overwritten, if not found, returns -1.
  */
-static int find_next_system_ram(struct resource *res, char *name)
+int find_next_system_ram(struct resource *res, char *name)
 {
 	resource_size_t start, end;
 	struct resource *p;
-- 
1.7.0.4






^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm, x86, pat: Improve scaling of pat_pagerange_is_ram()
  2012-05-14 20:26 [PATCH] mm, x86, pat: Improve scaling of pat_pagerange_is_ram() John Dykstra
@ 2012-05-16 22:46 ` Suresh Siddha
  2012-05-18  6:48   ` Ingo Molnar
  2012-05-25 21:12   ` [PATCH V2] " John Dykstra
  0 siblings, 2 replies; 6+ messages in thread
From: Suresh Siddha @ 2012-05-16 22:46 UTC (permalink / raw)
  To: John Dykstra; +Cc: mingo, linux-kernel

On Mon, 2012-05-14 at 15:26 -0500, John Dykstra wrote:
> Function pat_pagerange_is_ram() scales poorly to large address ranges,
> because it probes the resource tree for each page.  On a 2.6 GHz
> Opteron, this function consumes 34 ms. for a 1 GB range.  It is called
> twice during untrack_pfn_vma(), slowing process cleanup and handicapping
> the OOM killer.
> 
> This replacement based on walk_system_ram_range() consumes less than 1
> ms. under the same conditions.
> 
> Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc. 
> Cc: Suresh Siddha <suresh.b.siddha@intel.com>
> ---
>  arch/x86/mm/pat.c      |   55 ++++++++++++++++++++++++++++++-----------------
>  include/linux/ioport.h |    2 +
>  kernel/resource.c      |    2 +-
>  3 files changed, 38 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
> index f6ff57b..c119afb 100644
> --- a/arch/x86/mm/pat.c
> +++ b/arch/x86/mm/pat.c
> @@ -160,29 +160,44 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
>  
>  static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
>  {
> -	int ram_page = 0, not_rampage = 0;
> -	unsigned long page_nr;
> +	struct resource res;
> +	resource_size_t pg_end, after_ram;
> +	int ram = 0, not_ram = 0;
>  
> -	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
> -	     ++page_nr) {
> -		/*
> -		 * For legacy reasons, physical address range in the legacy ISA
> -		 * region is tracked as non-RAM. This will allow users of
> -		 * /dev/mem to map portions of legacy ISA region, even when
> -		 * some of those portions are listed(or not even listed) with
> -		 * different e820 types(RAM/reserved/..)
> -		 */
> -		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
> -		    page_is_ram(page_nr))
> -			ram_page = 1;
> -		else
> -			not_rampage = 1;
> +	res.start = start & PHYSICAL_PAGE_MASK;
>  
> -		if (ram_page == not_rampage)
> +	/*
> +	 * For legacy reasons, physical address range in the legacy ISA
> +	 * region is tracked as non-RAM. This will allow users of
> +	 * /dev/mem to map portions of legacy ISA region, even when
> +	 * some of those portions are listed(or not even listed) with
> +	 * different e820 types(RAM/reserved/..)
> +	 */
> +	if (res.start < ISA_END_ADDRESS) {
> +		not_ram = 1;
> +		res.start = ISA_END_ADDRESS;
> +	}
> +
> +	pg_end = (end + PAGE_SIZE - 1) & PHYSICAL_PAGE_MASK;
> +	res.end = pg_end;
> +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +	after_ram = res.start;
> +	while ((res.start < res.end) &&
> +		(find_next_system_ram(&res, "System RAM") >= 0)) {
> +		if (res.start > after_ram)
> +			not_ram = 1;
> +		if (res.end > res.start)
> +			ram = 1;
> +
> +		if (ram && not_ram)
>  			return -1;
> +
> +		after_ram = res.end + 1;
> +		res.start = res.end + 1;
> +		res.end = pg_end;
>  	}

Instead of duplicating what kernel/resource.c:walk_system_ram_range() is
already doing, can we just provide a callback that can be used with
walk_system_ram_range() and see if the expected RAM pages is what the
callback also sees.

That will greatly simplify the patch and avoid code duplication.

thanks,
suresh


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm, x86, pat: Improve scaling of pat_pagerange_is_ram()
  2012-05-16 22:46 ` Suresh Siddha
@ 2012-05-18  6:48   ` Ingo Molnar
  2012-05-25 21:12   ` [PATCH V2] " John Dykstra
  1 sibling, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2012-05-18  6:48 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: John Dykstra, the arch/x86 maintainers, linux-kernel


* Suresh Siddha <suresh.b.siddha@intel.com> wrote:

> On Mon, 2012-05-14 at 15:26 -0500, John Dykstra wrote:
> > Function pat_pagerange_is_ram() scales poorly to large address ranges,
> > because it probes the resource tree for each page.  On a 2.6 GHz
> > Opteron, this function consumes 34 ms. for a 1 GB range.  It is called
> > twice during untrack_pfn_vma(), slowing process cleanup and handicapping
> > the OOM killer.
> > 
> > This replacement based on walk_system_ram_range() consumes less than 1
> > ms. under the same conditions.

Nice performance improvement!

> > 
> > Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc. 
> > Cc: Suresh Siddha <suresh.b.siddha@intel.com>
> > ---
> >  arch/x86/mm/pat.c      |   55 ++++++++++++++++++++++++++++++-----------------
> >  include/linux/ioport.h |    2 +
> >  kernel/resource.c      |    2 +-
> >  3 files changed, 38 insertions(+), 21 deletions(-)
> > 
> > diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
> > index f6ff57b..c119afb 100644
> > --- a/arch/x86/mm/pat.c
> > +++ b/arch/x86/mm/pat.c
> > @@ -160,29 +160,44 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
> >  
> >  static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
> >  {
> > -	int ram_page = 0, not_rampage = 0;
> > -	unsigned long page_nr;
> > +	struct resource res;
> > +	resource_size_t pg_end, after_ram;
> > +	int ram = 0, not_ram = 0;
> >  
> > -	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
> > -	     ++page_nr) {
> > -		/*
> > -		 * For legacy reasons, physical address range in the legacy ISA
> > -		 * region is tracked as non-RAM. This will allow users of
> > -		 * /dev/mem to map portions of legacy ISA region, even when
> > -		 * some of those portions are listed(or not even listed) with
> > -		 * different e820 types(RAM/reserved/..)
> > -		 */
> > -		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
> > -		    page_is_ram(page_nr))
> > -			ram_page = 1;
> > -		else
> > -			not_rampage = 1;
> > +	res.start = start & PHYSICAL_PAGE_MASK;
> >  
> > -		if (ram_page == not_rampage)
> > +	/*
> > +	 * For legacy reasons, physical address range in the legacy ISA
> > +	 * region is tracked as non-RAM. This will allow users of
> > +	 * /dev/mem to map portions of legacy ISA region, even when
> > +	 * some of those portions are listed(or not even listed) with
> > +	 * different e820 types(RAM/reserved/..)
> > +	 */
> > +	if (res.start < ISA_END_ADDRESS) {
> > +		not_ram = 1;
> > +		res.start = ISA_END_ADDRESS;
> > +	}
> > +
> > +	pg_end = (end + PAGE_SIZE - 1) & PHYSICAL_PAGE_MASK;
> > +	res.end = pg_end;
> > +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> > +	after_ram = res.start;
> > +	while ((res.start < res.end) &&
> > +		(find_next_system_ram(&res, "System RAM") >= 0)) {
> > +		if (res.start > after_ram)
> > +			not_ram = 1;
> > +		if (res.end > res.start)
> > +			ram = 1;
> > +
> > +		if (ram && not_ram)
> >  			return -1;
> > +
> > +		after_ram = res.end + 1;
> > +		res.start = res.end + 1;
> > +		res.end = pg_end;
> >  	}
> 
> Instead of duplicating what kernel/resource.c:walk_system_ram_range() is
> already doing, can we just provide a callback that can be used with
> walk_system_ram_range() and see if the expected RAM pages is what the
> callback also sees.
> 
> That will greatly simplify the patch and avoid code duplication.

Agreed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH V2] mm, x86, pat: Improve scaling of pat_pagerange_is_ram()
  2012-05-16 22:46 ` Suresh Siddha
  2012-05-18  6:48   ` Ingo Molnar
@ 2012-05-25 21:12   ` John Dykstra
  2012-05-26  0:37     ` Suresh Siddha
  2012-05-30 13:34     ` [tip:x86/urgent] x86/mm/pat: " tip-bot for John Dykstra
  1 sibling, 2 replies; 6+ messages in thread
From: John Dykstra @ 2012-05-25 21:12 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: mingo, linux-kernel

On Wed, 2012-05-16 at 15:46 -0700, Suresh Siddha wrote:
> Instead of duplicating what kernel/resource.c:walk_system_ram_range() is
> already doing, can we just provide a callback that can be used with
> walk_system_ram_range() and see if the expected RAM pages is what the
> callback also sees.

The resulting patch is a bit longer than V1.  
---

Function pat_pagerange_is_ram() scales poorly to large address ranges,
because it probes the resource tree for each page.  On a 2.6 GHz
Opteron, this function consumes 34 ms. for a 1 GB range.  It is called
twice during untrack_pfn_vma(), slowing process cleanup and handicapping
the OOM killer.

This replacement consumes less than 1ms. under the same conditions.

Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc. 
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/mm/pat.c |   58
++++++++++++++++++++++++++++++++--------------------
 1 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b..246cce8 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,45 @@ static unsigned long pat_x_mtrr_type(u64 start,
u64 end, unsigned long req_type)
 	return req_type;
 }
 
+struct pagerange_is_ram_state {
+	unsigned long		cur_pfn;
+	int			ram;
+	int			not_ram;
+};
+
+static int pagerange_is_ram_callback(unsigned long initial_pfn,
+				unsigned long total_nr_pages, void *arg)
+{
+	struct pagerange_is_ram_state *state = arg;
+
+	state->not_ram |= initial_pfn > state->cur_pfn;
+	state->ram |= total_nr_pages > 0;
+	state->cur_pfn = initial_pfn + total_nr_pages;
+
+	return state->ram && state->not_ram;
+}
+
 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 {
-	int ram_page = 0, not_rampage = 0;
-	unsigned long page_nr;
+	int ret = 0;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct pagerange_is_ram_state state = {start_pfn, 0, 0};
 
-	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
-	     ++page_nr) {
-		/*
-		 * For legacy reasons, physical address range in the legacy ISA
-		 * region is tracked as non-RAM. This will allow users of
-		 * /dev/mem to map portions of legacy ISA region, even when
-		 * some of those portions are listed(or not even listed) with
-		 * different e820 types(RAM/reserved/..)
-		 */
-		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
-		    page_is_ram(page_nr))
-			ram_page = 1;
-		else
-			not_rampage = 1;
-
-		if (ram_page == not_rampage)
-			return -1;
-	}
+	/*
+	 * For legacy reasons, physical address range in the legacy ISA
+	 * region is tracked as non-RAM. This will allow users of
+	 * /dev/mem to map portions of legacy ISA region, even when
+	 * some of those portions are listed(or not even listed) with
+	 * different e820 types(RAM/reserved/..)
+	 */
+	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
 
-	return ram_page;
+	if (start_pfn < end_pfn)
+		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+				&state, pagerange_is_ram_callback);
+	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
 }
 
 /*
-- 
1.7.0.4




^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH V2] mm, x86, pat: Improve scaling of pat_pagerange_is_ram()
  2012-05-25 21:12   ` [PATCH V2] " John Dykstra
@ 2012-05-26  0:37     ` Suresh Siddha
  2012-05-30 13:34     ` [tip:x86/urgent] x86/mm/pat: " tip-bot for John Dykstra
  1 sibling, 0 replies; 6+ messages in thread
From: Suresh Siddha @ 2012-05-26  0:37 UTC (permalink / raw)
  To: John Dykstra; +Cc: mingo, linux-kernel

On Fri, 2012-05-25 at 16:12 -0500, John Dykstra wrote:
> On Wed, 2012-05-16 at 15:46 -0700, Suresh Siddha wrote:
> > Instead of duplicating what kernel/resource.c:walk_system_ram_range() is
> > already doing, can we just provide a callback that can be used with
> > walk_system_ram_range() and see if the expected RAM pages is what the
> > callback also sees.
> 
> The resulting patch is a bit longer than V1.  
> ---
> 
> Function pat_pagerange_is_ram() scales poorly to large address ranges,
> because it probes the resource tree for each page.  On a 2.6 GHz
> Opteron, this function consumes 34 ms. for a 1 GB range.  It is called
> twice during untrack_pfn_vma(), slowing process cleanup and handicapping
> the OOM killer.
> 
> This replacement consumes less than 1ms. under the same conditions.
> 
> Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc. 
> Cc: Suresh Siddha <suresh.b.siddha@intel.com>

Looks good to me.

Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>

> ---
>  arch/x86/mm/pat.c |   58
> ++++++++++++++++++++++++++++++++--------------------
>  1 files changed, 36 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
> index f6ff57b..246cce8 100644
> --- a/arch/x86/mm/pat.c
> +++ b/arch/x86/mm/pat.c
> @@ -158,31 +158,45 @@ static unsigned long pat_x_mtrr_type(u64 start,
> u64 end, unsigned long req_type)
>  	return req_type;
>  }
>  
> +struct pagerange_is_ram_state {
> +	unsigned long		cur_pfn;
> +	int			ram;
> +	int			not_ram;
> +};
> +
> +static int pagerange_is_ram_callback(unsigned long initial_pfn,
> +				unsigned long total_nr_pages, void *arg)
> +{
> +	struct pagerange_is_ram_state *state = arg;
> +
> +	state->not_ram |= initial_pfn > state->cur_pfn;
> +	state->ram |= total_nr_pages > 0;
> +	state->cur_pfn = initial_pfn + total_nr_pages;
> +
> +	return state->ram && state->not_ram;
> +}
> +
>  static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
>  {
> -	int ram_page = 0, not_rampage = 0;
> -	unsigned long page_nr;
> +	int ret = 0;
> +	unsigned long start_pfn = start >> PAGE_SHIFT;
> +	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +	struct pagerange_is_ram_state state = {start_pfn, 0, 0};
>  
> -	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
> -	     ++page_nr) {
> -		/*
> -		 * For legacy reasons, physical address range in the legacy ISA
> -		 * region is tracked as non-RAM. This will allow users of
> -		 * /dev/mem to map portions of legacy ISA region, even when
> -		 * some of those portions are listed(or not even listed) with
> -		 * different e820 types(RAM/reserved/..)
> -		 */
> -		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
> -		    page_is_ram(page_nr))
> -			ram_page = 1;
> -		else
> -			not_rampage = 1;
> -
> -		if (ram_page == not_rampage)
> -			return -1;
> -	}
> +	/*
> +	 * For legacy reasons, physical address range in the legacy ISA
> +	 * region is tracked as non-RAM. This will allow users of
> +	 * /dev/mem to map portions of legacy ISA region, even when
> +	 * some of those portions are listed(or not even listed) with
> +	 * different e820 types(RAM/reserved/..)
> +	 */
> +	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
> +		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
>  
> -	return ram_page;
> +	if (start_pfn < end_pfn)
> +		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
> +				&state, pagerange_is_ram_callback);
> +	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
>  }
>  
>  /*



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [tip:x86/urgent] x86/mm/pat: Improve scaling of pat_pagerange_is_ram()
  2012-05-25 21:12   ` [PATCH V2] " John Dykstra
  2012-05-26  0:37     ` Suresh Siddha
@ 2012-05-30 13:34     ` tip-bot for John Dykstra
  1 sibling, 0 replies; 6+ messages in thread
From: tip-bot for John Dykstra @ 2012-05-30 13:34 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, a.p.zijlstra, torvalds, jdykstra, akpm,
	suresh.b.siddha, tglx

Commit-ID:  fa83523f45fbb403eba4ebc5704bf98aa4da0163
Gitweb:     http://git.kernel.org/tip/fa83523f45fbb403eba4ebc5704bf98aa4da0163
Author:     John Dykstra <jdykstra@cray.com>
AuthorDate: Fri, 25 May 2012 16:12:46 -0500
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 30 May 2012 10:57:11 +0200

x86/mm/pat: Improve scaling of pat_pagerange_is_ram()

Function pat_pagerange_is_ram() scales poorly to large address
ranges, because it probes the resource tree for each page.

On a 2.6 GHz Opteron, this function consumes 34 ms for a 1 GB range.

It is called twice during untrack_pfn_vma(), slowing process
cleanup and handicapping the OOM killer.

This replacement consumes less than 1ms, under the same conditions.

Signed-off-by: John Dykstra <jdykstra@cray.com> on behalf of Cray Inc.
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1337980366.1979.6.camel@redwood
[ Small stylistic cleanups and renames ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c |   58 +++++++++++++++++++++++++++++++++-------------------
 1 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index f6ff57b..bea6e57 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 	return req_type;
 }
 
+struct pagerange_state {
+	unsigned long		cur_pfn;
+	int			ram;
+	int			not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+	struct pagerange_state *state = arg;
+
+	state->not_ram	|= initial_pfn > state->cur_pfn;
+	state->ram	|= total_nr_pages > 0;
+	state->cur_pfn	 = initial_pfn + total_nr_pages;
+
+	return state->ram && state->not_ram;
+}
+
 static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 {
-	int ram_page = 0, not_rampage = 0;
-	unsigned long page_nr;
+	int ret = 0;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct pagerange_state state = {start_pfn, 0, 0};
 
-	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
-	     ++page_nr) {
-		/*
-		 * For legacy reasons, physical address range in the legacy ISA
-		 * region is tracked as non-RAM. This will allow users of
-		 * /dev/mem to map portions of legacy ISA region, even when
-		 * some of those portions are listed(or not even listed) with
-		 * different e820 types(RAM/reserved/..)
-		 */
-		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
-		    page_is_ram(page_nr))
-			ram_page = 1;
-		else
-			not_rampage = 1;
-
-		if (ram_page == not_rampage)
-			return -1;
+	/*
+	 * For legacy reasons, physical address range in the legacy ISA
+	 * region is tracked as non-RAM. This will allow users of
+	 * /dev/mem to map portions of legacy ISA region, even when
+	 * some of those portions are listed(or not even listed) with
+	 * different e820 types(RAM/reserved/..)
+	 */
+	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+	if (start_pfn < end_pfn) {
+		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+				&state, pagerange_is_ram_callback);
 	}
 
-	return ram_page;
+	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-05-30 13:34 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-14 20:26 [PATCH] mm, x86, pat: Improve scaling of pat_pagerange_is_ram() John Dykstra
2012-05-16 22:46 ` Suresh Siddha
2012-05-18  6:48   ` Ingo Molnar
2012-05-25 21:12   ` [PATCH V2] " John Dykstra
2012-05-26  0:37     ` Suresh Siddha
2012-05-30 13:34     ` [tip:x86/urgent] x86/mm/pat: " tip-bot for John Dykstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.