All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
@ 2012-04-25  8:49 ShuoX Liu
  2012-04-25 22:24 ` Tejun Heo
  0 siblings, 1 reply; 16+ messages in thread
From: ShuoX Liu @ 2012-04-25  8:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: tj, Yanmin Zhang

From: ShuoX Liu <shuox.liu@intel.com>

We are enabling Android on Medfield. On i386, if the board has more
physical memory, it means the vmalloc space is small. If the vmalloc
space is big, it means physical memory is small. Dynamic percpu
allocation is based on VM space. On i386, by default, the chunk size
is 4MB. As vmalloc space is <= 128M, percpu allocation often fails.
If using PERCPU_FC_PAGE, system can't go to deep sleep states.

The patch add one more chosen to percpu_alloc kernel parameter.
Besides page,embed,auto are available for percpu_alloc, memory size
suffixed with %K,%M are supported. It set pcpu_atom_size to fixed
wanted size.

Signed-off-by: Yanmin Zhang <yanmin.zhang@intel.com>
Signed-off-by: ShuoX Liu <shuox.liu@intel.com>
---
 Documentation/kernel-parameters.txt |    4 +++-
 arch/x86/kernel/setup_percpu.c      |    6 +++++-
 include/linux/percpu.h              |    2 ++
 mm/percpu.c                         |   11 +++++++++--
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1601e5..befeb5f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2196,7 +2196,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			See arch/parisc/kernel/pdc_chassis.c
 
 	percpu_alloc=	Select which percpu first chunk allocator to use.
-			Currently supported values are "embed" and "page".
+			Currently supported values are "embed", "page" and size
+			suffixed with %K,%M. Fixed size will overwrite
+			pcpu_atom_size of chunk allocation parameter.
 			Archs may support subset or none of the	selections.
 			See comments in mm/percpu.c for details on each
 			allocator.  This parameter is primarily	for debugging
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727..824bc41 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,9 +185,13 @@ void __init setup_per_cpu_areas(void)
 #endif
 	rc = -EINVAL;
 	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
-		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
+		size_t atom_size;
 		const size_t dyn_size = PERCPU_MODULE_RESERVE +
 			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+		if (pcpu_chosen_fc == PCPU_FC_FIXED && pcpu_atom_size)
+			atom_size = pcpu_atom_size;
+		else
+			atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
 
 		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 					    dyn_size, atom_size,
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 21638ae..3943088 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -104,12 +104,14 @@ enum pcpu_fc {
 	PCPU_FC_AUTO,
 	PCPU_FC_EMBED,
 	PCPU_FC_PAGE,
+	PCPU_FC_FIXED,
 
 	PCPU_FC_NR,
 };
 extern const char *pcpu_fc_names[PCPU_FC_NR];
 
 extern enum pcpu_fc pcpu_chosen_fc;
+extern size_t pcpu_atom_size;
 
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size,
 				     size_t align);
diff --git a/mm/percpu.c b/mm/percpu.c
index f47af91..7770ad0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -113,9 +113,9 @@ struct pcpu_chunk {
 static int pcpu_unit_pages __read_mostly;
 static int pcpu_unit_size __read_mostly;
 static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
+size_t pcpu_atom_size __read_mostly;
 
 /* cpus with the lowest and highest unit addresses */
 static unsigned int pcpu_low_unit_cpu __read_mostly;
@@ -1374,12 +1374,14 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
 	[PCPU_FC_AUTO]	= "auto",
 	[PCPU_FC_EMBED]	= "embed",
 	[PCPU_FC_PAGE]	= "page",
+	[PCPU_FC_FIXED]	= "fixed",
 };
 
 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
 
 static int __init percpu_alloc_setup(char *str)
 {
+	size_t size = memparse(str, NULL);
 	if (0)
 		/* nada */;
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
@@ -1390,7 +1392,12 @@ static int __init percpu_alloc_setup(char *str)
 	else if (!strcmp(str, "page"))
 		pcpu_chosen_fc = PCPU_FC_PAGE;
 #endif
-	else
+	else if (!strcmp(str, "auto"))
+		pcpu_chosen_fc = PCPU_FC_AUTO;
+	else if (size > 0) {
+		pcpu_atom_size = size;
+		pcpu_chosen_fc = PCPU_FC_FIXED;
+	} else
 		pr_warning("PERCPU: unknown allocator %s specified\n", str);
 
 	return 0;

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-25  8:49 [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size ShuoX Liu
@ 2012-04-25 22:24 ` Tejun Heo
  2012-04-26  2:01   ` Yanmin Zhang
  0 siblings, 1 reply; 16+ messages in thread
From: Tejun Heo @ 2012-04-25 22:24 UTC (permalink / raw)
  To: ShuoX Liu; +Cc: linux-kernel, Yanmin Zhang

Hello, ShouX.

On Wed, Apr 25, 2012 at 04:49:28PM +0800, ShuoX Liu wrote:
> From: ShuoX Liu <shuox.liu@intel.com>
> 
> We are enabling Android on Medfield. On i386, if the board has more
> physical memory, it means the vmalloc space is small. If the vmalloc
> space is big, it means physical memory is small. Dynamic percpu
> allocation is based on VM space. On i386, by default, the chunk size
> is 4MB. As vmalloc space is <= 128M, percpu allocation often fails.

Can you please provide more details - which kernel/user split was
used, how much memory was there and so on?  Also, can you please
attach boot log and the output of "cat /proc/vmallocinfo"?

> If using PERCPU_FC_PAGE, system can't go to deep sleep states.

Why?

> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index 71f4727..824bc41 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -185,9 +185,13 @@ void __init setup_per_cpu_areas(void)
>  #endif
>  	rc = -EINVAL;
>  	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
> -		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
> +		size_t atom_size;
>  		const size_t dyn_size = PERCPU_MODULE_RESERVE +
>  			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
> +		if (pcpu_chosen_fc == PCPU_FC_FIXED && pcpu_atom_size)
> +			atom_size = pcpu_atom_size;
> +		else
> +			atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
>  
>  		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
>  					    dyn_size, atom_size,

Umm... this is way too hacky.  atom_size can't be an arbitrary value
and the param is meaningful only to x86 yet defined globally.  Also,
while atom_size has effect on vmalloc area usage, way more important
factor is distance between units.  What we probably need to do is
tighten the rejection criteria of pcpu_embed_first_chunk() and fix
whatever problem FC_PAGE is causing.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-25 22:24 ` Tejun Heo
@ 2012-04-26  2:01   ` Yanmin Zhang
  2012-04-26 22:49     ` Tejun Heo
  0 siblings, 1 reply; 16+ messages in thread
From: Yanmin Zhang @ 2012-04-26  2:01 UTC (permalink / raw)
  To: Tejun Heo; +Cc: ShuoX Liu, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 6181 bytes --]

On Wed, 2012-04-25 at 15:24 -0700, Tejun Heo wrote:
> Hello, ShouX.
> 
> On Wed, Apr 25, 2012 at 04:49:28PM +0800, ShuoX Liu wrote:
> > From: ShuoX Liu <shuox.liu@intel.com>
> > 
> > We are enabling Android on Medfield. On i386, if the board has more
> > physical memory, it means the vmalloc space is small. If the vmalloc
> > space is big, it means physical memory is small. Dynamic percpu
> > allocation is based on VM space. On i386, by default, the chunk size
> > is 4MB. As vmalloc space is <= 128M, percpu allocation often fails.
> 
> Can you please provide more details - which kernel/user split was
> used, how much memory was there and so on?  Also, can you please
> attach boot log and the output of "cat /proc/vmallocinfo"?
We are enabling Android on Medfield which is an embedded i386 platform.
kernel/user split is 1/3G. Physical memory is 1GB and we turn on HIGHMEM.

The boot log is a little big. I copied the percpu part for your reference.

04-26 09:57:32.987     0     0 W Kernel-Dmesg: <6>[    0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <7>[    0.000000] nr_irqs_gsi: 85
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <6>[    0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:bec00000)
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <6>[    0.000000] setup_percpu: NR_CPUS:2 nr_cpumask_bits:2 nr_cpu_ids:2 nr_node_ids:1
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <6>[    0.000000] PERCPU: Embedded 12 pages/cpu @f6400000 s25280 r0 d23872 u2097152
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <7>[    0.000000] pcpu-alloc: s25280 r0 d23872 u2097152 alloc=1*4194304
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <7>[    0.000000] pcpu-alloc: [0] 0 1
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <4>[    0.000000] Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 252826
04-26 09:57:32.987     0     0 W Kernel-Dmesg: <5>[    0.000000] Kernel command line: init=/init pci=noearly console=tty
04-26 09:57:32.988     0     0 W Kernel-Dmesg: MFD3 console=logk0 earlyprintk=nologger loglevel=7 hsu_dma=7 kmemleak=off androidboot.bootmedia=sdcard androidboot.hardware=mfld_pr2 ip=50.0.0.2:50.0.0.1::255.255.255.0::usb0:on g_android.fastboot=1 droidboot.scratch=100 androidboot.wakesrc=05 androidboot.mode=fastboot
04-26 09:57:32.988     0     0 W Kernel-Dmesg: <6>[    0.000000] PID hash table entries: 4096 (order: 2, 16384 bytes)
04-26 09:57:32.988     0     0 W Kernel-Dmesg: <6>[    0.000000] Dentry cache hash table entries: 131072 (order: 7, 524288 b


We hit below issue.

03-13 08:22:36.273 11151 11133 I KERNEL : [74395.013328] PERCPU: allocation
failed, size=252 align=4, failed to allocate new chunk
03-13 08:22:36.274 11151 11133 I KERNEL : [74395.014062] Pid: 11151, comm: rild
Not tainted 3.0.8-137162-g5e405a0 #1
03-13 08:22:36.275 11151 11133 I KERNEL : [74395.014656] Call Trace:
03-13 08:22:36.283 11151 11133 I KERNEL : [74395.022691] [<c1306eaa>]
pcpu_alloc+0x12ca/0x1300
03-13 08:22:36.288 11151 11133 I KERNEL : [74395.027998] [<c125c1a9>] ?
sysctl_set_parent+0x29/0x40
03-13 08:22:36.293 11151 11133 I KERNEL : [74395.033335] [<c1306f0f>]
__alloc_percpu+0xf/0x20
03-13 08:22:36.298 11151 11133 I KERNEL : [74395.038144] [<c180dcad>]
snmp_mib_init+0x3d/0x70
03-13 08:22:36.302 11151 11133 I KERNEL : [74395.043221] [<c1846efa>]
ipv6_add_dev+0xfa/0x380


We run many stress testing and hit the percpu allocation failure at other
places.

We googled the error and found other guys in community also hit the bug.
You suggested them to use percpu_alloc=page. We tried it and it hurts power.

vmallocinfo is attached. From the vmallocinfo, we could find the VM space
is fragmented. We would write another patch to clean it up.

> 
> > If using PERCPU_FC_PAGE, system can't go to deep sleep states.
> 
> Why?
Medfield has 2 cpu threads. Only when all the 2 threads enter deep C states,
for example, C6, the core would enter C6. If booting kernel with percpu_alloc=page,
cpu core often aborts the C6 entering. We don't know why. C6 is aborted under
many conditions. One is when there is pending interrupt. I suspect with page size
alloc, it might trigger more cache miss. Just before calls mwait to enter
C6, we record some statistics data and that might trigger the cache miss
to abort the C6. It's just a _GUESS_.

We tried atom_size with 32k, 128k, 256k. There is no power regression.

> 
> > diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> > index 71f4727..824bc41 100644
> > --- a/arch/x86/kernel/setup_percpu.c
> > +++ b/arch/x86/kernel/setup_percpu.c
> > @@ -185,9 +185,13 @@ void __init setup_per_cpu_areas(void)
> >  #endif
> >  	rc = -EINVAL;
> >  	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
> > -		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
> > +		size_t atom_size;
> >  		const size_t dyn_size = PERCPU_MODULE_RESERVE +
> >  			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
> > +		if (pcpu_chosen_fc == PCPU_FC_FIXED && pcpu_atom_size)
> > +			atom_size = pcpu_atom_size;
> > +		else
> > +			atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
> >  
> >  		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
> >  					    dyn_size, atom_size,
> 
> Umm... this is way too hacky.  atom_size can't be an arbitrary value
The interface is give admin more options. It doesn't mean admin could
choose any value.

> and the param is meaningful only to x86 yet defined globally. 
Right, from real usage requirement point of view. If we googled the failure
info dumped from kernel, we could see many other guys also hit it. That's why
we send the patch to LKML.

>  Also,
> while atom_size has effect on vmalloc area usage, way more important
> factor is distance between units.
Could you elaborate it?

>   What we probably need to do is
> tighten the rejection criteria of pcpu_embed_first_chunk()
Could you explain it more?

>  and fix
> whatever problem FC_PAGE is causing.
We can't fix FC_PAGE power regression. If we do so, we need contact many
hardware architects. Current kernel supports FC_PAGE and PMD_SIZE, why
not to allow admin to choose other values?

Thanks for the comments.

Yanmin


[-- Attachment #2: vmallocinfo.tgz --]
[-- Type: application/x-compressed-tar, Size: 5618 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-26  2:01   ` Yanmin Zhang
@ 2012-04-26 22:49     ` Tejun Heo
  2012-04-27  1:09       ` Yanmin Zhang
  0 siblings, 1 reply; 16+ messages in thread
From: Tejun Heo @ 2012-04-26 22:49 UTC (permalink / raw)
  To: Yanmin Zhang; +Cc: ShuoX Liu, linux-kernel

Hello,

On Thu, Apr 26, 2012 at 10:01:12AM +0800, Yanmin Zhang wrote:
> [    0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
> [    0.000000] nr_irqs_gsi: 85
> [    0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:bec00000)
> [    0.000000] setup_percpu: NR_CPUS:2 nr_cpumask_bits:2 nr_cpu_ids:2 nr_node_ids:1
> [    0.000000] PERCPU: Embedded 12 pages/cpu @f6400000 s25280 r0 d23872 u2097152
> [    0.000000] pcpu-alloc: s25280 r0 d23872 u2097152 alloc=1*4194304
> [    0.000000] pcpu-alloc: [0] 0 1

Heh, I was getting confused, forget the distance thing, so it's single
group w/ 4MiB allocation size.

> PERCPU: allocation failed, size=252 align=4, failed to allocate new chunk

Which later fails percpu allocation due to vmalloc space exhaustion.
How long does that take to happen?

> vmallocinfo is attached. From the vmallocinfo, we could find the VM space
> is fragmented. We would write another patch to clean it up.

Whee... ah well, 128M isn't that big after all.

> > > If using PERCPU_FC_PAGE, system can't go to deep sleep states.
> > 
> > Why?
>
> Medfield has 2 cpu threads. Only when all the 2 threads enter deep C states,
> for example, C6, the core would enter C6. If booting kernel with percpu_alloc=page,
> cpu core often aborts the C6 entering. We don't know why. C6 is aborted under
> many conditions. One is when there is pending interrupt. I suspect with page size
> alloc, it might trigger more cache miss. Just before calls mwait to enter
> C6, we record some statistics data and that might trigger the cache miss
> to abort the C6. It's just a _GUESS_.
> 
> We tried atom_size with 32k, 128k, 256k. There is no power regression.

So, the difference between EMBED and PAGE is how the first chunk which
contains all the static percpu variables and some dynamic area for
optimization is allocated.  For EMBED, it's just kmallocd which means
that it piggy backs on the default kernel linear mapping thus avoiding
adding any extra TLB pressure.  For PAGE, all those percpu areas end
up getting re-mapped in vmalloc area using 4k pages, so if TLB
pressure can affect entering C6, that could be it.

> We can't fix FC_PAGE power regression. If we do so, we need contact many
> hardware architects. Current kernel supports FC_PAGE and PMD_SIZE, why
> not to allow admin to choose other values?

If this is something which is met in the field commonly, we need to
fix the default behavior rather than introducing some arcane boot
param.  IIRC, the reasons PMD_SIZE is used for atom_size are so that
percpu areas are aligned to PSE mapping, maybe later we can make use
of PSE mapping in vmalloc area too, and it didn't seem to hurt
anything.

If the large unit size is becoming a problem on i386, we can just use
PAGE_SIZE as atom_size.  Can you please verify that atom_size of 4k w/
EMBED also resolves the power issue?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-26 22:49     ` Tejun Heo
@ 2012-04-27  1:09       ` Yanmin Zhang
  2012-04-27  8:56         ` Yanmin Zhang
  2012-04-27 16:34         ` [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size Tejun Heo
  0 siblings, 2 replies; 16+ messages in thread
From: Yanmin Zhang @ 2012-04-27  1:09 UTC (permalink / raw)
  To: Tejun Heo; +Cc: ShuoX Liu, linux-kernel

On Thu, 2012-04-26 at 15:49 -0700, Tejun Heo wrote:
> Hello,
> 
> On Thu, Apr 26, 2012 at 10:01:12AM +0800, Yanmin Zhang wrote:
> > [    0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
> > [    0.000000] nr_irqs_gsi: 85
> > [    0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:bec00000)
> > [    0.000000] setup_percpu: NR_CPUS:2 nr_cpumask_bits:2 nr_cpu_ids:2 nr_node_ids:1
> > [    0.000000] PERCPU: Embedded 12 pages/cpu @f6400000 s25280 r0 d23872 u2097152
> > [    0.000000] pcpu-alloc: s25280 r0 d23872 u2097152 alloc=1*4194304
> > [    0.000000] pcpu-alloc: [0] 0 1
> 
> Heh, I was getting confused, forget the distance thing, so it's single
> group w/ 4MiB allocation size.
> 
> > PERCPU: allocation failed, size=252 align=4, failed to allocate new chunk
> 
> Which later fails percpu allocation due to vmalloc space exhaustion.
> How long does that take to happen?
It depends. Sometimes it fails in 400 seconds after booting. We run MTBF and other
stress testing. Sometimes even with other non-stress testing, pecpu allocation
fails. Most drivers or upper layers expect the percpu allocation should succeed. If
not, although mostly there is no OOPS in kernel, upper applications wouldn't work.

> 
> > vmallocinfo is attached. From the vmallocinfo, we could find the VM space
> > is fragmented. We would write another patch to clean it up.
> 
> Whee... ah well, 128M isn't that big after all.
Indeed, so we need tune the memory utilization carefully on i386.
We did work out other patches at other places/drivers to fix other OOM issues.

> 
> > > > If using PERCPU_FC_PAGE, system can't go to deep sleep states.
> > > 
> > > Why?
> >
> > Medfield has 2 cpu threads. Only when all the 2 threads enter deep C states,
> > for example, C6, the core would enter C6. If booting kernel with percpu_alloc=page,
> > cpu core often aborts the C6 entering. We don't know why. C6 is aborted under
> > many conditions. One is when there is pending interrupt. I suspect with page size
> > alloc, it might trigger more cache miss. Just before calls mwait to enter
> > C6, we record some statistics data and that might trigger the cache miss
> > to abort the C6. It's just a _GUESS_.
> > 
> > We tried atom_size with 32k, 128k, 256k. There is no power regression.
> 
> So, the difference between EMBED and PAGE is how the first chunk which
> contains all the static percpu variables and some dynamic area for
> optimization is allocated.  For EMBED, it's just kmallocd which means
> that it piggy backs on the default kernel linear mapping thus avoiding
> adding any extra TLB pressure.  For PAGE, all those percpu areas end
> up getting re-mapped in vmalloc area using 4k pages, so if TLB
> pressure can affect entering C6, that could be it.
Thanks for the explanation.

> 
> > We can't fix FC_PAGE power regression. If we do so, we need contact many
> > hardware architects. Current kernel supports FC_PAGE and PMD_SIZE, why
> > not to allow admin to choose other values?
> 
> If this is something which is met in the field commonly, we need to
> fix the default behavior rather than introducing some arcane boot
> param.
We just add a new value input method instead of introducing new parameter.

>   IIRC, the reasons PMD_SIZE is used for atom_size are so that
> percpu areas are aligned to PSE mapping, maybe later we can make use
> of PSE mapping in vmalloc area too, and it didn't seem to hurt
> anything.
Well, vmalloc area might use different prot to map physical pages.
So sharing one PMD huge page by many vmalloc areas might be not good.

> 
> If the large unit size is becoming a problem on i386, we can just use
> PAGE_SIZE as atom_size.  Can you please verify that atom_size of 4k w/
> EMBED also resolves the power issue?
We are enable Android ICS which is based on kernel 3.0.8. It seems there is
no much change between 3.0.8 and the latest kernel.
With 3.0.8, although we could set percpualloc=embed, atom_size would becomes
PMD_SIZE.
With our patch, we could do the experiment as we could configure percpu_alloc=4K
easily. We would let you know the testing result of atom_size=4K && first_chunk_embedded.

Yanmin



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-27  1:09       ` Yanmin Zhang
@ 2012-04-27  8:56         ` Yanmin Zhang
  2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
  2012-04-27 16:34         ` [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size Tejun Heo
  1 sibling, 1 reply; 16+ messages in thread
From: Yanmin Zhang @ 2012-04-27  8:56 UTC (permalink / raw)
  To: Tejun Heo; +Cc: ShuoX Liu, linux-kernel

On Fri, 2012-04-27 at 09:09 +0800, Yanmin Zhang wrote:
> On Thu, 2012-04-26 at 15:49 -0700, Tejun Heo wrote:
> > Hello,
> > 
> > On Thu, Apr 26, 2012 at 10:01:12AM +0800, Yanmin Zhang wrote:
> > > [    0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
> > > [    0.000000] nr_irqs_gsi: 85
> > > [    0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:bec00000)
> > > [    0.000000] setup_percpu: NR_CPUS:2 nr_cpumask_bits:2 nr_cpu_ids:2 nr_node_ids:1
> > > [    0.000000] PERCPU: Embedded 12 pages/cpu @f6400000 s25280 r0 d23872 u2097152
> > > [    0.000000] pcpu-alloc: s25280 r0 d23872 u2097152 alloc=1*4194304
> > > [    0.000000] pcpu-alloc: [0] 0 1
> > 
> > Heh, I was getting confused, forget the distance thing, so it's single
> > group w/ 4MiB allocation size.
> > 
> > > PERCPU: allocation failed, size=252 align=4, failed to allocate new chunk
> > 
> > Which later fails percpu allocation due to vmalloc space exhaustion.
> > How long does that take to happen?
> It depends. Sometimes it fails in 400 seconds after booting. We run MTBF and other
> stress testing. Sometimes even with other non-stress testing, pecpu allocation
> fails. Most drivers or upper layers expect the percpu allocation should succeed. If
> not, although mostly there is no OOPS in kernel, upper applications wouldn't work.
> 
> > 
> > > vmallocinfo is attached. From the vmallocinfo, we could find the VM space
> > > is fragmented. We would write another patch to clean it up.
> > 
> > Whee... ah well, 128M isn't that big after all.
> Indeed, so we need tune the memory utilization carefully on i386.
> We did work out other patches at other places/drivers to fix other OOM issues.
> 
> > 
> > > > > If using PERCPU_FC_PAGE, system can't go to deep sleep states.
> > > > 
> > > > Why?
> > >
> > > Medfield has 2 cpu threads. Only when all the 2 threads enter deep C states,
> > > for example, C6, the core would enter C6. If booting kernel with percpu_alloc=page,
> > > cpu core often aborts the C6 entering. We don't know why. C6 is aborted under
> > > many conditions. One is when there is pending interrupt. I suspect with page size
> > > alloc, it might trigger more cache miss. Just before calls mwait to enter
> > > C6, we record some statistics data and that might trigger the cache miss
> > > to abort the C6. It's just a _GUESS_.
> > > 
> > > We tried atom_size with 32k, 128k, 256k. There is no power regression.
> > 
> > So, the difference between EMBED and PAGE is how the first chunk which
> > contains all the static percpu variables and some dynamic area for
> > optimization is allocated.  For EMBED, it's just kmallocd which means
> > that it piggy backs on the default kernel linear mapping thus avoiding
> > adding any extra TLB pressure.  For PAGE, all those percpu areas end
> > up getting re-mapped in vmalloc area using 4k pages, so if TLB
> > pressure can affect entering C6, that could be it.
> Thanks for the explanation.
> 
> > 
> > > We can't fix FC_PAGE power regression. If we do so, we need contact many
> > > hardware architects. Current kernel supports FC_PAGE and PMD_SIZE, why
> > > not to allow admin to choose other values?
> > 
> > If this is something which is met in the field commonly, we need to
> > fix the default behavior rather than introducing some arcane boot
> > param.
> We just add a new value input method instead of introducing new parameter.
> 
> >   IIRC, the reasons PMD_SIZE is used for atom_size are so that
> > percpu areas are aligned to PSE mapping, maybe later we can make use
> > of PSE mapping in vmalloc area too, and it didn't seem to hurt
> > anything.
> Well, vmalloc area might use different prot to map physical pages.
> So sharing one PMD huge page by many vmalloc areas might be not good.
> 
> > 
> > If the large unit size is becoming a problem on i386, we can just use
> > PAGE_SIZE as atom_size.  Can you please verify that atom_size of 4k w/
> > EMBED also resolves the power issue?
> We are enable Android ICS which is based on kernel 3.0.8. It seems there is
> no much change between 3.0.8 and the latest kernel.
> With 3.0.8, although we could set percpualloc=embed, atom_size would becomes
> PMD_SIZE.
> With our patch, we could do the experiment as we could configure percpu_alloc=4K
> easily. We would let you know the testing result of atom_size=4K && first_chunk_embedded.
Liu Shuo did initial testing about this scenario and there is no power
regression.

Yanmin



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size.
  2012-04-27  1:09       ` Yanmin Zhang
  2012-04-27  8:56         ` Yanmin Zhang
@ 2012-04-27 16:34         ` Tejun Heo
  1 sibling, 0 replies; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 16:34 UTC (permalink / raw)
  To: Yanmin Zhang; +Cc: ShuoX Liu, linux-kernel

Hello,

On Fri, Apr 27, 2012 at 09:09:24AM +0800, Yanmin Zhang wrote:
> > > We can't fix FC_PAGE power regression. If we do so, we need contact many
> > > hardware architects. Current kernel supports FC_PAGE and PMD_SIZE, why
> > > not to allow admin to choose other values?
> > 
> > If this is something which is met in the field commonly, we need to
> > fix the default behavior rather than introducing some arcane boot
> > param.
>
> We just add a new value input method instead of introducing new parameter.

Ummm... I don't know what you meant by the above sentence but adding a
new magic kernel param whether it's part of an existing one or not, is
not a good solution.  They're difficult to discover and not many
actually understand what they do.  If you *have* to add some, then you
better make it clear where it's being applied for what.  ie. in this
case, add something like x86_percpu_embed_unit_size.

> >   IIRC, the reasons PMD_SIZE is used for atom_size are so that
> > percpu areas are aligned to PSE mapping, maybe later we can make use
> > of PSE mapping in vmalloc area too, and it didn't seem to hurt
> > anything.
>
> Well, vmalloc area might use different prot to map physical pages.
> So sharing one PMD huge page by many vmalloc areas might be not good.

Percpu allocator uses the whole vmalloc chunk, so there's no prot
problem.  They're all percpu memory.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27  8:56         ` Yanmin Zhang
@ 2012-04-27 16:53           ` Tejun Heo
  2012-04-27 17:17             ` H. Peter Anvin
                               ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 16:53 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: ShuoX Liu, linux-kernel, Christoph Lameter, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin

With the embed percpu first chunk allocator, x86 uses either PAGE_SIZE
or PMD_SIZE for atom_size.  PMD_SIZE is used when CPU supports PSE so
that percpu areas are aligned to PMD mappings and possibly allow using
PMD mappings in vmalloc areas in the future.  Using larger atom_size
doesn't waste actual memory; however, it does require larger vmalloc
space allocation later on for !first chunks.

With reasonably sized vmalloc area, PMD_SIZE shouldn't be a problem
but x86_32 at this point is anything but reasonable in terms of
address space and using later atom_size reportedly leads to frequent
percpu allocation failures on certain setups.

This patch makes x86_32 always use PAGE_SIZE as atom_size for embed
first chunk allocator.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yanmin Zhang <yanmin.zhang@intel.com>
Reported-by: ShuoX Liu <shuox.liu@intel.com>
LKML-Reference: <4F97BA98.6010001@intel.com>
Cc: stable@vger.kernel.org
---
Can you please verify this resolves the issue you guys are seeing?
Once verified, I'll push it through percpu/for-3.5-fixes.

Thanks.

 arch/x86/kernel/setup_percpu.c |   14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727..ed96573 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
 #endif
 	rc = -EINVAL;
 	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
-		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
 		const size_t dyn_size = PERCPU_MODULE_RESERVE +
 			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+		size_t atom_size;
 
+		/*
+		 * If PSE is available, use PMD_SIZE for atom_size so that
+		 * embedded percpu areas are aligned to PMD.  This, in the
+		 * future, can also allow using PMD mappings in vmalloc
+		 * area.  Use PAGE_SIZE on 32bit as vmalloc space is highly
+		 * contended and large vmalloc area alloc can easily fail.
+		 */
+		atom_size = PAGE_SIZE;
+#ifdef CONFIG_X86_64
+		if (cpu_has_pse)
+			atom_size = PMD_SIZE;
+#endif
 		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 					    dyn_size, atom_size,
 					    pcpu_cpu_distance,

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
@ 2012-04-27 17:17             ` H. Peter Anvin
  2012-04-27 17:27               ` Tejun Heo
  2012-04-27 17:21             ` Christoph Lameter
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: H. Peter Anvin @ 2012-04-27 17:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Christoph Lameter,
	Thomas Gleixner, Ingo Molnar

On 04/27/2012 09:53 AM, Tejun Heo wrote:
>  
> +		/*
> +		 * If PSE is available, use PMD_SIZE for atom_size so that
> +		 * embedded percpu areas are aligned to PMD.  This, in the
> +		 * future, can also allow using PMD mappings in vmalloc
> +		 * area.  Use PAGE_SIZE on 32bit as vmalloc space is highly
> +		 * contended and large vmalloc area alloc can easily fail.
> +		 */
> +		atom_size = PAGE_SIZE;
> +#ifdef CONFIG_X86_64
> +		if (cpu_has_pse)
> +			atom_size = PMD_SIZE;
> +#endif

Just make it PAGE_SIZE on i386 and PMD_SIZE on x86-64; keep the behavior
consistent.  !cpu_has_pse on x86-64 is an abnormal situation (I think it
applies to Xen, pretty much.)

Other than that,

Acked-by: H. Peter Anvin <hpa@zytor.com>

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
  2012-04-27 17:17             ` H. Peter Anvin
@ 2012-04-27 17:21             ` Christoph Lameter
  2012-04-27 17:27               ` Tejun Heo
  2012-04-27 17:39             ` [PATCH v2] " Tejun Heo
  2012-04-28  2:18             ` [PATCH] " ShuoX Liu
  3 siblings, 1 reply; 16+ messages in thread
From: Christoph Lameter @ 2012-04-27 17:21 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin

On Fri, 27 Apr 2012, Tejun Heo wrote:

> This patch makes x86_32 always use PAGE_SIZE as atom_size for embed
> first chunk allocator.

Would it not be easier to use the first #ifdef in the function to set
pcpu_chosen_fc = PCPU_FC_PAGE?

You could just skip the whole PCPU_FC_PAGE block in the CONFIG_X86_32
case.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 17:21             ` Christoph Lameter
@ 2012-04-27 17:27               ` Tejun Heo
  0 siblings, 0 replies; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 17:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin

On Fri, Apr 27, 2012 at 12:21:34PM -0500, Christoph Lameter wrote:
> On Fri, 27 Apr 2012, Tejun Heo wrote:
> 
> > This patch makes x86_32 always use PAGE_SIZE as atom_size for embed
> > first chunk allocator.
> 
> Would it not be easier to use the first #ifdef in the function to set
> pcpu_chosen_fc = PCPU_FC_PAGE?
> 
> You could just skip the whole PCPU_FC_PAGE block in the CONFIG_X86_32
> case.

PCPU_FC_PAGE != PCPU_FC_EMBED w/ PAGE_SIZE atom_size.  EMBED still
gets to piggy back on the kernel linear mapping for the first chunk
unlike PAGE.  It apparently created noticeable difference (I'm
guessing due to extra TLB pressure) for intel android folks.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 17:17             ` H. Peter Anvin
@ 2012-04-27 17:27               ` Tejun Heo
  0 siblings, 0 replies; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 17:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Christoph Lameter,
	Thomas Gleixner, Ingo Molnar

On Fri, Apr 27, 2012 at 10:17:17AM -0700, H. Peter Anvin wrote:
> On 04/27/2012 09:53 AM, Tejun Heo wrote:
> >  
> > +		/*
> > +		 * If PSE is available, use PMD_SIZE for atom_size so that
> > +		 * embedded percpu areas are aligned to PMD.  This, in the
> > +		 * future, can also allow using PMD mappings in vmalloc
> > +		 * area.  Use PAGE_SIZE on 32bit as vmalloc space is highly
> > +		 * contended and large vmalloc area alloc can easily fail.
> > +		 */
> > +		atom_size = PAGE_SIZE;
> > +#ifdef CONFIG_X86_64
> > +		if (cpu_has_pse)
> > +			atom_size = PMD_SIZE;
> > +#endif
> 
> Just make it PAGE_SIZE on i386 and PMD_SIZE on x86-64; keep the behavior
> consistent.  !cpu_has_pse on x86-64 is an abnormal situation (I think it
> applies to Xen, pretty much.)
> 
> Other than that,
> 
> Acked-by: H. Peter Anvin <hpa@zytor.com>

Yeah, that's simpler.  Will update.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
  2012-04-27 17:17             ` H. Peter Anvin
  2012-04-27 17:21             ` Christoph Lameter
@ 2012-04-27 17:39             ` Tejun Heo
  2012-04-27 17:51               ` H. Peter Anvin
  2012-04-28  2:18             ` [PATCH] " ShuoX Liu
  3 siblings, 1 reply; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 17:39 UTC (permalink / raw)
  To: Yanmin Zhang
  Cc: ShuoX Liu, linux-kernel, Christoph Lameter, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin

With the embed percpu first chunk allocator, x86 uses either PAGE_SIZE
or PMD_SIZE for atom_size.  PMD_SIZE is used when CPU supports PSE so
that percpu areas are aligned to PMD mappings and possibly allow using
PMD mappings in vmalloc areas in the future.  Using larger atom_size
doesn't waste actual memory; however, it does require larger vmalloc
space allocation later on for !first chunks.

With reasonably sized vmalloc area, PMD_SIZE shouldn't be a problem
but x86_32 at this point is anything but reasonable in terms of
address space and using larger atom_size reportedly leads to frequent
percpu allocation failures on certain setups.

As there is no reason to not use PMD_SIZE on x86_64 as vmalloc space
is aplenty and most x86_64 configurations support PSE, fix the issue
by always using PMD_SIZE on x86_64 and PAGE_SIZE on x86_32.

v2: drop cpu_has_pse test and make x86_64 always use PMD_SIZE and
    x86_32 PAGE_SIZE as suggested by hpa.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yanmin Zhang <yanmin.zhang@intel.com>
Reported-by: ShuoX Liu <shuox.liu@intel.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <4F97BA98.6010001@intel.com>
Cc: stable@vger.kernel.org
---
 arch/x86/kernel/setup_percpu.c |   14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 71f4727..5a98aa2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
 #endif
 	rc = -EINVAL;
 	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
-		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
 		const size_t dyn_size = PERCPU_MODULE_RESERVE +
 			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+		size_t atom_size;
 
+		/*
+		 * On 64bit, use PMD_SIZE for atom_size so that embedded
+		 * percpu areas are aligned to PMD.  This, in the future,
+		 * can also allow using PMD mappings in vmalloc area.  Use
+		 * PAGE_SIZE on 32bit as vmalloc space is highly contended
+		 * and large vmalloc area allocs can easily fail.
+		 */
+#ifdef CONFIG_X86_64
+		atom_size = PMD_SIZE;
+#else
+		atom_size = PAGE_SIZE;
+#endif
 		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
 					    dyn_size, atom_size,
 					    pcpu_cpu_distance,

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 17:39             ` [PATCH v2] " Tejun Heo
@ 2012-04-27 17:51               ` H. Peter Anvin
  2012-04-27 17:53                 ` Tejun Heo
  0 siblings, 1 reply; 16+ messages in thread
From: H. Peter Anvin @ 2012-04-27 17:51 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Christoph Lameter,
	Thomas Gleixner, Ingo Molnar

On 04/27/2012 10:39 AM, Tejun Heo wrote:
> With the embed percpu first chunk allocator, x86 uses either PAGE_SIZE
> or PMD_SIZE for atom_size.  PMD_SIZE is used when CPU supports PSE so
> that percpu areas are aligned to PMD mappings and possibly allow using
> PMD mappings in vmalloc areas in the future.  Using larger atom_size
> doesn't waste actual memory; however, it does require larger vmalloc
> space allocation later on for !first chunks.
> 
> With reasonably sized vmalloc area, PMD_SIZE shouldn't be a problem
> but x86_32 at this point is anything but reasonable in terms of
> address space and using larger atom_size reportedly leads to frequent
> percpu allocation failures on certain setups.
> 
> As there is no reason to not use PMD_SIZE on x86_64 as vmalloc space
> is aplenty and most x86_64 configurations support PSE, fix the issue
> by always using PMD_SIZE on x86_64 and PAGE_SIZE on x86_32.
> 
> v2: drop cpu_has_pse test and make x86_64 always use PMD_SIZE and
>     x86_32 PAGE_SIZE as suggested by hpa.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yanmin Zhang <yanmin.zhang@intel.com>
> Reported-by: ShuoX Liu <shuox.liu@intel.com>
> Acked-by: H. Peter Anvin <hpa@zytor.com>
> LKML-Reference: <4F97BA98.6010001@intel.com>
> Cc: stable@vger.kernel.org

You're carrying this, or should I?

	-hpa


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 17:51               ` H. Peter Anvin
@ 2012-04-27 17:53                 ` Tejun Heo
  0 siblings, 0 replies; 16+ messages in thread
From: Tejun Heo @ 2012-04-27 17:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yanmin Zhang, ShuoX Liu, linux-kernel, Christoph Lameter,
	Thomas Gleixner, Ingo Molnar

On Fri, Apr 27, 2012 at 10:51:43AM -0700, H. Peter Anvin wrote:
> On 04/27/2012 10:39 AM, Tejun Heo wrote:
> > With the embed percpu first chunk allocator, x86 uses either PAGE_SIZE
> > or PMD_SIZE for atom_size.  PMD_SIZE is used when CPU supports PSE so
> > that percpu areas are aligned to PMD mappings and possibly allow using
> > PMD mappings in vmalloc areas in the future.  Using larger atom_size
> > doesn't waste actual memory; however, it does require larger vmalloc
> > space allocation later on for !first chunks.
> > 
> > With reasonably sized vmalloc area, PMD_SIZE shouldn't be a problem
> > but x86_32 at this point is anything but reasonable in terms of
> > address space and using larger atom_size reportedly leads to frequent
> > percpu allocation failures on certain setups.
> > 
> > As there is no reason to not use PMD_SIZE on x86_64 as vmalloc space
> > is aplenty and most x86_64 configurations support PSE, fix the issue
> > by always using PMD_SIZE on x86_64 and PAGE_SIZE on x86_32.
> > 
> > v2: drop cpu_has_pse test and make x86_64 always use PMD_SIZE and
> >     x86_32 PAGE_SIZE as suggested by hpa.
> > 
> > Signed-off-by: Tejun Heo <tj@kernel.org>
> > Reported-by: Yanmin Zhang <yanmin.zhang@intel.com>
> > Reported-by: ShuoX Liu <shuox.liu@intel.com>
> > Acked-by: H. Peter Anvin <hpa@zytor.com>
> > LKML-Reference: <4F97BA98.6010001@intel.com>
> > Cc: stable@vger.kernel.org
> 
> You're carrying this, or should I?

I have another percpu fix to push so I can carry it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit
  2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
                               ` (2 preceding siblings ...)
  2012-04-27 17:39             ` [PATCH v2] " Tejun Heo
@ 2012-04-28  2:18             ` ShuoX Liu
  3 siblings, 0 replies; 16+ messages in thread
From: ShuoX Liu @ 2012-04-28  2:18 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Yanmin Zhang, linux-kernel, Christoph Lameter, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin

On 2012年04月28日 00:53, Tejun Heo wrote:

> With the embed percpu first chunk allocator, x86 uses either PAGE_SIZE
> or PMD_SIZE for atom_size.  PMD_SIZE is used when CPU supports PSE so
> that percpu areas are aligned to PMD mappings and possibly allow using
> PMD mappings in vmalloc areas in the future.  Using larger atom_size
> doesn't waste actual memory; however, it does require larger vmalloc
> space allocation later on for !first chunks.
> 
> With reasonably sized vmalloc area, PMD_SIZE shouldn't be a problem
> but x86_32 at this point is anything but reasonable in terms of
> address space and using later atom_size reportedly leads to frequent
> percpu allocation failures on certain setups.
> 
> This patch makes x86_32 always use PAGE_SIZE as atom_size for embed
> first chunk allocator.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yanmin Zhang <yanmin.zhang@intel.com>
> Reported-by: ShuoX Liu <shuox.liu@intel.com>
> LKML-Reference: <4F97BA98.6010001@intel.com>
> Cc: stable@vger.kernel.org
> ---
> Can you please verify this resolves the issue you guys are seeing?
> Once verified, I'll push it through percpu/for-3.5-fixes.
> 

Verify OK.

> Thanks.
> 
>  arch/x86/kernel/setup_percpu.c |   14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index 71f4727..ed96573 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -185,10 +185,22 @@ void __init setup_per_cpu_areas(void)
>  #endif
>  	rc = -EINVAL;
>  	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
> -		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
>  		const size_t dyn_size = PERCPU_MODULE_RESERVE +
>  			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
> +		size_t atom_size;
>  
> +		/*
> +		 * If PSE is available, use PMD_SIZE for atom_size so that
> +		 * embedded percpu areas are aligned to PMD.  This, in the
> +		 * future, can also allow using PMD mappings in vmalloc
> +		 * area.  Use PAGE_SIZE on 32bit as vmalloc space is highly
> +		 * contended and large vmalloc area alloc can easily fail.
> +		 */
> +		atom_size = PAGE_SIZE;
> +#ifdef CONFIG_X86_64
> +		if (cpu_has_pse)
> +			atom_size = PMD_SIZE;
> +#endif
>  		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
>  					    dyn_size, atom_size,
>  					    pcpu_cpu_distance,



^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2012-04-28  2:21 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-25  8:49 [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size ShuoX Liu
2012-04-25 22:24 ` Tejun Heo
2012-04-26  2:01   ` Yanmin Zhang
2012-04-26 22:49     ` Tejun Heo
2012-04-27  1:09       ` Yanmin Zhang
2012-04-27  8:56         ` Yanmin Zhang
2012-04-27 16:53           ` [PATCH] percpu, x86: don't use PMD_SIZE as embedded atom_size on 32bit Tejun Heo
2012-04-27 17:17             ` H. Peter Anvin
2012-04-27 17:27               ` Tejun Heo
2012-04-27 17:21             ` Christoph Lameter
2012-04-27 17:27               ` Tejun Heo
2012-04-27 17:39             ` [PATCH v2] " Tejun Heo
2012-04-27 17:51               ` H. Peter Anvin
2012-04-27 17:53                 ` Tejun Heo
2012-04-28  2:18             ` [PATCH] " ShuoX Liu
2012-04-27 16:34         ` [PATCH] mm: percpu: Add PCPU_FC_FIXED to pcpu_fc for setting fixed pcpu_atom_size Tejun Heo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.