linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] tvec_bases too large for per-cpu data
@ 2006-01-18 13:11 Jan Beulich
  2006-01-21  7:25 ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2006-01-18 13:11 UTC (permalink / raw)
  To: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 621 bytes --]

From: Jan Beulich <jbeulich@novell.com>

With internal Xen-enabled kernels we see the kernel's static per-cpu
data area
exceed the limit of 32k on x86-64, and even native x86-64 kernels get
fairly
close to that limit. I generally question whether it is reasonable to
have
data structures several kb in size allocated as per-cpu data when the
space
there is rather limited.
The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
archs,
over 8k on 64-bit ones), which now gets converted to use dynamically
allocated
memory instead.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>

(actual patch attached)

[-- Attachment #2: linux-2.6.16-rc1-per-cpu-tvec_bases.patch --]
[-- Type: application/octet-stream, Size: 4159 bytes --]

From: Jan Beulich <jbeulich@novell.com>

With internal Xen-enabled kernels we see the kernel's static per-cpu data area
exceed the limit of 32k on x86-64, and even native x86-64 kernels get fairly
close to that limit. I generally question whether it is reasonable to have
data structures several kb in size allocated as per-cpu data when the space
there is rather limited.
The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit archs,
over 8k on 64-bit ones), which now gets converted to use dynamically allocated
memory instead.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>

diff -Npru /home/jbeulich/tmp/linux-2.6.16-rc1/kernel/timer.c 2.6.16-rc1-per-cpu-tvec_bases/kernel/timer.c
--- /home/jbeulich/tmp/linux-2.6.16-rc1/kernel/timer.c	2006-01-18 12:39:13.000000000 +0100
+++ 2.6.16-rc1-per-cpu-tvec_bases/kernel/timer.c	2006-01-18 13:53:28.000000000 +0100
@@ -86,7 +86,8 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+static tvec_base_t boot_tvec_bases;
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
@@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base);
 void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
-	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer
 		ret = 1;
 	}
 
-	new_base = &__get_cpu_var(tvec_bases);
+	new_base = __get_cpu_var(tvec_bases);
 
 	if (base != &new_base->t_base) {
 		/*
@@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
+	tvec_base_t *base = per_cpu(tvec_bases, cpu);
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
@@ -492,7 +493,7 @@ unsigned long next_timer_interrupt(void)
 	tvec_t *varray[4];
 	int i, j;
 
-	base = &__get_cpu_var(tvec_bases);
+	base = __get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = 0;
@@ -856,7 +857,7 @@ EXPORT_SYMBOL(xtime_lock);
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
  	hrtimer_run_queues();
 	if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1209,12 +1210,31 @@ asmlinkage long sys_sysinfo(struct sysin
 	return 0;
 }
 
-static void __devinit init_timers_cpu(int cpu)
+static int __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
 
-	base = &per_cpu(tvec_bases, cpu);
+	base = per_cpu(tvec_bases, cpu);
+	if (likely(!base)) {
+		static char boot_done;
+
+		if (likely(boot_done)) {
+#ifdef CONFIG_NUMA
+			base = kmalloc_node(sizeof(*base), GFP_KERNEL, cpu_to_node(cpu));
+			if (!base)
+#endif
+				base = kmalloc(sizeof(*base), GFP_KERNEL);
+			if (!base)
+				return -ENOMEM;
+			memset(base, 0, sizeof(*base));
+		}
+		else {
+			base = &boot_tvec_bases;
+			boot_done = 1;
+		}
+		per_cpu(tvec_bases, cpu) = base;
+	}
 	spin_lock_init(&base->t_base.lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1226,6 +1246,7 @@ static void __devinit init_timers_cpu(in
 		INIT_LIST_HEAD(base->tv1.vec + j);
 
 	base->timer_jiffies = jiffies;
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1248,8 +1269,8 @@ static void __devinit migrate_timers(int
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = &per_cpu(tvec_bases, cpu);
-	new_base = &get_cpu_var(tvec_bases);
+	old_base = per_cpu(tvec_bases, cpu);
+	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
 	spin_lock(&new_base->t_base.lock);
@@ -1279,7 +1300,8 @@ static int __devinit timer_cpu_notify(st
 	long cpu = (long)hcpu;
 	switch(action) {
 	case CPU_UP_PREPARE:
-		init_timers_cpu(cpu);
+		if (init_timers_cpu(cpu) < 0)
+			return NOTIFY_BAD;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-18 13:11 [PATCH] tvec_bases too large for per-cpu data Jan Beulich
@ 2006-01-21  7:25 ` Andrew Morton
  2006-01-23 10:31   ` Jan Beulich
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2006-01-21  7:25 UTC (permalink / raw)
  To: Jan Beulich; +Cc: linux-kernel

"Jan Beulich" <JBeulich@novell.com> wrote:
>
> The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
>  archs,
>  over 8k on 64-bit ones), which now gets converted to use dynamically
>  allocated
>  memory instead.

ho hum, another pointer hop.

Did you consider using alloc_percpu()?


The patch does trickery in init_timers_cpu() which, from my reading, defers
the actual per-cpu allocation until the second CPU comes online. 
Presumably because of some ordering issue which you discovered.  Readers of
the code need to know what that issue was.

And boot_tvec_bases will always be used for the BP, and hence one slot in
the per-cpu array will forever be unused.  Until the BP is taken down and
brought back up, in which case it will suddenly start to use a dynamically
allocated structure.

But all of this modification was unchangelogged and is uncommented, so I'm
somewhat guessing here.  Please always ensure that tricksy things like this
have complete covering comments.

Also, the new code would appear to leak one tvec_base_t per cpu-unplugging?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-21  7:25 ` Andrew Morton
@ 2006-01-23 10:31   ` Jan Beulich
  2006-01-23 10:57     ` Andrew Morton
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2006-01-23 10:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

>>> Andrew Morton <akpm@osdl.org> 21.01.06 08:25:00 >>>
>"Jan Beulich" <JBeulich@novell.com> wrote:
>>
>> The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
>>  archs,
>>  over 8k on 64-bit ones), which now gets converted to use dynamically
>>  allocated
>>  memory instead.
>
>ho hum, another pointer hop.
>
>Did you consider using alloc_percpu()?

I did, but I saw drawbacks with that (most notably the fact that all instances are allocated at
once, possibly wasting a lot of memory).

>The patch does trickery in init_timers_cpu() which, from my reading, defers
>the actual per-cpu allocation until the second CPU comes online. 
>Presumably because of some ordering issue which you discovered.  Readers of
>the code need to know what that issue was.

No, I don't see any trickery there (on demand allocation in CPU_UP_PREPARE is being done
elsewhere in very similar ways), and I also didn't see any ordering issues. Hence I also didn't
see any need to explain this in detail.

>And boot_tvec_bases will always be used for the BP, and hence one slot in
>the per-cpu array will forever be unused.  Until the BP is taken down and
>brought back up, in which case it will suddenly start to use a dynamically
>allocated structure.

Why? Each slot is allocated at most once, the BP's is never allocated (it will continue to use the
static one even when brought down and back up).

>But all of this modification was unchangelogged and is uncommented, so I'm
>somewhat guessing here.  Please always ensure that tricksy things like this
>have complete covering comments.
>
>Also, the new code would appear to leak one tvec_base_t per cpu-unplugging?

Not really, as it would be re-used the next time a cpu with the same ID gets brought up (that
is, compared with the current situation there is generally less memory wasted unless all NR_CPUs
are brought up and then one or more down again, in which case the amount of space wasted
would equal [neglecting the slack space resulting from kmalloc's way of allocating]).

Jan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-23 10:31   ` Jan Beulich
@ 2006-01-23 10:57     ` Andrew Morton
  2006-01-24  8:33       ` Jan Beulich
  2006-01-30  8:43       ` [PATCH] tvec_bases too large for per-cpu data Jan Beulich
  0 siblings, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2006-01-23 10:57 UTC (permalink / raw)
  To: Jan Beulich; +Cc: linux-kernel

"Jan Beulich" <JBeulich@novell.com> wrote:
>
> >>> Andrew Morton <akpm@osdl.org> 21.01.06 08:25:00 >>>
> >"Jan Beulich" <JBeulich@novell.com> wrote:
> >>
> >> The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
> >>  archs,
> >>  over 8k on 64-bit ones), which now gets converted to use dynamically
> >>  allocated
> >>  memory instead.
> >
> >ho hum, another pointer hop.
> >
> >Did you consider using alloc_percpu()?
> 
> I did, but I saw drawbacks with that (most notably the fact that all instances are allocated at
> once, possibly wasting a lot of memory).

It's 4k for each cpu which is in the possible_map but which will never be
brought online.  I don't think that'll be a lot of memory - are there
machines which have a lot of possible-but-not-really-there CPUs?

> >The patch does trickery in init_timers_cpu() which, from my reading, defers
> >the actual per-cpu allocation until the second CPU comes online. 
> >Presumably because of some ordering issue which you discovered.  Readers of
> >the code need to know what that issue was.
> 
> No, I don't see any trickery there (on demand allocation in CPU_UP_PREPARE is being done
> elsewhere in very similar ways), and I also didn't see any ordering issues. Hence I also didn't
> see any need to explain this in detail.

There _must_ be ordering issues.  Otherwise we'd just dynamically allocate
all the structs up-front and be done with it.

Presumably the ordering issue is that init_timers() is called before
kmem_cache_init().  That's non-obvious and should be commented.

> >And boot_tvec_bases will always be used for the BP, and hence one slot in
> >the per-cpu array will forever be unused.  Until the BP is taken down and
> >brought back up, in which case it will suddenly start to use a dynamically
> >allocated structure.
> 
> Why? Each slot is allocated at most once, the BP's is never allocated (it will continue to use the
> static one even when brought down and back up).

OK, I missed the `if (likely(!base))' test in there.  Patch seems OK from
that POV and we now seem to know what the ordering problem is.

- The `#ifdef CONFIG_NUMA' in init_timers_cpu() seems to be unnecessary -
  kmalloc_node() will use kmalloc() if !NUMA.

- The likely()s in init_timers_cpu() seems fairly pointless - it's not a
  fastpath.

- We prefer to do this:

	if (expr) {
		...
	} else {
		...
	}

  and not

	if (expr) {
		...
	}
	else {
		...
	}



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-23 10:57     ` Andrew Morton
@ 2006-01-24  8:33       ` Jan Beulich
  2006-01-24  8:58         ` Andrew Morton
  2006-01-30  8:43       ` [PATCH] tvec_bases too large for per-cpu data Jan Beulich
  1 sibling, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2006-01-24  8:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

>> >Did you consider using alloc_percpu()?
>> 
>> I did, but I saw drawbacks with that (most notably the fact that all instances are allocated at
>> once, possibly wasting a lot of memory).
>
>It's 4k for each cpu which is in the possible_map but which will never be
>brought online.  I don't think that'll be a lot of memory - are there
>machines which have a lot of possible-but-not-really-there CPUs?

I would suppose so. Why wouldn't a machine supporting CPU hotplug reasonably be able to double,
triple, etc the number of CPUs originally present?

>There _must_ be ordering issues.  Otherwise we'd just dynamically allocate
>all the structs up-front and be done with it.
>
>Presumably the ordering issue is that init_timers() is called before
>kmem_cache_init().  That's non-obvious and should be commented.

That I can easily do, sure.

>- The `#ifdef CONFIG_NUMA' in init_timers_cpu() seems to be unnecessary -
>  kmalloc_node() will use kmalloc() if !NUMA.

That is correct, but I wanted the fallback if kmalloc_node() fails (from briefly looking at that code it didn't
seem like it would do such fallback itself). And calling kmalloc() twice if !NUMA seemed pointless.

>- The likely()s in init_timers_cpu() seems fairly pointless - it's not a
>  fastpath.

OK, will change that.

>- We prefer to do this:
>
>	if (expr) {
>		...
>	} else {
>		...
>	}
>
>  and not
>
>	if (expr) {
>		...
>	}
>	else {
>		...
>	}

I can change that, too, but I don't see why this gets pointed out again and again when there really
is no consistency across the entire kernel...

Jan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-24  8:33       ` Jan Beulich
@ 2006-01-24  8:58         ` Andrew Morton
  2006-01-24 14:46           ` [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference Eric Dumazet
  0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2006-01-24  8:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: linux-kernel

"Jan Beulich" <JBeulich@novell.com> wrote:
>
> >- The `#ifdef CONFIG_NUMA' in init_timers_cpu() seems to be unnecessary -
> >  kmalloc_node() will use kmalloc() if !NUMA.
> 
> That is correct, but I wanted the fallback if kmalloc_node() fails (from briefly looking at that code it didn't
> seem like it would do such fallback itself). And calling kmalloc() twice if !NUMA seemed pointless.

I'm sorry, but how were we to know that?  Telepathy?

GFP_KERNEL for these size objects is pretty much infallible anyway, so I
wouldn't worry about it.  Or allocate all needed storage a little later in
boot with alloc_percpu().


> >- We prefer to do this:
> >
> >	if (expr) {
> >		...
> >	} else {
> >		...
> >	}
> >
> >  and not
> >
> >	if (expr) {
> >		...
> >	}
> >	else {
> >		...
> >	}
> 
> I can change that, too, but I don't see why this gets pointed out again and again when there really
> is no consistency across the entire kernel...
> 

We fix these things up when working on incorrectly laid-out code, but we
rarely bother raising a patch just to fix something like that.  We'll get
there eventually.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference
  2006-01-24  8:58         ` Andrew Morton
@ 2006-01-24 14:46           ` Eric Dumazet
  2006-01-24 14:53             ` Andi Kleen
  2006-02-01  9:21             ` [PATCH] [SMP] __GENERIC_PER_CPU changes Eric Dumazet
  0 siblings, 2 replies; 11+ messages in thread
From: Eric Dumazet @ 2006-01-24 14:46 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Beulich, linux-kernel, Andi Kleen, shai, kiran, pravins

[-- Attachment #1: Type: text/plain, Size: 802 bytes --]

percpu_data blindly allocates bootmem memory to store NR_CPUS instances of 
cpudata, instead of allocating memory only for possible cpus.

This patch saves ram, allocating num_possible_cpus() (instead of NR_CPUS) 
instances.

This patch also makes sure a reference to per_cpu(object, not_possible_cpu)
causes a reference to invalid memory (NULL+small_offset).

As some architectures (x86_64) are now allocating cpudata only on possible 
cpus, we (kernel developers on x86 machines) should make sure that x86 does a 
similar thing to find bugs. It is important that this patch has some 
exposure in -mm for some time; some places must now use :

for_each_cpu(i) {
	... per_cpu(xxx,i) ...

instead of the traditional

for (i = 0 ; i < NR_CPUS ; i++)


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: setup_per_cpu_areas.patch --]
[-- Type: text/plain, Size: 3639 bytes --]

--- linux-2.6.16-rc1-mm2/init/main.c	2006-01-24 15:45:28.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/init/main.c	2006-01-24 16:31:53.000000000 +0100
@@ -334,6 +334,7 @@
 {
 	unsigned long size, i;
 	char *ptr;
+	unsigned long nr_possible_cpus = num_possible_cpus();
 
 	/* Copy section for each CPU (we discard the original) */
 	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
@@ -341,12 +342,16 @@
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
+	ptr = alloc_bootmem(size * nr_possible_cpus);
 
-	ptr = alloc_bootmem(size * NR_CPUS);
-
-	for (i = 0; i < NR_CPUS; i++, ptr += size) {
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i)) {
+			__per_cpu_offset[i] = (char*)0 - __per_cpu_start;
+			continue;
+		}
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		ptr += size;
 	}
 }
 #endif /* !__GENERIC_PER_CPU */
--- linux-2.6.16-rc1-mm2/block/ll_rw_blk.c	2006-01-24 15:57:29.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/block/ll_rw_blk.c	2006-01-24 15:57:40.000000000 +0100
@@ -3536,7 +3536,7 @@
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
--- linux-2.6.16-rc1-mm2/drivers/scsi/scsi.c	2006-01-24 16:04:28.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/drivers/scsi/scsi.c	2006-01-24 16:04:40.000000000 +0100
@@ -1245,7 +1245,7 @@
 	if (error)
 		goto cleanup_sysctl;
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(scsi_done_q, i));
 
 	devfs_mk_dir("scsi");
--- linux-2.6.16-rc1-mm2/net/core/utils.c	2006-01-24 16:14:40.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/net/core/utils.c	2006-01-24 16:16:37.000000000 +0100
@@ -121,7 +121,7 @@
 {
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, i+jiffies);
 	}
@@ -133,7 +133,7 @@
 	unsigned long seed[NR_CPUS];
 
 	get_random_bytes(seed, sizeof(seed));
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, seed[i]);
 	}
--- linux-2.6.16-rc1-mm2/net/core/dev.c	2006-01-24 16:24:24.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/net/core/dev.c	2006-01-24 16:24:46.000000000 +0100
@@ -3240,7 +3240,7 @@
 	 *	Initialise the packet receive queues.
 	 */
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct softnet_data *queue;
 
 		queue = &per_cpu(softnet_data, i);
--- linux-2.6.16-rc1-mm2/net/ipv4/proc.c	2006-01-24 16:18:13.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/net/ipv4/proc.c	2006-01-24 16:19:10.000000000 +0100
@@ -49,7 +49,7 @@
 	int res = 0;
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
--- linux-2.6.16-rc1-mm2/net/ipv6/proc.c	2006-01-24 16:18:40.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/net/ipv6/proc.c	2006-01-24 16:19:10.000000000 +0100
@@ -38,7 +38,7 @@
 	int res = 0;
 	int cpu;
 
-	for (cpu=0; cpu<NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
--- linux-2.6.16-rc1-mm2/net/socket.c	2006-01-24 16:19:01.000000000 +0100
+++ linux-2.6.16-rc1-mm2-ed/net/socket.c	2006-01-24 16:19:10.000000000 +0100
@@ -2079,7 +2079,7 @@
 	int cpu;
 	int counter = 0;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		counter += per_cpu(sockets_in_use, cpu);
 
 	/* It can be negative, by the way. 8) */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference
  2006-01-24 14:46           ` [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference Eric Dumazet
@ 2006-01-24 14:53             ` Andi Kleen
  2006-02-01  9:21             ` [PATCH] [SMP] __GENERIC_PER_CPU changes Eric Dumazet
  1 sibling, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2006-01-24 14:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrew Morton, Jan Beulich, linux-kernel, shai, kiran, pravins

On Tuesday 24 January 2006 15:46, Eric Dumazet wrote:

> As some architectures (x86_64) are now allocating cpudata only on possible
> cpus, we (kernel developers on x86 machines) should make sure that x86 does
> a similar thing to find bugs. This is important that this patch has some
> exposure in -mm for some time, some places must now use :

The for_each_cpu changes should be actually merged ASAP because x86-64 
needs them (or maybe i should back out that change until the kernel is ready?)
But it really saves a lot of memory - several MB on a distro kernel which is 
compiled with NR_CPUS==128.

-Andi

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-23 10:57     ` Andrew Morton
  2006-01-24  8:33       ` Jan Beulich
@ 2006-01-30  8:43       ` Jan Beulich
  2006-01-31 22:27         ` Andrew Morton
  1 sibling, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2006-01-30  8:43 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2625 bytes --]

Hopefully attached revised patch addresses all concerns mentioned (except that it intentionally continues to not use
alloc_percpu()).

Jan

>>> Andrew Morton <akpm@osdl.org> 23.01.06 11:57:02 >>>
"Jan Beulich" <JBeulich@novell.com> wrote:
>
> >>> Andrew Morton <akpm@osdl.org> 21.01.06 08:25:00 >>>
> >"Jan Beulich" <JBeulich@novell.com> wrote:
> >>
> >> The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
> >>  archs,
> >>  over 8k on 64-bit ones), which now gets converted to use dynamically
> >>  allocated
> >>  memory instead.
> >
> >ho hum, another pointer hop.
> >
> >Did you consider using alloc_percpu()?
> 
> I did, but I saw drawbacks with that (most notably the fact that all instances are allocated at
> once, possibly wasting a lot of memory).

It's 4k for each cpu which is in the possible_map but which will never be
brought online.  I don't think that'll be a lot of memory - are there
machines which have a lot of possible-but-not-really-there CPUs?

> >The patch does trickery in init_timers_cpu() which, from my reading, defers
> >the actual per-cpu allocation until the second CPU comes online. 
> >Presumably because of some ordering issue which you discovered.  Readers of
> >the code need to know what that issue was.
> 
> No, I don't see any trickery there (on demand allocation in CPU_UP_PREPARE is being done
> elsewhere in very similar ways), and I also didn't see any ordering issues. Hence I also didn't
> see any need to explain this in detail.

There _must_ be ordering issues.  Otherwise we'd just dynamically allocate
all the structs up-front and be done with it.

Presumably the ordering issue is that init_timers() is called before
kmem_cache_init().  That's non-obvious and should be commented.

> >And boot_tvec_bases will always be used for the BP, and hence one slot in
> >the per-cpu array will forever be unused.  Until the BP is taken down and
> >brought back up, in which case it will suddenly start to use a dynamically
> >allocated structure.
> 
> Why? Each slot is allocated at most once, the BP's is never allocated (it will continue to use the
> static one even when brought down and back up).

OK, I missed the `if (likely(!base))' test in there.  Patch seems OK from
that POV and we now seem to know what the ordering problem is.

- The `#ifdef CONFIG_NUMA' in init_timers_cpu() seems to be unnecessary -
  kmalloc_node() will use kmalloc() if !NUMA.

- The likely()s in init_timers_cpu() seems fairly pointless - it's not a
  fastpath.

- We prefer to do this:

	if (expr) {
		...
	} else {
		...
	}

  and not

	if (expr) {
		...
	}
	else {
		...
	}



[-- Attachment #2: linux-2.6.16-rc1-per-cpu-tvec_bases.patch --]
[-- Type: text/plain, Size: 4397 bytes --]

From: Jan Beulich <jbeulich@novell.com>

With internal Xen-enabled kernels we see the kernel's static per-cpu data area
exceed the limit of 32k on x86-64, and even native x86-64 kernels get fairly
close to that limit. I generally question whether it is reasonable to have
data structures several kb in size allocated as per-cpu data when the space
there is rather limited.
The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit archs,
over 8k on 64-bit ones), which now gets converted to use dynamically allocated
memory instead.

Signed-Off-By: Jan Beulich <jbeulich@novell.com>

diff -Npru /home/jbeulich/tmp/linux-2.6.16-rc1/kernel/timer.c 2.6.16-rc1-per-cpu-tvec_bases/kernel/timer.c
--- /home/jbeulich/tmp/linux-2.6.16-rc1/kernel/timer.c	2006-01-27 15:10:49.000000000 +0100
+++ 2.6.16-rc1-per-cpu-tvec_bases/kernel/timer.c	2006-01-27 16:22:35.000000000 +0100
@@ -86,7 +86,8 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+static tvec_base_t boot_tvec_bases;
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
@@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base);
 void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
-	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer
 		ret = 1;
 	}
 
-	new_base = &__get_cpu_var(tvec_bases);
+	new_base = __get_cpu_var(tvec_bases);
 
 	if (base != &new_base->t_base) {
 		/*
@@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
+	tvec_base_t *base = per_cpu(tvec_bases, cpu);
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
@@ -492,7 +493,7 @@ unsigned long next_timer_interrupt(void)
 	tvec_t *varray[4];
 	int i, j;
 
-	base = &__get_cpu_var(tvec_bases);
+	base = __get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = 0;
@@ -856,7 +857,7 @@ EXPORT_SYMBOL(xtime_lock);
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
  	hrtimer_run_queues();
 	if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1209,12 +1210,34 @@ asmlinkage long sys_sysinfo(struct sysin
 	return 0;
 }
 
-static void __devinit init_timers_cpu(int cpu)
+static int __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
 
-	base = &per_cpu(tvec_bases, cpu);
+	base = per_cpu(tvec_bases, cpu);
+	if (!base) {
+		static char boot_done;
+
+		/* Cannot do allocation in init_timers as that runs before the
+		   allocator initializes (and would waste memory if there are
+		   more possible CPUs than will ever be installed/brought up). */
+		if (boot_done) {
+#ifdef CONFIG_NUMA
+			base = kmalloc_node(sizeof(*base), GFP_KERNEL, cpu_to_node(cpu));
+			if (!base)
+				/* Just in case, fall back to normal allocation. */
+#endif
+				base = kmalloc(sizeof(*base), GFP_KERNEL);
+			if (!base)
+				return -ENOMEM;
+			memset(base, 0, sizeof(*base));
+		} else {
+			base = &boot_tvec_bases;
+			boot_done = 1;
+		}
+		per_cpu(tvec_bases, cpu) = base;
+	}
 	spin_lock_init(&base->t_base.lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1226,6 +1249,7 @@ static void __devinit init_timers_cpu(in
 		INIT_LIST_HEAD(base->tv1.vec + j);
 
 	base->timer_jiffies = jiffies;
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1248,8 +1272,8 @@ static void __devinit migrate_timers(int
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = &per_cpu(tvec_bases, cpu);
-	new_base = &get_cpu_var(tvec_bases);
+	old_base = per_cpu(tvec_bases, cpu);
+	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
 	spin_lock(&new_base->t_base.lock);
@@ -1279,7 +1303,8 @@ static int __devinit timer_cpu_notify(st
 	long cpu = (long)hcpu;
 	switch(action) {
 	case CPU_UP_PREPARE:
-		init_timers_cpu(cpu);
+		if (init_timers_cpu(cpu) < 0)
+			return NOTIFY_BAD;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] tvec_bases too large for per-cpu data
  2006-01-30  8:43       ` [PATCH] tvec_bases too large for per-cpu data Jan Beulich
@ 2006-01-31 22:27         ` Andrew Morton
  0 siblings, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2006-01-31 22:27 UTC (permalink / raw)
  To: Jan Beulich; +Cc: linux-kernel

"Jan Beulich" <JBeulich@novell.com> wrote:
>
> Hopefully attached revised patch addresses all concerns mentioned (except that it intentionally continues to not use
> alloc_percpu()).

This remains unaddressed:


+#ifdef CONFIG_NUMA
+			base = kmalloc_node(sizeof(*base), GFP_KERNEL, cpu_to_node(cpu));
+			if (!base)
+				/* Just in case, fall back to normal allocation. */
+#endif
+				base = kmalloc(sizeof(*base), GFP_KERNEL);
+			if (!base)
+				return -ENOMEM;


If we cannot allocate memory on this node for this CPU (wildly unlikely,
btw) we face a choice.  Either

a) Allocate memory from some other node, making the machine mysteriously
   run slower for the remainder of its uptime or

b) Fail the CPU bringup.

I think b) is better - it tells the operator that something went wrong, so
some sort of corrective action can be taken.  I suspect that most operators
would prefer that to having the kernel secretly make their machine run
slower.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] [SMP] __GENERIC_PER_CPU changes
  2006-01-24 14:46           ` [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference Eric Dumazet
  2006-01-24 14:53             ` Andi Kleen
@ 2006-02-01  9:21             ` Eric Dumazet
  1 sibling, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2006-02-01  9:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Jan Beulich, linux-kernel, Andi Kleen, shai, kiran, pravins

[-- Attachment #1: Type: text/plain, Size: 705 bytes --]

Now CONFIG_DEBUG_INITDATA is in, initial percpu data 
[__per_cpu_start,__per_cpu_end] can be declared as a redzone, and invalid 
accesses after boot can be detected, at least for i386.

We can let non possible cpus percpu data point to this 'redzone' instead of NULL .

NULL was not a good choice because part of [0..32768] memory may be readable 
and invalid accesses may happen unnoticed.

If CONFIG_DEBUG_INITDATA is not defined, each non possible cpu points to the 
initial percpu data (__per_cpu_offset[cpu] == 0), thus invalid accesses won't 
be detected/crash.

This patch also moves __per_cpu_offset[] to read_mostly area to avoid false 
sharing.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

[-- Attachment #2: init_main.patch --]
[-- Type: text/plain, Size: 662 bytes --]

--- a/init/main.c	2006-02-01 10:44:10.000000000 +0100
+++ b/init/main.c	2006-02-01 10:50:16.000000000 +0100
@@ -325,7 +325,7 @@
 #else
 
 #ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_offset);
 
@@ -343,11 +343,7 @@
 #endif
 	ptr = alloc_bootmem(size * nr_possible_cpus);
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i)) {
-			__per_cpu_offset[i] = (char*)0 - __per_cpu_start;
-			continue;
-		}
+	for_each_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 		ptr += size;

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2006-02-01  9:21 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-01-18 13:11 [PATCH] tvec_bases too large for per-cpu data Jan Beulich
2006-01-21  7:25 ` Andrew Morton
2006-01-23 10:31   ` Jan Beulich
2006-01-23 10:57     ` Andrew Morton
2006-01-24  8:33       ` Jan Beulich
2006-01-24  8:58         ` Andrew Morton
2006-01-24 14:46           ` [PATCH] [SMP] reduce size of percpudata, and make sure per_cpu(object, not_possible_cpu) cause an invalid memory reference Eric Dumazet
2006-01-24 14:53             ` Andi Kleen
2006-02-01  9:21             ` [PATCH] [SMP] __GENERIC_PER_CPU changes Eric Dumazet
2006-01-30  8:43       ` [PATCH] tvec_bases too large for per-cpu data Jan Beulich
2006-01-31 22:27         ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).