* [PATCH] kmalloc_percpu
@ 2003-05-05  8:08 Rusty Russell
  2003-05-05  8:47 ` Andrew Morton
  2003-05-07  5:51 ` Ravikiran G Thirumalai
  0 siblings, 2 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-05  8:08 UTC (permalink / raw)
  To: akpm; +Cc: Dipankar Sarma, linux-kernel

Hi Andrew,

	This is the kmalloc_percpu patch.  I have another patch which
tests the allocator if you want to see that too.  This is the precursor
to per-cpu stuff in modules, but it also allows other space gains for
structures which currently embed per-cpu arrays (e.g. your fuzzy counters).
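
To make the interface concrete, usage looks something like this (the
structure and names here are illustrative only, not from the patch):

	/* Hypothetical per-cpu counter; `struct my_stats' is made up. */
	struct my_stats { unsigned long events; };
	static struct my_stats *stats;

	stats = kmalloc_percpu(struct my_stats);  /* zeroed on every cpu */
	if (!stats)
		return -ENOMEM;

	get_cpu_ptr(stats)->events++;	/* this cpu's copy, preempt disabled */
	put_cpu_ptr(stats);		/* re-enable preemption */
	...
	kfree_percpu(stats);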

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: kmalloc_percpu to use same percpu operators
Author: Rusty Russell
Status: Tested on 2.5.68-bk11

D: By overallocating the per-cpu data at boot, we can make quite an
D: efficient allocator, and then use it to support per-cpu data in
D: modules (next patch).

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/asm-generic/percpu.h .29880-linux-2.5.69.updated/include/asm-generic/percpu.h
--- .29880-linux-2.5.69/include/asm-generic/percpu.h	2003-01-02 12:32:47.000000000 +1100
+++ .29880-linux-2.5.69.updated/include/asm-generic/percpu.h	2003-05-05 17:36:25.000000000 +1000
@@ -2,37 +2,11 @@
 #define _ASM_GENERIC_PERCPU_H_
 #include <linux/compiler.h>
 
-#define __GENERIC_PER_CPU
+/* Some archs may want to keep __per_cpu_offset for this CPU in a register,
+   or do their own allocation. */
 #ifdef CONFIG_SMP
-
-extern unsigned long __per_cpu_offset[NR_CPUS];
-
-/* Separate out the type, so (int[3], foo) works. */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-
-#else /* ! SMP */
-
-/* Can't define per-cpu variables in modules.  Sorry --RR */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) name##__per_cpu
-#endif
-
-#define per_cpu(var, cpu)			((void)cpu, var##__per_cpu)
-#define __get_cpu_var(var)			var##__per_cpu
-
-#endif	/* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
-
+#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
+#define __NEED_SETUP_PER_CPU_AREAS
+#endif /* SMP */
 #endif /* _ASM_GENERIC_PERCPU_H_ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/linux/genhd.h .29880-linux-2.5.69.updated/include/linux/genhd.h
--- .29880-linux-2.5.69/include/linux/genhd.h	2003-05-05 12:37:12.000000000 +1000
+++ .29880-linux-2.5.69.updated/include/linux/genhd.h	2003-05-05 17:36:25.000000000 +1000
@@ -160,10 +160,9 @@ static inline void disk_stat_set_all(str
 #ifdef  CONFIG_SMP
 static inline int init_disk_stats(struct gendisk *disk)
 {
-	disk->dkstats = kmalloc_percpu(sizeof (struct disk_stats), GFP_KERNEL);
+	disk->dkstats = kmalloc_percpu(struct disk_stats);
 	if (!disk->dkstats)
 		return 0;
-	disk_stat_set_all(disk, 0);
 	return 1;
 }
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/linux/percpu.h .29880-linux-2.5.69.updated/include/linux/percpu.h
--- .29880-linux-2.5.69/include/linux/percpu.h	2003-02-07 19:20:01.000000000 +1100
+++ .29880-linux-2.5.69.updated/include/linux/percpu.h	2003-05-05 17:36:25.000000000 +1000
@@ -1,71 +1,85 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
-#include <linux/spinlock.h> /* For preempt_disable() */
-#include <linux/slab.h> /* For kmalloc_percpu() */
+#include <linux/preempt.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc() */
+#include <linux/cache.h>
+#include <linux/string.h>
+#include <asm/bug.h>
 #include <asm/percpu.h>
 
-/* Must be an lvalue. */
+/* Total pool for percpu data (for each CPU). */
+#ifndef PERCPU_POOL_SIZE
+#define PERCPU_POOL_SIZE 32768
+#endif
+
+/* For variables declared with DECLARE_PER_CPU()/DEFINE_PER_CPU(). */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
+/* Also, per_cpu(var, cpu) to get another cpu's value. */
+
+/* For ptrs allocated with kmalloc_percpu */
+#define get_cpu_ptr(ptr) ({ preempt_disable(); __get_cpu_ptr(ptr); })
+#define put_cpu_ptr(ptr) preempt_enable()
+/* Also, per_cpu_ptr(ptr, cpu) to get another cpu's value. */
 
 #ifdef CONFIG_SMP
 
-struct percpu_data {
-	void *ptrs[NR_CPUS];
-	void *blkp;
-};
+/* __alloc_percpu zeros memory for every cpu, as a convenience. */
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void kfree_percpu(const void *);
 
-/* 
- * Use this to get to a cpu's version of the per-cpu object allocated using
- * kmalloc_percpu.  If you want to get "this cpu's version", maybe you want
- * to use get_cpu_ptr... 
- */ 
-#define per_cpu_ptr(ptr, cpu)                   \
-({                                              \
-        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	\
-})
+extern unsigned long __per_cpu_offset[NR_CPUS];
 
-extern void *kmalloc_percpu(size_t size, int flags);
-extern void kfree_percpu(const void *);
-extern void kmalloc_percpu_init(void);
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
 
-#else /* CONFIG_SMP */
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define per_cpu_ptr(ptr, cpu) ((__typeof__(ptr))(RELOC_HIDE(ptr, __per_cpu_offset[cpu])))
 
-#define per_cpu_ptr(ptr, cpu) (ptr)
+extern void setup_per_cpu_areas(void);
+#else /* !CONFIG_SMP */
 
-static inline void *kmalloc_percpu(size_t size, int flags)
+/* Can't define per-cpu variables in modules.  Sorry --RR */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+    __typeof__(type) name##__per_cpu
+#endif
+
+#define per_cpu(var, cpu)			((void)(cpu), var##__per_cpu)
+#define __get_cpu_var(var)			var##__per_cpu
+#define per_cpu_ptr(ptr, cpu)			((void)(cpu), (ptr))
+#define __get_cpu_ptr(ptr)			(ptr)
+
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
-	return(kmalloc(size, flags));
+	void *ret;
+	/* kmalloc always cacheline aligns. */
+	BUG_ON(align > SMP_CACHE_BYTES);
+	BUG_ON(size > PERCPU_POOL_SIZE/2);
+	ret = kmalloc(size, GFP_KERNEL);
+	if (ret)
+		memset(ret, 0, size);
+	return ret;
 }
 static inline void kfree_percpu(const void *ptr)
 {	
 	kfree(ptr);
 }
-static inline void kmalloc_percpu_init(void) { }
 
+static inline void setup_per_cpu_areas(void) { }
 #endif /* CONFIG_SMP */
 
-/* 
- * Use these with kmalloc_percpu. If
- * 1. You want to operate on memory allocated by kmalloc_percpu (dereference
- *    and read/modify/write)  AND 
- * 2. You want "this cpu's version" of the object AND 
- * 3. You want to do this safely since:
- *    a. On multiprocessors, you don't want to switch between cpus after 
- *    you've read the current processor id due to preemption -- this would 
- *    take away the implicit  advantage to not have any kind of traditional 
- *    serialization for per-cpu data
- *    b. On uniprocessors, you don't want another kernel thread messing
- *    up with the same per-cpu data due to preemption
- *    
- * So, Use get_cpu_ptr to disable preemption and get pointer to the 
- * local cpu version of the per-cpu object. Use put_cpu_ptr to enable
- * preemption.  Operations on per-cpu data between get_ and put_ is
- * then considered to be safe. And ofcourse, "Thou shalt not sleep between 
- * get_cpu_ptr and put_cpu_ptr"
- */
-#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
-#define put_cpu_ptr(ptr) put_cpu()
+/* Simple wrapper for the common case.  Zeros memory. */
+#define kmalloc_percpu(type) \
+	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
 
 #endif /* __LINUX_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/include/net/ipv6.h .29880-linux-2.5.69.updated/include/net/ipv6.h
--- .29880-linux-2.5.69/include/net/ipv6.h	2003-05-05 12:37:12.000000000 +1000
+++ .29880-linux-2.5.69.updated/include/net/ipv6.h	2003-05-05 17:36:25.000000000 +1000
@@ -145,7 +145,7 @@ extern atomic_t			inet6_sock_nr;
 
 int snmp6_register_dev(struct inet6_dev *idev);
 int snmp6_unregister_dev(struct inet6_dev *idev);
-int snmp6_mib_init(void *ptr[2], size_t mibsize);
+int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign);
 void snmp6_mib_free(void *ptr[2]);
 
 struct ip6_ra_chain
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/init/main.c .29880-linux-2.5.69.updated/init/main.c
--- .29880-linux-2.5.69/init/main.c	2003-05-05 12:37:13.000000000 +1000
+++ .29880-linux-2.5.69.updated/init/main.c	2003-05-05 17:36:25.000000000 +1000
@@ -301,35 +301,10 @@ static void __init smp_init(void)
 #define smp_init()	do { } while (0)
 #endif
 
-static inline void setup_per_cpu_areas(void) { }
 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
 #else
 
-#ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
-
-static void __init setup_per_cpu_areas(void)
-{
-	unsigned long size, i;
-	char *ptr;
-	/* Created by linker magic */
-	extern char __per_cpu_start[], __per_cpu_end[];
-
-	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-	if (!size)
-		return;
-
-	ptr = alloc_bootmem(size * NR_CPUS);
-
-	for (i = 0; i < NR_CPUS; i++, ptr += size) {
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, size);
-	}
-}
-#endif /* !__GENERIC_PER_CPU */
-
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
 {
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/kernel/ksyms.c .29880-linux-2.5.69.updated/kernel/ksyms.c
--- .29880-linux-2.5.69/kernel/ksyms.c	2003-05-05 12:37:13.000000000 +1000
+++ .29880-linux-2.5.69.updated/kernel/ksyms.c	2003-05-05 17:36:25.000000000 +1000
@@ -99,7 +99,7 @@ EXPORT_SYMBOL(remove_shrinker);
 EXPORT_SYMBOL(kmalloc);
 EXPORT_SYMBOL(kfree);
 #ifdef CONFIG_SMP
-EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
 EXPORT_SYMBOL(kfree_percpu);
 EXPORT_SYMBOL(percpu_counter_mod);
 #endif
@@ -607,7 +607,7 @@ EXPORT_SYMBOL(init_thread_union);
 EXPORT_SYMBOL(tasklist_lock);
 EXPORT_SYMBOL(find_task_by_pid);
 EXPORT_SYMBOL(next_thread);
-#if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
+#if defined(CONFIG_SMP)
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/Makefile .29880-linux-2.5.69.updated/mm/Makefile
--- .29880-linux-2.5.69/mm/Makefile	2003-02-11 14:26:20.000000000 +1100
+++ .29880-linux-2.5.69.updated/mm/Makefile	2003-05-05 17:36:25.000000000 +1000
@@ -12,3 +12,4 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SMP)	+= percpu.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/percpu.c .29880-linux-2.5.69.updated/mm/percpu.c
--- .29880-linux-2.5.69/mm/percpu.c	1970-01-01 10:00:00.000000000 +1000
+++ .29880-linux-2.5.69.updated/mm/percpu.c	2003-05-05 17:36:26.000000000 +1000
@@ -0,0 +1,204 @@
+/*
+ * Dynamic per-cpu allocation.
+ * Originally by Dipankar Sarma <dipankar@in.ibm.com>
+ * This version (C) 2003 Rusty Russell, IBM Corporation.
+ */
+
+/* Simple allocator: we don't stress it hard, but do want it
+   fairly space-efficient. */
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/semaphore.h>
+
+static DECLARE_MUTEX(pcpu_lock);
+
+struct pcpu_block
+{
+	/* Number of blocks used and allocated. */
+	unsigned short num_used, num_allocated;
+
+	/* Size of each block.  -ve means used. */
+	int size[0];
+};
+static struct pcpu_block *pcpu; /* = NULL */
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+/* Splits a block into two.  Reallocs pcpu if necessary. */
+static int split_block(unsigned int i, unsigned short size)
+{
+	/* Reallocation required? */
+	if (pcpu->num_used + 1 > pcpu->num_allocated) {
+		struct pcpu_block *new;
+
+		new = kmalloc(sizeof(*pcpu)
+			      + sizeof(pcpu->size[0]) * pcpu->num_allocated*2,
+			      GFP_KERNEL);
+		if (!new)
+			return 0;
+		new->num_used = pcpu->num_used;
+		new->num_allocated = pcpu->num_allocated * 2;
+		memcpy(new->size, pcpu->size,
+		       sizeof(pcpu->size[0])*pcpu->num_used);
+		kfree(pcpu);
+		pcpu = new;
+	}
+
+	/* Insert a new subblock */
+	memmove(&pcpu->size[i+1], &pcpu->size[i],
+		sizeof(pcpu->size[0]) * (pcpu->num_used - i));
+	pcpu->num_used++;
+
+	pcpu->size[i+1] -= size;
+	pcpu->size[i] = size;
+	return 1;
+}
+
+static inline unsigned int abs(int val)
+{
+	if (val < 0)
+		return -val;
+	return val;
+}
+
+static inline void zero_all(void *pcpuptr, unsigned int size)
+{
+	unsigned int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		memset(per_cpu_ptr(pcpuptr, i), 0, size);
+}
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+	unsigned long extra;
+	unsigned int i;
+	void *ptr;
+
+	BUG_ON(align > SMP_CACHE_BYTES);
+	BUG_ON(size > PERCPU_POOL_SIZE/2);
+
+	down(&pcpu_lock);
+	ptr = __per_cpu_start;
+	for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+		/* Extra for alignment requirement. */
+		extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+
+		/* Allocated or not large enough? */
+		if (pcpu->size[i] < 0 || pcpu->size[i] < extra + size)
+			continue;
+
+		/* Transfer extra to previous block. */
+		if (pcpu->size[i-1] < 0)
+			pcpu->size[i-1] -= extra;
+		else
+			pcpu->size[i-1] += extra;
+		pcpu->size[i] -= extra;
+		ptr += extra;
+
+		/* Split block if warranted */
+		if (pcpu->size[i] - size > sizeof(unsigned long))
+			if (!split_block(i, size))
+				break;
+
+		/* Mark allocated */
+		pcpu->size[i] = -pcpu->size[i];
+		zero_all(ptr, size);
+		goto out;
+	}
+	ptr = NULL;
+ out:
+	up(&pcpu_lock);
+	return ptr;
+}
+
+void kfree_percpu(const void *freeme)
+{
+	unsigned int i;
+	void *ptr = __per_cpu_start;
+
+	down(&pcpu_lock);
+	for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+		if (ptr == freeme) {
+			/* Double free? */
+			BUG_ON(pcpu->size[i] > 0);
+			/* Block 0 is for non-dynamic per-cpu data. */
+			BUG_ON(i == 0);
+			pcpu->size[i] = -pcpu->size[i];
+			goto merge;
+		}
+	}
+	BUG();
+
+ merge:
+	/* Merge with previous? */
+	if (pcpu->size[i-1] >= 0) {
+		pcpu->size[i-1] += pcpu->size[i];
+		pcpu->num_used--;
+		memmove(&pcpu->size[i], &pcpu->size[i+1],
+			(pcpu->num_used - i) * sizeof(pcpu->size[0]));
+		i--;
+	}
+	/* Merge with next? */
+	if (i+1 < pcpu->num_used && pcpu->size[i+1] >= 0) {
+		pcpu->size[i] += pcpu->size[i+1];
+		pcpu->num_used--;
+		memmove(&pcpu->size[i+1], &pcpu->size[i+2],
+			(pcpu->num_used - (i+1)) * sizeof(pcpu->size[0]));
+	}
+
+	/* There's always one block: the core kernel one. */
+	BUG_ON(pcpu->num_used == 0);
+	up(&pcpu_lock);
+}
+
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+
+#define PERCPU_INIT_BLOCKS 4
+
+#ifdef __NEED_SETUP_PER_CPU_AREAS
+/* Generic version: allocates for all NR_CPUs. */
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long i;
+	void *ptr;
+
+	ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
+
+	/* Don't panic yet, they won't see it */
+	if (__per_cpu_end - __per_cpu_start > PERCPU_POOL_SIZE)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++, ptr += PERCPU_POOL_SIZE) {
+		__per_cpu_offset[i] = ptr - (void *)__per_cpu_start;
+		/* Copy section for each CPU (we discard the original) */
+		memcpy(__per_cpu_start + __per_cpu_offset[i],
+		       __per_cpu_start,
+		       __per_cpu_end - __per_cpu_start);
+	}
+}
+#endif
+
+static int init_alloc_percpu(void)
+{
+	printk("Per-cpu data: %Zu of %u bytes\n",
+	       __per_cpu_end - __per_cpu_start, PERCPU_POOL_SIZE);
+
+	if (__per_cpu_end - __per_cpu_start > PERCPU_POOL_SIZE)
+		panic("Too much per-cpu data.\n");
+
+	pcpu = kmalloc(sizeof(*pcpu)+sizeof(pcpu->size[0])*PERCPU_INIT_BLOCKS,
+		       GFP_KERNEL);
+	pcpu->num_allocated = PERCPU_INIT_BLOCKS;
+	pcpu->num_used = 2;
+	pcpu->size[0] = -(__per_cpu_end - __per_cpu_start);
+	pcpu->size[1] = PERCPU_POOL_SIZE-(__per_cpu_end - __per_cpu_start);
+
+	return 0;
+}
+
+__initcall(init_alloc_percpu);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/mm/slab.c .29880-linux-2.5.69.updated/mm/slab.c
--- .29880-linux-2.5.69/mm/slab.c	2003-04-20 18:05:16.000000000 +1000
+++ .29880-linux-2.5.69.updated/mm/slab.c	2003-05-05 17:36:25.000000000 +1000
@@ -1902,54 +1902,6 @@ void * kmalloc (size_t size, int flags)
 	return NULL;
 }
 
-#ifdef CONFIG_SMP
-/**
- * kmalloc_percpu - allocate one copy of the object for every present
- * cpu in the system.
- * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
- * macros only.
- *
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user.  May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
- */
-void *
-kmalloc_percpu(size_t size, int flags)
-{
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
-	if (!pdata)
-		return NULL;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		pdata->ptrs[i] = kmalloc(size, flags);
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-	}
-
-	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
-	kfree(pdata);
-	return NULL;
-}
-#endif
-
 /**
  * kmem_cache_free - Deallocate an object
  * @cachep: The cache the allocation was from.
@@ -1988,28 +1940,6 @@ void kfree (const void *objp)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_SMP
-/**
- * kfree_percpu - free previously allocated percpu memory
- * @objp: pointer returned by kmalloc_percpu.
- *
- * Don't free memory not originally allocated by kmalloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-kfree_percpu(const void *objp)
-{
-	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(p->ptrs[i]);
-	}
-}
-#endif
-
 unsigned int kmem_cache_size(kmem_cache_t *cachep)
 {
 	unsigned int objlen = cachep->objsize;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv4/af_inet.c .29880-linux-2.5.69.updated/net/ipv4/af_inet.c
--- .29880-linux-2.5.69/net/ipv4/af_inet.c	2003-05-05 12:37:14.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv4/af_inet.c	2003-05-05 17:36:25.000000000 +1000
@@ -1061,54 +1061,21 @@ static struct inet_protocol icmp_protoco
 
 static int __init init_ipv4_mibs(void)
 {
-	int i;
-
-	net_statistics[0] =
-	    kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
-	net_statistics[1] =
-	    kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
-	ip_statistics[0] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
-	ip_statistics[1] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
-	icmp_statistics[0] =
-	    kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
-	icmp_statistics[1] =
-	    kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
-	tcp_statistics[0] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
-	tcp_statistics[1] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
-	udp_statistics[0] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
-	udp_statistics[1] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
+	net_statistics[0] = kmalloc_percpu(struct linux_mib);
+	net_statistics[1] = kmalloc_percpu(struct linux_mib);
+	ip_statistics[0] = kmalloc_percpu(struct ip_mib);
+	ip_statistics[1] = kmalloc_percpu(struct ip_mib);
+	icmp_statistics[0] = kmalloc_percpu(struct icmp_mib);
+	icmp_statistics[1] = kmalloc_percpu(struct icmp_mib);
+	tcp_statistics[0] = kmalloc_percpu(struct tcp_mib);
+	tcp_statistics[1] = kmalloc_percpu(struct tcp_mib);
+	udp_statistics[0] = kmalloc_percpu(struct udp_mib);
+	udp_statistics[1] = kmalloc_percpu(struct udp_mib);
 	if (!
 	    (net_statistics[0] && net_statistics[1] && ip_statistics[0]
 	     && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
 	     && udp_statistics[0] && udp_statistics[1]))
 		return -ENOMEM;
-
-	/* Set all the per cpu copies of the mibs to zero */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(net_statistics[0], i), 0,
-			       sizeof (struct linux_mib));
-			memset(per_cpu_ptr(net_statistics[1], i), 0,
-			       sizeof (struct linux_mib));
-			memset(per_cpu_ptr(ip_statistics[0], i), 0,
-			       sizeof (struct ip_mib));
-			memset(per_cpu_ptr(ip_statistics[1], i), 0,
-			       sizeof (struct ip_mib));
-			memset(per_cpu_ptr(icmp_statistics[0], i), 0,
-			       sizeof (struct icmp_mib));
-			memset(per_cpu_ptr(icmp_statistics[1], i), 0,
-			       sizeof (struct icmp_mib));
-			memset(per_cpu_ptr(tcp_statistics[0], i), 0,
-			       sizeof (struct tcp_mib));
-			memset(per_cpu_ptr(tcp_statistics[1], i), 0,
-			       sizeof (struct tcp_mib));
-			memset(per_cpu_ptr(udp_statistics[0], i), 0,
-			       sizeof (struct udp_mib));
-			memset(per_cpu_ptr(udp_statistics[1], i), 0,
-			       sizeof (struct udp_mib));
-		}
-	}
-
 	(void) tcp_mib_init();
 
 	return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv4/route.c .29880-linux-2.5.69.updated/net/ipv4/route.c
--- .29880-linux-2.5.69/net/ipv4/route.c	2003-05-05 12:37:14.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv4/route.c	2003-05-05 17:36:25.000000000 +1000
@@ -2689,17 +2689,9 @@ int __init ip_rt_init(void)
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
-	rt_cache_stat = kmalloc_percpu(sizeof (struct rt_cache_stat),
-					GFP_KERNEL);
+	rt_cache_stat = kmalloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat) 
 		goto out_enomem1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(rt_cache_stat, i), 0,
-			       sizeof (struct rt_cache_stat));
-		}
-	}
-
 	devinet_init();
 	ip_fib_init();
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv6/af_inet6.c .29880-linux-2.5.69.updated/net/ipv6/af_inet6.c
--- .29880-linux-2.5.69/net/ipv6/af_inet6.c	2003-05-05 12:37:15.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv6/af_inet6.c	2003-05-05 17:36:25.000000000 +1000
@@ -633,28 +633,20 @@ inet6_unregister_protosw(struct inet_pro
 }
 
 int
-snmp6_mib_init(void *ptr[2], size_t mibsize)
+snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 {
 	int i;
 
 	if (ptr == NULL)
 		return -EINVAL;
 
-	ptr[0] = kmalloc_percpu(mibsize, GFP_KERNEL);
+	ptr[0] = __alloc_percpu(mibsize, mibalign);
 	if (!ptr[0])
 		goto err0;
 
-	ptr[1] = kmalloc_percpu(mibsize, GFP_KERNEL);
+	ptr[1] = __alloc_percpu(mibsize, mibalign);
 	if (!ptr[1])
 		goto err1;
-
-	/* Zero percpu version of the mibs */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(ptr[0], i), 0, mibsize);
-			memset(per_cpu_ptr(ptr[1], i), 0, mibsize);
-		}
-	}
 	return 0;
 
 err1:
@@ -676,11 +668,14 @@ snmp6_mib_free(void *ptr[2])
 
 static int __init init_ipv6_mibs(void)
 {
-	if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib)) < 0)
+	if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib),
+			   __alignof__(struct ipv6_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib)) < 0)
+	if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
+			   __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
+	if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
+			   __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
 	return 0;
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/ipv6/proc.c .29880-linux-2.5.69.updated/net/ipv6/proc.c
--- .29880-linux-2.5.69/net/ipv6/proc.c	2003-05-05 12:37:15.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/ipv6/proc.c	2003-05-05 17:36:25.000000000 +1000
@@ -226,7 +226,7 @@ int snmp6_register_dev(struct inet6_dev 
 	if (!idev || !idev->dev)
 		return -EINVAL;
 
-	if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib)) < 0)
+	if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp;
 
 #ifdef CONFIG_PROC_FS
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .29880-linux-2.5.69/net/sctp/protocol.c .29880-linux-2.5.69.updated/net/sctp/protocol.c
--- .29880-linux-2.5.69/net/sctp/protocol.c	2003-05-05 12:37:16.000000000 +1000
+++ .29880-linux-2.5.69.updated/net/sctp/protocol.c	2003-05-05 17:36:26.000000000 +1000
@@ -855,26 +855,14 @@ static int __init init_sctp_mibs(void)
 {
 	int i;
 
-	sctp_statistics[0] = kmalloc_percpu(sizeof (struct sctp_mib),
-					    GFP_KERNEL);
+	sctp_statistics[0] = kmalloc_percpu(struct sctp_mib);
 	if (!sctp_statistics[0])
 		return -ENOMEM;
-	sctp_statistics[1] = kmalloc_percpu(sizeof (struct sctp_mib),
-					    GFP_KERNEL);
+	sctp_statistics[1] = kmalloc_percpu(struct sctp_mib);
 	if (!sctp_statistics[1]) {
 		kfree_percpu(sctp_statistics[0]);
 		return -ENOMEM;
 	}
-
-	/* Zero all percpu versions of the mibs */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(sctp_statistics[0], i), 0,
-					sizeof (struct sctp_mib));
-			memset(per_cpu_ptr(sctp_statistics[1], i), 0,
-					sizeof (struct sctp_mib));
-		}
-	}
 	return 0;
 
 }

* Re: [PATCH] kmalloc_percpu
  2003-05-05  8:08 [PATCH] kmalloc_percpu Rusty Russell
@ 2003-05-05  8:47 ` Andrew Morton
  2003-05-06  0:47   ` Rusty Russell
  2003-05-06  5:07   ` Ravikiran G Thirumalai
  2003-05-07  5:51 ` Ravikiran G Thirumalai
  1 sibling, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-05  8:47 UTC (permalink / raw)
  To: Rusty Russell; +Cc: dipankar, linux-kernel

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> This is the kmalloc_percpu patch.

How does it work?  What restrictions does it have, and
what compromises were made?

+#define PERCPU_POOL_SIZE 32768

What's this?


The current implementation of kmalloc_per_cpu() turned out to be fairly
disappointing because of the number of derefs which were necessary to get at
the data in fastpaths.   How does this implementation compare?


Thanks.

* Re: [PATCH] kmalloc_percpu
  2003-05-05  8:47 ` Andrew Morton
@ 2003-05-06  0:47   ` Rusty Russell
  2003-05-06  1:52     ` Andrew Morton
  2003-05-06  5:07   ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  0:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel

In message <20030505014729.5db76f70.akpm@digeo.com> you write:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > This is the kmalloc_percpu patch.
> 
> How does it work?  What restrictions does it have, and
> what compromises were made?
> 
> +#define PERCPU_POOL_SIZE 32768
> 
> What's this?

OK.  It has a size restriction: PERCPU_POOL_SIZE is the maximum total
kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever.  This is the
main downside.  It's allocated at boot.

The __alloc_percpu allocator is extremely space-efficient, because it
does not insist on cache-line aligning everything: the overhead of
__alloc_percpu(SIZE) is sizeof(int), plus SIZE bytes (rounded up to
alignment requirements) removed from the per-cpu pool.

The allocator is fairly slow: allocations are not expected to be
thrown around like candy.
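
To illustrate the bookkeeping (my sketch, not code from the patch): the
pool is described by a single array of block sizes, negative meaning
allocated, so the per-block metadata really is one int:

	/* pcpu->size[] after one __alloc_percpu(44, 4); numbers invented,
	   assuming 1024 bytes of static per-cpu data in the kernel image. */
	size[0] = -1024;	/* static DEFINE_PER_CPU data, always block 0 */
	size[1] = -44;		/* our allocation, in use */
	size[2] = 31700;	/* rest of the 32768-byte pool, still free */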

> The current implementation of kmalloc_per_cpu() turned out to be fairly
> disappointing because of the number of derefs which were necessary to get at
> the data in fastpaths.   How does this implementation compare?

It uses the same method as the static per-cpu variables, so access is a
single addition of __per_cpu_offset (assuming the arch doesn't override
the implementation).  This is a requirement for modules to use them
(which was my aim: the other side effects are cream).

Hope that clarifies,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  0:47   ` Rusty Russell
@ 2003-05-06  1:52     ` Andrew Morton
  2003-05-06  2:11       ` David S. Miller
  2003-05-06  4:11       ` Rusty Russell
  0 siblings, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  1:52 UTC (permalink / raw)
  To: Rusty Russell; +Cc: dipankar, linux-kernel

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> > +#define PERCPU_POOL_SIZE 32768
> > 
> > What's this?
> 
> OK.  It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever.  This is the
> main downside.  It's allocated at boot.

And is subject to fragmentation.

Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
slab if it runs out?

That way, PERCPU_POOL_SIZE only needs to be sized for non-modular static
percpu data, which sounds more robust.


* Re: [PATCH] kmalloc_percpu
  2003-05-06  1:52     ` Andrew Morton
@ 2003-05-06  2:11       ` David S. Miller
  2003-05-06  4:08         ` Rusty Russell
  2003-05-06  4:11       ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  2:11 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rusty Russell, dipankar, linux-kernel

On Mon, 2003-05-05 at 18:52, Andrew Morton wrote:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> > OK.  It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever.  This is the
> > main downside.  It's allocated at boot.
> 
> And is subject to fragmentation.
> 
> Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
> slab if it runs out?

No, then you go back to things requiring multiple levels of
dereferencing.  It's hard to realloc() because you have to
freeze the whole kernel to do that properly, and that is not
simple at all.

I think the fixed size pool is perfectly reasonable.

-- 
David S. Miller <davem@redhat.com>

* Re: [PATCH] kmalloc_percpu
  2003-05-06  4:28           ` Andrew Morton
@ 2003-05-06  3:37             ` David S. Miller
  0 siblings, 0 replies; 57+ messages in thread
From: David S. Miller @ 2003-05-06  3:37 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Mon, 5 May 2003 21:28:16 -0700
   
   It's OK as long as nobody uses the feature!
   
I think this is closer to, say, allocation of kmap types
than it is to vmalloc() et al. (as you suggest).

   Ho-hum.  Can the magical constant become a __setup thing?
   
Remember that there are physical limitations, for example
on ia64, as to how big this thing can be.  So whatever any
of us think about physical limitations, we have to deal with
them anyways :-)

I think, firstly, that we should define that this isn't
something you should be doing after module_init() (i.e. your
->open() example, that's ridiculous).  Ideas on how to
enforce this are welcome.

Next, we can calculate how much per-cpu space all the modules
need.  And because we can do that, we can preallocate slots
if we wanted to in order to deal with whatever theoretical
fragmentation problems you think exist (much like how Jakub Jelinek's
prelinking works).

I personally don't know how smart it is to let random modules use
kmalloc_percpu() with impunity.  But apparently someone thinks
there is some value in that.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  4:08         ` Rusty Russell
@ 2003-05-06  3:40           ` David S. Miller
  2003-05-06  5:02             ` Andrew Morton
  2003-05-06  5:03             ` Dipankar Sarma
  2003-05-06  4:28           ` Andrew Morton
  1 sibling, 2 replies; 57+ messages in thread
From: David S. Miller @ 2003-05-06  3:40 UTC (permalink / raw)
  To: rusty; +Cc: akpm, dipankar, linux-kernel

   From: Rusty Russell <rusty@rustcorp.com.au>
   Date: Tue, 06 May 2003 14:08:27 +1000

   In message <1052187119.983.5.camel@rth.ninka.net> you write:
   > I think the fixed size pool is perfectly reasonable.
   
   Yes.  It's a tradeoff.  I think it's worth it at the moment (although
   I'll add a limited printk to __alloc_percpu if it fails).

I think you should BUG() if a module calls kmalloc_percpu() outside
of mod->init(); this is actually implementable.

Andrew's example with some module doing kmalloc_percpu() inside
of fops->open() is just ridiculous.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  2:11       ` David S. Miller
@ 2003-05-06  4:08         ` Rusty Russell
  2003-05-06  3:40           ` David S. Miller
  2003-05-06  4:28           ` Andrew Morton
  0 siblings, 2 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  4:08 UTC (permalink / raw)
  To: David S. Miller; +Cc: Andrew Morton, dipankar, linux-kernel

In message <1052187119.983.5.camel@rth.ninka.net> you write:
> On Mon, 2003-05-05 at 18:52, Andrew Morton wrote:
> > Rusty Russell <rusty@rustcorp.com.au> wrote:
> > > OK.  It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever.  This is the
> > > main downside.  It's allocated at boot.
> > 
> > And is subject to fragmentation.
> > 
> > Is it not possible to go allocate another N * PERCPU_POOL_SIZE from
> > slab if it runs out?
> 
> No, then you go back to things requiring multiple levels of
> dereferencing.

Actually, you can; my previous patch did this.  But then all CPUs have
to share one contiguous allocation, and since modern big-SMP machines
are non-uniform, you don't want that.

	http://www.kernel.org/pub/linux/kernel/people/rusty/patches/Misc/kmalloc_percpu-orig.patch.gz

> I think the fixed size pool is perfectly reasonable.

Yes.  It's a tradeoff.  I think it's worth it at the moment (although
I'll add a limited printk to __alloc_percpu if it fails).

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  1:52     ` Andrew Morton
  2003-05-06  2:11       ` David S. Miller
@ 2003-05-06  4:11       ` Rusty Russell
  1 sibling, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  4:11 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel

In message <20030505185248.3c3a478f.akpm@digeo.com> you write:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > > +#define PERCPU_POOL_SIZE 32768
> > > 
> > > What's this?
> > 
> > OK.  It has a size restriction: PERCPU_POOL_SIZE is the maximum total
> > kmalloc_percpu + static DECLARE_PER_CPU you'll get, ever.  This is the
> > main downside.  It's allocated at boot.
> 
> And is subject to fragmentation.

Absolutely.  However, we're looking at an allocation at module insert,
and maybe at each mount (if used for your fuzzy counters).  Until we
see this in practice, I don't think complicating the allocator is
worth it.

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:02             ` Andrew Morton
@ 2003-05-06  4:16               ` David S. Miller
  2003-05-06  5:48                 ` Andrew Morton
  2003-05-06  8:06               ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  4:16 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Mon, 5 May 2003 22:02:50 -0700

   "David S. Miller" <davem@redhat.com> wrote:
   > Andrew's example with some module doing kmalloc_percpu() inside
   > of fops->open() is just rediculious.
   
   crap.  Modules deal with per-device and per-mount objects.  If a module
   cannot use kmalloc_per_cpu on behalf of the primary object which it manages
   then the facility is simply not useful to modules.

Ok then.

Please address the ia64 concerns then :-)  It probably means we
have to stay with the dereferencing stuff...  at which point you
might as well use normal kmalloc() and smp_processor_id() indexing
inside of modules.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  4:08         ` Rusty Russell
  2003-05-06  3:40           ` David S. Miller
@ 2003-05-06  4:28           ` Andrew Morton
  2003-05-06  3:37             ` David S. Miller
  1 sibling, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  4:28 UTC (permalink / raw)
  To: Rusty Russell; +Cc: davem, dipankar, linux-kernel

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> > I think the fixed size pool is perfectly reasonable.
> 
> Yes.  It's a tradeoff.  I think it's worth it at the moment (although
> I'll add a limited printk to __alloc_percpu if it fails).
> 

It's OK as long as nobody uses the feature!  Once it starts to be commonly
used (say, in driver ->open() methods) then we'll get into the same problems
as with vmalloc exhaustion, vmalloc fragmentation, large physically-contig
allocations, etc.

Ho-hum.  Can the magical constant become a __setup thing?

* Re: [PATCH] kmalloc_percpu
  2003-05-06  3:40           ` David S. Miller
@ 2003-05-06  5:02             ` Andrew Morton
  2003-05-06  4:16               ` David S. Miller
  2003-05-06  8:06               ` Rusty Russell
  2003-05-06  5:03             ` Dipankar Sarma
  1 sibling, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  5:02 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
> I think you should BUG() if a module calls kmalloc_percpu() outside
> of mod->init(); this is actually implementable.
> 
> Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just ridiculous.

crap.  Modules deal with per-device and per-mount objects.  If a module
cannot use kmalloc_per_cpu on behalf of the primary object which it manages
then the facility is simply not useful to modules.

The static DEFINE_PER_CPU allocation works OK in core kernel because core
kernel does _not_ use per-instance objects.  But modules do.

A case in point, which Rusty has twice mentioned, is the three per-mount
fuzzy counters in the ext2 superblock.  And lo, ext2 cannot use the code in
this patch, because people want to scale to 4000 mounts.
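
(Rough arithmetic, assuming three 4-byte counters per mount: 4000 mounts
x 3 counters x 4 bytes = 48000 bytes of per-cpu data, which already
overflows the entire 32768-byte pool before anyone else gets a byte.)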

In those very rare cases where a module wants allocation to be performed at
module_init()-time (presumably global stats counters), it can use
DEFINE_PER_CPU, so we should just not export kmalloc_per_cpu() to modules at
all.



* Re: [PATCH] kmalloc_percpu
  2003-05-06  3:40           ` David S. Miller
  2003-05-06  5:02             ` Andrew Morton
@ 2003-05-06  5:03             ` Dipankar Sarma
  1 sibling, 0 replies; 57+ messages in thread
From: Dipankar Sarma @ 2003-05-06  5:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, akpm, linux-kernel

On Mon, May 05, 2003 at 08:40:02PM -0700, David S. Miller wrote:
> I think you should BUG() if a module calls kmalloc_percpu() outside
> of mod->init(); this is actually implementable.
> 
> Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just ridiculous.

The disk stats are already per-cpu. So, what happens when you offline/online
a disk?  How do you allocate per-cpu memory during that?

Thanks
Dipankar

* Re: [PATCH] kmalloc_percpu
  2003-05-05  8:47 ` Andrew Morton
  2003-05-06  0:47   ` Rusty Russell
@ 2003-05-06  5:07   ` Ravikiran G Thirumalai
  2003-05-06  8:03     ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: Ravikiran G Thirumalai @ 2003-05-06  5:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Rusty Russell, dipankar, linux-kernel

On Mon, May 05, 2003 at 08:46:58AM +0000, Andrew Morton wrote:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > This is the kmalloc_percpu patch.
> 
> How does it work?  What restrictions does it have, and
> what compromises were made?
> 
> +#define PERCPU_POOL_SIZE 32768
> 
> What's this?
> 
> 
> The current implementation of kmalloc_per_cpu() turned out to be fairly
> disappointing because of the number of derefs which were necessary to get at
> the data in fastpaths.   How does this implementation compare?
>
Andrew,
Here is a comparison of kmalloc_percpu techniques as I see it,

Current Implementation:
1. Two dereferences to get to the per-cpu data 
2. Allocates for cpu_possible cpus only, and can deal with sparse cpu nos
 
Rusty's Implementation
1. One extra memory reference (__per_cpu_offset) 
2. allocates for NR_CPUS and probably breaks with sparse cpu nos?
3. Let you do per-cpu data in modules
4. fragmentation

The simpler patch I mailed you sometime back,
1. Minimal dereference overhead, offsets to per-cpu data calculated at 
   compile time
2. allocates for NR_CPUS and problems with sparse cpu nos
3. Very Simple.

My guess is that, performance-wise, Rusty's implementation and the
simpler implementation of kmalloc_percpu will be comparable.  (I'll run
some tests to compare them and post them later.)  I am including the
simpler kmalloc_percpu patch which I'd mailed to you earlier.

Thanks,
Kiran 

diff -ruN -X dontdiff linux-2.5.65/include/linux/percpu.h kmalloc-new-2.5.65/include/linux/percpu.h
--- linux-2.5.65/include/linux/percpu.h	Tue Mar 18 03:14:43 2003
+++ kmalloc-new-2.5.65/include/linux/percpu.h	Wed Mar 19 17:18:59 2003
@@ -9,22 +9,14 @@
 #define put_cpu_var(var) preempt_enable()
 
 #ifdef CONFIG_SMP
-
-struct percpu_data {
-	void *ptrs[NR_CPUS];
-	void *blkp;
-};
-
 /* 
  * Use this to get to a cpu's version of the per-cpu object allocated using
  * kmalloc_percpu.  If you want to get "this cpu's version", maybe you want
  * to use get_cpu_ptr... 
  */ 
 #define per_cpu_ptr(ptr, cpu)                   \
-({                                              \
-        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	\
-})
+        ((__typeof__(ptr)) 			\
+		(RELOC_HIDE(ptr, ALIGN(sizeof (*ptr), SMP_CACHE_BYTES)*cpu))) 
 
 extern void *kmalloc_percpu(size_t size, int flags);
 extern void kfree_percpu(const void *);
diff -ruN -X dontdiff linux-2.5.65/mm/slab.c kmalloc-new-2.5.65/mm/slab.c
--- linux-2.5.65/mm/slab.c	Tue Mar 18 03:14:38 2003
+++ kmalloc-new-2.5.65/mm/slab.c	Wed Mar 19 16:32:33 2003
@@ -1951,31 +1951,7 @@
 void *
 kmalloc_percpu(size_t size, int flags)
 {
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
-	if (!pdata)
-		return NULL;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		pdata->ptrs[i] = kmalloc(size, flags);
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-	}
-
-	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
-	kfree(pdata);
-	return NULL;
+	return kmalloc(ALIGN(size, SMP_CACHE_BYTES)*NR_CPUS, flags);
 }
 #endif
 
@@ -2028,14 +2004,7 @@
 void
 kfree_percpu(const void *objp)
 {
-	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(p->ptrs[i]);
-	}
+	kfree(objp);
 }
 #endif
 

* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:48                 ` Andrew Morton
@ 2003-05-06  5:35                   ` David S. Miller
  2003-05-06  6:55                     ` Andrew Morton
  2003-05-06  6:42                   ` Andrew Morton
  1 sibling, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  5:35 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Mon, 5 May 2003 22:48:15 -0700

   I think so.  So we'd end up with:
   
   - DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
     pool.
   
   - DEFINE_PER_CPU in modules uses the 32k pool as well (core kernel does the
     allocation).
   
   - kmalloc_per_cpu() is unavailable to modules (it ain't exported).
   
   AFAICT the only thing which will break is sctp, which needs a trivial
   conversion to DEFINE_PER_CPU.
   
Your grep is faulty, we're using kmalloc_percpu() in ipv6 for per-cpu
and per-device icmp stats.

Your solution doesn't work in that case.  Also ipv4 will have the same
problems if we make that modular at some point.

I also don't see how this fits in for your ext2 fuzzy counter stuff.
It isn't a "module" for most people, I can't even remember if I've
ever built ext2 non-statically.  :-)  It almost appears as if you
are suggesting kmalloc_percpu() is not usable at all.

So there we have it, there are a total of 3 users of kmalloc_percpu()
(ipv4/ipv6/diskstats) so let's decide if it's going to continue to
live longer or not before there are any more. :-)

* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:42                   ` Andrew Morton
@ 2003-05-06  5:39                     ` David S. Miller
  2003-05-06  6:57                       ` Andrew Morton
  0 siblings, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  5:39 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Mon, 5 May 2003 23:42:48 -0700
   
   Can't think of anything very clever there, except to go and un-percpuify the
   disk stats.  I think that's best, really - disk requests only come in at 100
   to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
   
Use some spinlock we already have to be holding during the
counter bumps.

Frankly, these things don't need to be 100% accurate.  Using
a new spinlock or an atomic_t for this seems ridiculous.

* Re: [PATCH] kmalloc_percpu
  2003-05-06  4:16               ` David S. Miller
@ 2003-05-06  5:48                 ` Andrew Morton
  2003-05-06  5:35                   ` David S. Miller
  2003-05-06  6:42                   ` Andrew Morton
  0 siblings, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  5:48 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
> 
> Please address the ia64 concerns then :-)  It probably means we
> have to stay with the dereferencing stuff...  at which point you
> might as well use normal kmalloc() and smp_processor_id() indexing
> inside of modules.

I think so.  So we'd end up with:

- DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
  pool.

- DEFINE_PER_CPU in modules uses the 32k pool as well (core kernel does the
  allocation).

- kmalloc_per_cpu() is unavailable to modules (it ain't exported).

AFAICT the only thing which will break is sctp, which needs a trivial
conversion to DEFINE_PER_CPU.


* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:55                     ` Andrew Morton
@ 2003-05-06  5:57                       ` David S. Miller
  2003-05-06  7:22                         ` Andrew Morton
  2003-05-06  7:20                       ` Dipankar Sarma
  2003-05-06  8:28                       ` Rusty Russell
  2 siblings, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  5:57 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Mon, 5 May 2003 23:55:49 -0700

   How about we leave kmalloc_per_cpu as-is (it uses kmalloc()), and
   only apply Rusty's new code to DEFINE_PER_CPU?
   
I propose to make it use kmalloc() all the time.

It simply doesn't make sense to use a pool given what you've
shown me.  If we've decided that any limit whatsoever is bad,
why impose any limit at all?  Smells like bad design frankly.

Normal DEFINE_PER_CPU() needs no pool; therefore we don't need
a pool for anything.

Make kmalloc_per_cpu() merely a convenience macro, made up of existing
non-percpu primitives.
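
Something along these lines would do; this is only a sketch, essentially
Kiran's simpler patch from earlier in the thread, and it assumes the
size passed in equals sizeof(*ptr):

	/* Sketch: one cacheline-padded copy per cpu, no special pool. */
	#define kmalloc_percpu(size, flags) \
		kmalloc(ALIGN((size), SMP_CACHE_BYTES) * NR_CPUS, (flags))
	#define per_cpu_ptr(ptr, cpu) \
		((__typeof__(ptr))((char *)(ptr) + \
			ALIGN(sizeof(*(ptr)), SMP_CACHE_BYTES) * (cpu)))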

* Re: [PATCH] kmalloc_percpu
  2003-05-06  7:22                         ` Andrew Morton
@ 2003-05-06  6:15                           ` David S. Miller
  2003-05-06  7:34                             ` Andrew Morton
  0 siblings, 1 reply; 57+ messages in thread
From: David S. Miller @ 2003-05-06  6:15 UTC (permalink / raw)
  To: akpm; +Cc: rusty, dipankar, linux-kernel

   From: Andrew Morton <akpm@digeo.com>
   Date: Tue, 6 May 2003 00:22:29 -0700

   "David S. Miller" <davem@redhat.com> wrote:
   >
   > Make kmalloc_per_cpu() merely a convenience macro, made up of existing
   > non-percpu primitives.
   
   I think we're agreeing here.
   
As just pointed out by dipankar, the only issue is NUMA...
so it has to be something more sophisticated than simply
kmalloc()[smp_processor_id];

* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:48                 ` Andrew Morton
  2003-05-06  5:35                   ` David S. Miller
@ 2003-05-06  6:42                   ` Andrew Morton
  2003-05-06  5:39                     ` David S. Miller
  1 sibling, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  6:42 UTC (permalink / raw)
  To: davem, rusty, dipankar, linux-kernel

Andrew Morton <akpm@digeo.com> wrote:
>
> - DEFINE_PER_CPU and kmalloc_percpu() work in core kernel, and use the 32k
>   pool.

except sizeof(struct disk_stats)=44, so we run out of percpu space at 744
disks.
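
(For the arithmetic: 32768 / 44 = 744, ignoring allocator overhead and
the static per-cpu data that already lives in the pool.)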

Can't think of anything very clever there, except to go and un-percpuify the
disk stats.  I think that's best, really - disk requests only come in at 100
to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.


* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:35                   ` David S. Miller
@ 2003-05-06  6:55                     ` Andrew Morton
  2003-05-06  5:57                       ` David S. Miller
                                         ` (2 more replies)
  0 siblings, 3 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  6:55 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
> Your grep is faulty, we're using kmalloc_percpu() in ipv6 for per-cpu
> and per-device icmp stats.

My grep is fine.  It's the brain which failed ;)

> Your solution doesn't work in that case.  Also ipv4 will have the same
> problems if we make that modular at some point.
> 
> I also don't see how this fits in for your ext2 fuzzy counter stuff.

Well it doesn't fit.  With the proposals which we are discussing here, ext2
(and, we hope, soon ext3) would continue to use foo[NR_CPUS].

> It isn't a "module" for most people, I can't even remember if I've
> ever built ext2 non-statically.  :-)  It almost appears as if you
> are suggesting kmalloc_percpu() is not usable at all.

kmalloc_per_cpu() is useful at present.  But with Rusty's patch it becomes
less useful.

Aside: I rather think that 4000 filesystems isn't sane.  4000 disks is, but I
find it hard to believe that people will have a separate fs on each one...

> So there we have it, there are a total of 3 users of kmalloc_percpu()
> (ipv4/ipv6/diskstats) so let's decide if it's going to continue to
> live longer or not before there are any more. :-)

How about we leave kmalloc_per_cpu as-is (it uses kmalloc()), and only apply
Rusty's new code to DEFINE_PER_CPU?


* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:39                     ` David S. Miller
@ 2003-05-06  6:57                       ` Andrew Morton
  2003-05-06  7:25                         ` Jens Axboe
  2003-05-06 16:05                         ` Bryan O'Sullivan
  0 siblings, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  6:57 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
>    From: Andrew Morton <akpm@digeo.com>
>    Date: Mon, 5 May 2003 23:42:48 -0700
>    
>    Can't think of anything very clever there, except to go and un-percpuify the
>    disk stats.  I think that's best, really - disk requests only come in at 100
>    to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
>    
> Use some spinlock we already have to be holding during the
> counter bumps.

Last time we looked at that, q->lock was already held in almost all the right
places so yes, that'd work.

> Frankly, these things don't need to be 100% accurate.  Using
> a new spinlock or an atomic_t for this seems ridiculous.

The disk_stats structure has an "in flight" member.  If we don't have proper
locking around that, disks will appear to have -3 requests in flight for all
time, which would look a tad odd.



* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:55                     ` Andrew Morton
  2003-05-06  5:57                       ` David S. Miller
@ 2003-05-06  7:20                       ` Dipankar Sarma
  2003-05-06  8:28                       ` Rusty Russell
  2 siblings, 0 replies; 57+ messages in thread
From: Dipankar Sarma @ 2003-05-06  7:20 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, rusty, linux-kernel

On Mon, May 05, 2003 at 11:55:49PM -0700, Andrew Morton wrote:
> > I also don't see how this fits in for your ext2 fuzzy counter stuff.
> 
> Well it doesn't fit.  With the proposals which we are discussing here, ext2
> (and, we hope, soon ext3) would continue to use foo[NR_CPUS].

And that is not what we want to do in a NUMA box.

Thanks
Dipankar

* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:57                       ` David S. Miller
@ 2003-05-06  7:22                         ` Andrew Morton
  2003-05-06  6:15                           ` David S. Miller
  0 siblings, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  7:22 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
> Make kmalloc_per_cpu() merely a convenience macro, made up of existing
> non-percpu primitives.

I think we're agreeing here.

The current kmalloc_percpu() is a wrapper around kmalloc.  That seems OK to
me.

What we _do_ want to solve is the problem that DEFINE_PER_CPU() does not work
in modules.  Rusty's patch (reworked to not alter kmalloc_percpu) would suit
that requirement.


(kiran has a new version of kmalloc_percpu() which may be faster than the
current one, but for the purposes of this discussion it's equivalent).



* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:57                       ` Andrew Morton
@ 2003-05-06  7:25                         ` Jens Axboe
  2003-05-06 10:41                           ` Ingo Oeser
  2003-05-06 16:05                         ` Bryan O'Sullivan
  1 sibling, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2003-05-06  7:25 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, rusty, dipankar, linux-kernel

On Mon, May 05 2003, Andrew Morton wrote:
> "David S. Miller" <davem@redhat.com> wrote:
> >
> >    From: Andrew Morton <akpm@digeo.com>
> >    Date: Mon, 5 May 2003 23:42:48 -0700
> >    
> >    Can't think of anything very clever there, except to go and un-percpuify the
> >    disk stats.  I think that's best, really - disk requests only come in at 100
> >    to 200 per second - atomic_t's or int-plus-per-disk-spinlock will be fine.
> >    
> > Use some spinlock we already have to be holding during the
> > counter bumps.
> 
> Last time we looked at that, q->lock was already held in almost all the right
> places so yes, that'd work.

As far as I can see, the queue lock _is_ held in all the right spots. At
least where it matters, when adding new samples.

> > Frankly, these things don't need to be 100% accurate.  Using
> > a new spinlock or an atomic_t for this seems ridiculous.
> 
> The disk_stats structure has an "in flight" member.  If we don't have proper
> locking around that, disks will appear to have -3 requests in flight for all
> time, which would look a tad odd.

So check for < 0 in flight? I totally agree with davem here.
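
A sketch of that: keep the unlocked per-cpu counters and clamp the racy
sum when reporting.  disk->dkstats and per_cpu_ptr() are from the patch
in this thread; the in_flight field name is illustrative.

	static unsigned int disk_in_flight(struct gendisk *disk)
	{
		int i, sum = 0;

		for (i = 0; i < NR_CPUS; i++)
			sum += per_cpu_ptr(disk->dkstats, i)->in_flight;

		/* a transient negative is a reporting artifact: hide it */
		return sum < 0 ? 0 : sum;
	}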

-- 
Jens Axboe



* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:15                           ` David S. Miller
@ 2003-05-06  7:34                             ` Andrew Morton
  2003-05-06  8:42                               ` William Lee Irwin III
  2003-05-06 14:38                               ` Martin J. Bligh
  0 siblings, 2 replies; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  7:34 UTC (permalink / raw)
  To: David S. Miller; +Cc: rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>
> As just pointed out by dipankar the only issue is NUMA...
> so it has to be something more sophisticated than simply
> kmalloc()[smp_processor_id];

The proposed patch doesn't do anything about that either.

+	ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);

So yes, we need an api which could be extended to use node-affine memory at
some time in the future.  I think we have that.



* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:07   ` Ravikiran G Thirumalai
@ 2003-05-06  8:03     ` Rusty Russell
  2003-05-06  9:23       ` David S. Miller
  2003-05-06  9:34       ` Ravikiran G Thirumalai
  0 siblings, 2 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  8:03 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: dipankar, linux-kernel, akpm

In message <20030506050744.GA29352@in.ibm.com> you write:
> On Mon, May 05, 2003 at 08:46:58AM +0000, Andrew Morton wrote:
> Andrew,
> Here is a comparison of kmalloc_percpu techniques as I see it,
> 
> Current Implementation:
> 1. Two dereferences to get to the per-cpu data 
> 2. Allocates for cpu_possible cpus only, and can deal with sparse cpu nos
>  
> Rusty's Implementation
> 1. One extra memory reference (__per_cpu_offset) 
> 2. allocates for NR_CPUS and probably breaks with sparse cpu nos?
> 3. Let you do per-cpu data in modules
> 4. fragmentation

And #3 is, in fact, the one I care about.  The extra memory reference
is already probably cache-hot (all static per-cpu accesses use it), and might be
in a register on some archs.

Doesn't break with sparse CPU #s, but yes, it is inefficient.

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
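
To make the comparison concrete, here is the access path in each scheme,
condensed from the patches in this thread (not verbatim kernel code):

	/* Current allocator: a table of NR_CPUS pointers, so each access
	 * is two dependent loads -- fetch ptrs[cpu], then the object. */
	struct percpu_data {
		void *ptrs[NR_CPUS];
		void *blkp;
	};
	#define per_cpu_ptr_table(ptr, cpu) \
		(((struct percpu_data *)~(unsigned long)(ptr))->ptrs[(cpu)])

	/* Rusty's allocator: each cpu's copy sits at a fixed offset from
	 * the returned pointer, so one extra load of __per_cpu_offset[cpu]
	 * suffices -- and that array is shared with all static per-cpu
	 * accesses, hence probably cache-hot. */
	#define per_cpu_ptr_offset(ptr, cpu) \
		((__typeof__(ptr))((char *)(ptr) + __per_cpu_offset[(cpu)]))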


* Re: [PATCH] kmalloc_percpu
  2003-05-06  5:02             ` Andrew Morton
  2003-05-06  4:16               ` David S. Miller
@ 2003-05-06  8:06               ` Rusty Russell
  1 sibling, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  8:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, rusty, dipankar, linux-kernel

In message <20030505220250.213417f6.akpm@digeo.com> you write:
> "David S. Miller" <davem@redhat.com> wrote:
> >
> > I think you should BUG() if a module calls kmalloc_percpu() outside
> > of mod->init(), this is actually implementable.
> > 
> > Andrew's example with some module doing kmalloc_percpu() inside
> of fops->open() is just ridiculous.
> 
> crap.  Modules deal with per-device and per-mount objects.  If a module
> cannot use kmalloc_percpu() on behalf of the primary object which it manages
> then the facility is simply not useful to modules.

No, the allocator is dog-slow.  If you want to use it on open, you
need a new allocator, not one that does a linear search for blocks.

> A case in point, which Rusty has twice mentioned, is the three per-mount
> fuzzy counters in the ext2 superblock.  And lo, ext2 cannot use the code in
> this patch, because people want to scale to 4000 mounts.

Well, 4000 will fit on 32-bit archs, easily.  Yes, it's a limit.

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:55                     ` Andrew Morton
  2003-05-06  5:57                       ` David S. Miller
  2003-05-06  7:20                       ` Dipankar Sarma
@ 2003-05-06  8:28                       ` Rusty Russell
  2003-05-06  8:47                         ` Andrew Morton
  2003-05-06 14:41                         ` Martin J. Bligh
  2 siblings, 2 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-06  8:28 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel

In message <20030505235549.5df75866.akpm@digeo.com> you write:
> > It isn't a "module" for most people, I can't even remember if I've
> > ever built ext2 non-statically.  :-)  It almost appears as if you
> > are suggesting kmalloc_percpu() is not usable at all.
> 
> kmalloc_percpu() is useful at present.  But with Rusty's patch it becomes
> less useful.

It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
is fast, space-efficient and numa-aware, since the code is needed
anyway.

How about a compromise like the following (scaled with mem)?
Untested, but you get the idea...

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: kmalloc_percpu to use same percpu operators
Author: Rusty Russell
Status: Untested

D: By overallocating the per-cpu data at boot, we can make quite an
D: efficient allocator, and then use it to support per-cpu data in
D: modules (next patch).

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/asm-generic/percpu.h .26261-linux-2.5.69.updated/include/asm-generic/percpu.h
--- .26261-linux-2.5.69/include/asm-generic/percpu.h	2003-01-02 12:32:47.000000000 +1100
+++ .26261-linux-2.5.69.updated/include/asm-generic/percpu.h	2003-05-06 18:14:22.000000000 +1000
@@ -2,37 +2,11 @@
 #define _ASM_GENERIC_PERCPU_H_
 #include <linux/compiler.h>
 
-#define __GENERIC_PER_CPU
+/* Some archs may want to keep __per_cpu_offset for this CPU in a register,
+   or do their own allocation. */
 #ifdef CONFIG_SMP
-
-extern unsigned long __per_cpu_offset[NR_CPUS];
-
-/* Separate out the type, so (int[3], foo) works. */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
-    __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
-
-/* var is in discarded region: offset to particular copy we want */
-#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
-
-#else /* ! SMP */
-
-/* Can't define per-cpu variables in modules.  Sorry --RR */
-#ifndef MODULE
-#define DEFINE_PER_CPU(type, name) \
-    __typeof__(type) name##__per_cpu
-#endif
-
-#define per_cpu(var, cpu)			((void)cpu, var##__per_cpu)
-#define __get_cpu_var(var)			var##__per_cpu
-
-#endif	/* SMP */
-
-#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
-
-#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
-#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
-
+#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
+#define __NEED_SETUP_PER_CPU_AREAS
+#endif /* SMP */
 #endif /* _ASM_GENERIC_PERCPU_H_ */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/linux/genhd.h .26261-linux-2.5.69.updated/include/linux/genhd.h
--- .26261-linux-2.5.69/include/linux/genhd.h	2003-05-05 12:37:12.000000000 +1000
+++ .26261-linux-2.5.69.updated/include/linux/genhd.h	2003-05-06 18:14:22.000000000 +1000
@@ -160,10 +160,9 @@ static inline void disk_stat_set_all(str
 #ifdef  CONFIG_SMP
 static inline int init_disk_stats(struct gendisk *disk)
 {
-	disk->dkstats = kmalloc_percpu(sizeof (struct disk_stats), GFP_KERNEL);
+	disk->dkstats = kmalloc_percpu(struct disk_stats);
 	if (!disk->dkstats)
 		return 0;
-	disk_stat_set_all(disk, 0);
 	return 1;
 }
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/linux/percpu.h .26261-linux-2.5.69.updated/include/linux/percpu.h
--- .26261-linux-2.5.69/include/linux/percpu.h	2003-02-07 19:20:01.000000000 +1100
+++ .26261-linux-2.5.69.updated/include/linux/percpu.h	2003-05-06 18:26:53.000000000 +1000
@@ -1,71 +1,79 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
-#include <linux/spinlock.h> /* For preempt_disable() */
-#include <linux/slab.h> /* For kmalloc_percpu() */
+#include <linux/preempt.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc() */
+#include <linux/cache.h>
+#include <linux/string.h>
+#include <asm/bug.h>
 #include <asm/percpu.h>
 
-/* Must be an lvalue. */
+/* For variables declared with DECLARE_PER_CPU()/DEFINE_PER_CPU(). */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
+/* Also, per_cpu(var, cpu) to get another cpu's value. */
+
+/* For ptrs allocated with kmalloc_percpu */
+#define get_cpu_ptr(ptr) ({ preempt_disable(); __get_cpu_ptr(ptr); })
+#define put_cpu_ptr(ptr) preempt_enable()
+/* Also, per_cpu_ptr(ptr, cpu) to get another cpu's value. */
 
 #ifdef CONFIG_SMP
 
-struct percpu_data {
-	void *ptrs[NR_CPUS];
-	void *blkp;
-};
+/* __alloc_percpu zeros memory for every cpu, as a convenience. */
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void kfree_percpu(const void *);
 
-/* 
- * Use this to get to a cpu's version of the per-cpu object allocated using
- * kmalloc_percpu.  If you want to get "this cpu's version", maybe you want
- * to use get_cpu_ptr... 
- */ 
-#define per_cpu_ptr(ptr, cpu)                   \
-({                                              \
-        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	\
-})
+extern unsigned long __per_cpu_offset[NR_CPUS];
 
-extern void *kmalloc_percpu(size_t size, int flags);
-extern void kfree_percpu(const void *);
-extern void kmalloc_percpu_init(void);
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
 
-#else /* CONFIG_SMP */
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define per_cpu_ptr(ptr, cpu) ((__typeof__(ptr))(RELOC_HIDE(ptr, __per_cpu_offset[cpu])))
 
-#define per_cpu_ptr(ptr, cpu) (ptr)
+extern void setup_per_cpu_areas(void);
+#else /* !CONFIG_SMP */
 
-static inline void *kmalloc_percpu(size_t size, int flags)
+/* Can't define per-cpu variables in modules.  Sorry --RR */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+    __typeof__(type) name##__per_cpu
+#endif
+
+#define per_cpu(var, cpu)			((void)(cpu), var##__per_cpu)
+#define __get_cpu_var(var)			var##__per_cpu
+#define per_cpu_ptr(ptr, cpu)			((void)(cpu), (ptr))
+#define __get_cpu_ptr(ptr)			(ptr)
+
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
-	return(kmalloc(size, flags));
+	void *ret;
+	/* kmalloc always cacheline aligns. */
+	BUG_ON(align > SMP_CACHE_BYTES);
+	ret = kmalloc(size, GFP_KERNEL);
+	if (ret)
+		memset(ret, 0, size);
+	return ret;
 }
 static inline void kfree_percpu(const void *ptr)
 {	
 	kfree(ptr);
 }
-static inline void kmalloc_percpu_init(void) { }
 
+static inline void setup_per_cpu_areas(void) { }
 #endif /* CONFIG_SMP */
 
-/* 
- * Use these with kmalloc_percpu. If
- * 1. You want to operate on memory allocated by kmalloc_percpu (dereference
- *    and read/modify/write)  AND 
- * 2. You want "this cpu's version" of the object AND 
- * 3. You want to do this safely since:
- *    a. On multiprocessors, you don't want to switch between cpus after 
- *    you've read the current processor id due to preemption -- this would 
- *    take away the implicit  advantage to not have any kind of traditional 
- *    serialization for per-cpu data
- *    b. On uniprocessors, you don't want another kernel thread messing
- *    up with the same per-cpu data due to preemption
- *    
- * So, Use get_cpu_ptr to disable preemption and get pointer to the 
- * local cpu version of the per-cpu object. Use put_cpu_ptr to enable
- * preemption.  Operations on per-cpu data between get_ and put_ is
- * then considered to be safe. And ofcourse, "Thou shalt not sleep between 
- * get_cpu_ptr and put_cpu_ptr"
- */
-#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
-#define put_cpu_ptr(ptr) put_cpu()
+/* Simple wrapper for the common case.  Zeros memory. */
+#define kmalloc_percpu(type) \
+	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
 
 #endif /* __LINUX_PERCPU_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/include/net/ipv6.h .26261-linux-2.5.69.updated/include/net/ipv6.h
--- .26261-linux-2.5.69/include/net/ipv6.h	2003-05-05 12:37:12.000000000 +1000
+++ .26261-linux-2.5.69.updated/include/net/ipv6.h	2003-05-06 18:14:22.000000000 +1000
@@ -145,7 +145,7 @@ extern atomic_t			inet6_sock_nr;
 
 int snmp6_register_dev(struct inet6_dev *idev);
 int snmp6_unregister_dev(struct inet6_dev *idev);
-int snmp6_mib_init(void *ptr[2], size_t mibsize);
+int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign);
 void snmp6_mib_free(void *ptr[2]);
 
 struct ip6_ra_chain
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/init/main.c .26261-linux-2.5.69.updated/init/main.c
--- .26261-linux-2.5.69/init/main.c	2003-05-05 12:37:13.000000000 +1000
+++ .26261-linux-2.5.69.updated/init/main.c	2003-05-06 18:14:22.000000000 +1000
@@ -301,35 +301,10 @@ static void __init smp_init(void)
 #define smp_init()	do { } while (0)
 #endif
 
-static inline void setup_per_cpu_areas(void) { }
 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
 
 #else
 
-#ifdef __GENERIC_PER_CPU
-unsigned long __per_cpu_offset[NR_CPUS];
-
-static void __init setup_per_cpu_areas(void)
-{
-	unsigned long size, i;
-	char *ptr;
-	/* Created by linker magic */
-	extern char __per_cpu_start[], __per_cpu_end[];
-
-	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-	if (!size)
-		return;
-
-	ptr = alloc_bootmem(size * NR_CPUS);
-
-	for (i = 0; i < NR_CPUS; i++, ptr += size) {
-		__per_cpu_offset[i] = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, size);
-	}
-}
-#endif /* !__GENERIC_PER_CPU */
-
 /* Called by boot processor to activate the rest. */
 static void __init smp_init(void)
 {
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/kernel/ksyms.c .26261-linux-2.5.69.updated/kernel/ksyms.c
--- .26261-linux-2.5.69/kernel/ksyms.c	2003-05-05 12:37:13.000000000 +1000
+++ .26261-linux-2.5.69.updated/kernel/ksyms.c	2003-05-06 18:14:22.000000000 +1000
@@ -99,7 +99,7 @@ EXPORT_SYMBOL(remove_shrinker);
 EXPORT_SYMBOL(kmalloc);
 EXPORT_SYMBOL(kfree);
 #ifdef CONFIG_SMP
-EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(__alloc_percpu);
 EXPORT_SYMBOL(kfree_percpu);
 EXPORT_SYMBOL(percpu_counter_mod);
 #endif
@@ -607,7 +607,7 @@ EXPORT_SYMBOL(init_thread_union);
 EXPORT_SYMBOL(tasklist_lock);
 EXPORT_SYMBOL(find_task_by_pid);
 EXPORT_SYMBOL(next_thread);
-#if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
+#if defined(CONFIG_SMP)
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/Makefile .26261-linux-2.5.69.updated/mm/Makefile
--- .26261-linux-2.5.69/mm/Makefile	2003-02-11 14:26:20.000000000 +1100
+++ .26261-linux-2.5.69.updated/mm/Makefile	2003-05-06 18:14:22.000000000 +1000
@@ -12,3 +12,4 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SMP)	+= percpu.o
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/percpu.c .26261-linux-2.5.69.updated/mm/percpu.c
--- .26261-linux-2.5.69/mm/percpu.c	1970-01-01 10:00:00.000000000 +1000
+++ .26261-linux-2.5.69.updated/mm/percpu.c	2003-05-06 18:26:34.000000000 +1000
@@ -0,0 +1,217 @@
+/*
+ * Dynamic per-cpu allocation.
+ * Originally by Dipankar Sarma <dipankar@in.ibm.com>
+ * This version (C) 2003 Rusty Russell, IBM Corporation.
+ */
+
+/* Simple allocator: we don't stress it hard, but do want it
+   fairly space-efficient. */
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/semaphore.h>
+
+static DECLARE_MUTEX(pcpu_lock);
+
+struct pcpu_block
+{
+	/* Number of blocks used and allocated. */
+	unsigned short num_used, num_allocated;
+
+	/* Size of each block.  -ve means used. */
+	int size[0];
+};
+static struct pcpu_block *pcpu; /* = NULL */
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+/* Splits a block into two.  Reallocs pcpu if necessary. */
+static int split_block(unsigned int i, unsigned short size)
+{
+	/* Reallocation required? */
+	if (pcpu->num_used + 1 > pcpu->num_allocated) {
+		struct pcpu_block *new;
+
+		new = kmalloc(sizeof(*pcpu)
+			      + sizeof(pcpu->size[0]) * pcpu->num_allocated*2,
+			      GFP_KERNEL);
+		if (!new)
+			return 0;
+		new->num_used = pcpu->num_used;
+		new->num_allocated = pcpu->num_allocated * 2;
+		memcpy(new->size, pcpu->size,
+		       sizeof(pcpu->size[0])*pcpu->num_used);
+		kfree(pcpu);
+		pcpu = new;
+	}
+
+	/* Insert a new subblock */
+	memmove(&pcpu->size[i+1], &pcpu->size[i],
+		sizeof(pcpu->size[0]) * (pcpu->num_used - i));
+	pcpu->num_used++;
+
+	pcpu->size[i+1] -= size;
+	pcpu->size[i] = size;
+	return 1;
+}
+
+static inline unsigned int abs(int val)
+{
+	if (val < 0)
+		return -val;
+	return val;
+}
+
+static inline void zero_all(void *pcpuptr, unsigned int size)
+{
+	unsigned int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		memset(per_cpu_ptr(pcpuptr, i), 0, size);
+}
+
+static unsigned long pool_size;
+
+void *__alloc_percpu(size_t size, size_t align)
+{
+	unsigned long extra;
+	unsigned int i;
+	void *ptr;
+
+	BUG_ON(align > SMP_CACHE_BYTES);
+	BUG_ON(size > pool_size/2);
+
+	down(&pcpu_lock);
+	ptr = __per_cpu_start;
+	for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+		/* Extra for alignment requirement. */
+		extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+
+		/* Allocated or not large enough? */
+		if (pcpu->size[i] < 0 || pcpu->size[i] < extra + size)
+			continue;
+
+		/* Transfer extra to previous block. */
+		if (pcpu->size[i-1] < 0)
+			pcpu->size[i-1] -= extra;
+		else
+			pcpu->size[i-1] += extra;
+		pcpu->size[i] -= extra;
+		ptr += extra;
+
+		/* Split block if warranted */
+		if (pcpu->size[i] - size > sizeof(unsigned long))
+			if (!split_block(i, size))
+				break;
+
+		/* Mark allocated */
+		pcpu->size[i] = -pcpu->size[i];
+		zero_all(ptr, size);
+		goto out;
+	}
+	ptr = NULL;
+ out:
+	up(&pcpu_lock);
+	return ptr;
+}
+
+void kfree_percpu(const void *freeme)
+{
+	unsigned int i;
+	void *ptr = __per_cpu_start;
+
+	down(&pcpu_lock);
+	for (i = 0; i < pcpu->num_used; ptr += abs(pcpu->size[i]), i++) {
+		if (ptr == freeme) {
+			/* Double free? */
+			BUG_ON(pcpu->size[i] > 0);
+			/* Block 0 is for non-dynamic per-cpu data. */
+			BUG_ON(i == 0);
+			pcpu->size[i] = -pcpu->size[i];
+			goto merge;
+		}
+	}
+	BUG();
+
+ merge:
+	/* Merge with previous? */
+	if (pcpu->size[i-1] >= 0) {
+		pcpu->size[i-1] += pcpu->size[i];
+		pcpu->num_used--;
+		memmove(&pcpu->size[i], &pcpu->size[i+1],
+			(pcpu->num_used - i) * sizeof(pcpu->size[0]));
+		i--;
+	}
+	/* Merge with next? */
+	if (i+1 < pcpu->num_used && pcpu->size[i+1] >= 0) {
+		pcpu->size[i] += pcpu->size[i+1];
+		pcpu->num_used--;
+		memmove(&pcpu->size[i+1], &pcpu->size[i+2],
+			(pcpu->num_used - (i+1)) * sizeof(pcpu->size[0]));
+	}
+
+	/* There's always one block: the core kernel one. */
+	BUG_ON(pcpu->num_used == 0);
+	up(&pcpu_lock);
+}
+
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+
+#define PERCPU_INIT_BLOCKS 4
+
+#ifdef __NEED_SETUP_PER_CPU_AREAS
+/* Generic version: allocates for all NR_CPUS. */
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long i;
+	void *ptr;
+
+	/* Leave at least 16k for __alloc_percpu */
+	pool_size = ALIGN(__per_cpu_end - __per_cpu_start + 16384,
+			  SMP_CACHE_BYTES);
+	/* Plenty of memory?  1GB = 64k per-cpu. */
+	pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
+			(long long)pool_size);
+#ifdef PERCPU_POOL_MAX
+	if (pool_size > PERCPU_POOL_MAX)
+		pool_size = PERCPU_POOL_MAX;
+#endif
+
+	ptr = alloc_bootmem(pool_size * NR_CPUS);
+
+	/* Don't panic yet, they won't see it */
+	if (__per_cpu_end - __per_cpu_start > pool_size)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++, ptr += pool_size) {
+		__per_cpu_offset[i] = ptr - (void *)__per_cpu_start;
+		/* Copy section for each CPU (we discard the original) */
+		memcpy(__per_cpu_start + __per_cpu_offset[i],
+		       __per_cpu_start,
+		       __per_cpu_end - __per_cpu_start);
+	}
+}
+#endif
+
+static int init_alloc_percpu(void)
+{
+	printk("Per-cpu data: %Zu of %lu bytes\n",
+	       __per_cpu_end - __per_cpu_start, pool_size);
+
+	if (__per_cpu_end - __per_cpu_start > pool_size)
+		panic("Too much per-cpu data.\n");
+
+	pcpu = kmalloc(sizeof(*pcpu)+sizeof(pcpu->size[0])*PERCPU_INIT_BLOCKS,
+		       GFP_KERNEL);
+	pcpu->num_allocated = PERCPU_INIT_BLOCKS;
+	pcpu->num_used = 2;
+	pcpu->size[0] = -(__per_cpu_end - __per_cpu_start);
+	pcpu->size[1] = pool_size-(__per_cpu_end - __per_cpu_start);
+
+	return 0;
+}
+
+__initcall(init_alloc_percpu);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/mm/slab.c .26261-linux-2.5.69.updated/mm/slab.c
--- .26261-linux-2.5.69/mm/slab.c	2003-04-20 18:05:16.000000000 +1000
+++ .26261-linux-2.5.69.updated/mm/slab.c	2003-05-06 18:14:22.000000000 +1000
@@ -1902,54 +1902,6 @@ void * kmalloc (size_t size, int flags)
 	return NULL;
 }
 
-#ifdef CONFIG_SMP
-/**
- * kmalloc_percpu - allocate one copy of the object for every present
- * cpu in the system.
- * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
- * macros only.
- *
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- * The @flags argument may be one of:
- *
- * %GFP_USER - Allocate memory on behalf of user.  May sleep.
- *
- * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
- *
- * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
- */
-void *
-kmalloc_percpu(size_t size, int flags)
-{
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
-
-	if (!pdata)
-		return NULL;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		pdata->ptrs[i] = kmalloc(size, flags);
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-	}
-
-	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
-	kfree(pdata);
-	return NULL;
-}
-#endif
-
 /**
  * kmem_cache_free - Deallocate an object
  * @cachep: The cache the allocation was from.
@@ -1988,28 +1940,6 @@ void kfree (const void *objp)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_SMP
-/**
- * kfree_percpu - free previously allocated percpu memory
- * @objp: pointer returned by kmalloc_percpu.
- *
- * Don't free memory not originally allocated by kmalloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-kfree_percpu(const void *objp)
-{
-	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(p->ptrs[i]);
-	}
-}
-#endif
-
 unsigned int kmem_cache_size(kmem_cache_t *cachep)
 {
 	unsigned int objlen = cachep->objsize;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv4/af_inet.c .26261-linux-2.5.69.updated/net/ipv4/af_inet.c
--- .26261-linux-2.5.69/net/ipv4/af_inet.c	2003-05-05 12:37:14.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv4/af_inet.c	2003-05-06 18:14:22.000000000 +1000
@@ -1061,54 +1061,21 @@ static struct inet_protocol icmp_protoco
 
 static int __init init_ipv4_mibs(void)
 {
-	int i;
-
-	net_statistics[0] =
-	    kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
-	net_statistics[1] =
-	    kmalloc_percpu(sizeof (struct linux_mib), GFP_KERNEL);
-	ip_statistics[0] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
-	ip_statistics[1] = kmalloc_percpu(sizeof (struct ip_mib), GFP_KERNEL);
-	icmp_statistics[0] =
-	    kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
-	icmp_statistics[1] =
-	    kmalloc_percpu(sizeof (struct icmp_mib), GFP_KERNEL);
-	tcp_statistics[0] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
-	tcp_statistics[1] = kmalloc_percpu(sizeof (struct tcp_mib), GFP_KERNEL);
-	udp_statistics[0] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
-	udp_statistics[1] = kmalloc_percpu(sizeof (struct udp_mib), GFP_KERNEL);
+	net_statistics[0] = kmalloc_percpu(struct linux_mib);
+	net_statistics[1] = kmalloc_percpu(struct linux_mib);
+	ip_statistics[0] = kmalloc_percpu(struct ip_mib);
+	ip_statistics[1] = kmalloc_percpu(struct ip_mib);
+	icmp_statistics[0] = kmalloc_percpu(struct icmp_mib);
+	icmp_statistics[1] = kmalloc_percpu(struct icmp_mib);
+	tcp_statistics[0] = kmalloc_percpu(struct tcp_mib);
+	tcp_statistics[1] = kmalloc_percpu(struct tcp_mib);
+	udp_statistics[0] = kmalloc_percpu(struct udp_mib);
+	udp_statistics[1] = kmalloc_percpu(struct udp_mib);
 	if (!
 	    (net_statistics[0] && net_statistics[1] && ip_statistics[0]
 	     && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
 	     && udp_statistics[0] && udp_statistics[1]))
 		return -ENOMEM;
-
-	/* Set all the per cpu copies of the mibs to zero */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(net_statistics[0], i), 0,
-			       sizeof (struct linux_mib));
-			memset(per_cpu_ptr(net_statistics[1], i), 0,
-			       sizeof (struct linux_mib));
-			memset(per_cpu_ptr(ip_statistics[0], i), 0,
-			       sizeof (struct ip_mib));
-			memset(per_cpu_ptr(ip_statistics[1], i), 0,
-			       sizeof (struct ip_mib));
-			memset(per_cpu_ptr(icmp_statistics[0], i), 0,
-			       sizeof (struct icmp_mib));
-			memset(per_cpu_ptr(icmp_statistics[1], i), 0,
-			       sizeof (struct icmp_mib));
-			memset(per_cpu_ptr(tcp_statistics[0], i), 0,
-			       sizeof (struct tcp_mib));
-			memset(per_cpu_ptr(tcp_statistics[1], i), 0,
-			       sizeof (struct tcp_mib));
-			memset(per_cpu_ptr(udp_statistics[0], i), 0,
-			       sizeof (struct udp_mib));
-			memset(per_cpu_ptr(udp_statistics[1], i), 0,
-			       sizeof (struct udp_mib));
-		}
-	}
-
 	(void) tcp_mib_init();
 
 	return 0;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv4/route.c .26261-linux-2.5.69.updated/net/ipv4/route.c
--- .26261-linux-2.5.69/net/ipv4/route.c	2003-05-05 12:37:14.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv4/route.c	2003-05-06 18:14:22.000000000 +1000
@@ -2689,17 +2689,9 @@ int __init ip_rt_init(void)
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
-	rt_cache_stat = kmalloc_percpu(sizeof (struct rt_cache_stat),
-					GFP_KERNEL);
+	rt_cache_stat = kmalloc_percpu(struct rt_cache_stat);
 	if (!rt_cache_stat) 
 		goto out_enomem1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(rt_cache_stat, i), 0,
-			       sizeof (struct rt_cache_stat));
-		}
-	}
-
 	devinet_init();
 	ip_fib_init();
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv6/af_inet6.c .26261-linux-2.5.69.updated/net/ipv6/af_inet6.c
--- .26261-linux-2.5.69/net/ipv6/af_inet6.c	2003-05-05 12:37:15.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv6/af_inet6.c	2003-05-06 18:14:22.000000000 +1000
@@ -633,28 +633,20 @@ inet6_unregister_protosw(struct inet_pro
 }
 
 int
-snmp6_mib_init(void *ptr[2], size_t mibsize)
+snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign)
 {
 	int i;
 
 	if (ptr == NULL)
 		return -EINVAL;
 
-	ptr[0] = kmalloc_percpu(mibsize, GFP_KERNEL);
+	ptr[0] = __alloc_percpu(mibsize, mibalign);
 	if (!ptr[0])
 		goto err0;
 
-	ptr[1] = kmalloc_percpu(mibsize, GFP_KERNEL);
+	ptr[1] = __alloc_percpu(mibsize, mibalign);
 	if (!ptr[1])
 		goto err1;
-
-	/* Zero percpu version of the mibs */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(ptr[0], i), 0, mibsize);
-			memset(per_cpu_ptr(ptr[1], i), 0, mibsize);
-		}
-	}
 	return 0;
 
 err1:
@@ -676,11 +668,14 @@ snmp6_mib_free(void *ptr[2])
 
 static int __init init_ipv6_mibs(void)
 {
-	if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib)) < 0)
+	if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipv6_mib),
+			   __alignof__(struct ipv6_mib)) < 0)
 		goto err_ip_mib;
-	if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib)) < 0)
+	if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
+			   __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp_mib;
-	if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0)
+	if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
+			   __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
 	return 0;
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/ipv6/proc.c .26261-linux-2.5.69.updated/net/ipv6/proc.c
--- .26261-linux-2.5.69/net/ipv6/proc.c	2003-05-05 12:37:15.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/ipv6/proc.c	2003-05-06 18:14:22.000000000 +1000
@@ -226,7 +226,7 @@ int snmp6_register_dev(struct inet6_dev 
 	if (!idev || !idev->dev)
 		return -EINVAL;
 
-	if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib)) < 0)
+	if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp;
 
 #ifdef CONFIG_PROC_FS
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .26261-linux-2.5.69/net/sctp/protocol.c .26261-linux-2.5.69.updated/net/sctp/protocol.c
--- .26261-linux-2.5.69/net/sctp/protocol.c	2003-05-05 12:37:16.000000000 +1000
+++ .26261-linux-2.5.69.updated/net/sctp/protocol.c	2003-05-06 18:14:23.000000000 +1000
@@ -855,26 +855,14 @@ static int __init init_sctp_mibs(void)
 {
 	int i;
 
-	sctp_statistics[0] = kmalloc_percpu(sizeof (struct sctp_mib),
-					    GFP_KERNEL);
+	sctp_statistics[0] = kmalloc_percpu(struct sctp_mib);
 	if (!sctp_statistics[0])
 		return -ENOMEM;
-	sctp_statistics[1] = kmalloc_percpu(sizeof (struct sctp_mib),
-					    GFP_KERNEL);
+	sctp_statistics[1] = kmalloc_percpu(struct sctp_mib);
 	if (!sctp_statistics[1]) {
 		kfree_percpu(sctp_statistics[0]);
 		return -ENOMEM;
 	}
-
-	/* Zero all percpu versions of the mibs */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			memset(per_cpu_ptr(sctp_statistics[0], i), 0,
-					sizeof (struct sctp_mib));
-			memset(per_cpu_ptr(sctp_statistics[1], i), 0,
-					sizeof (struct sctp_mib));
-		}
-	}
 	return 0;
 
 }
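
(For reference, a usage sketch of the interface the patch above
introduces; the my_stats example is hypothetical:)

	struct my_stats {
		unsigned long events;
	};
	static struct my_stats *stats;

	static int __init my_init(void)
	{
		stats = kmalloc_percpu(struct my_stats); /* zeroed on all cpus */
		if (!stats)
			return -ENOMEM;
		return 0;
	}

	static void my_event(void)
	{
		/* get_cpu_ptr() disables preemption around the access */
		get_cpu_ptr(stats)->events++;
		put_cpu_ptr(stats);
	}

	static unsigned long my_total(void)
	{
		unsigned long sum = 0;
		int i;

		for (i = 0; i < NR_CPUS; i++)
			sum += per_cpu_ptr(stats, i)->events;
		return sum;	/* kfree_percpu(stats) when done with it */
	}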


* Re: [PATCH] kmalloc_percpu
  2003-05-06  7:34                             ` Andrew Morton
@ 2003-05-06  8:42                               ` William Lee Irwin III
  2003-05-06 14:38                               ` Martin J. Bligh
  1 sibling, 0 replies; 57+ messages in thread
From: William Lee Irwin III @ 2003-05-06  8:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, rusty, dipankar, linux-kernel

"David S. Miller" <davem@redhat.com> wrote:
>> As just pointed out by dipankar the only issue is NUMA...
>> so it has to be something more sophisticated than simply
>> kmalloc()[smp_processor_id];

On Tue, May 06, 2003 at 12:34:12AM -0700, Andrew Morton wrote:
> The proposed patch doesn't do anything about that either.
> +	ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
> So yes, we need an api which could be extended to use node-affine memory at
> some time in the future.  I think we have that.

IIRC that can be overridden; I wrote something to do node-local per-cpu
areas for i386 for some prior incarnations of per-cpu stuff and even
posted it, and this looks like it bootstraps at the same time (before
free_area_init_core() that is) and has the same override hook.


-- wli


* Re: [PATCH] kmalloc_percpu
  2003-05-06  8:28                       ` Rusty Russell
@ 2003-05-06  8:47                         ` Andrew Morton
  2003-05-07  1:57                           ` Rusty Russell
  2003-05-06 14:41                         ` Martin J. Bligh
  1 sibling, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2003-05-06  8:47 UTC (permalink / raw)
  To: Rusty Russell; +Cc: dipankar, linux-kernel

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
> is fast, space-efficient and numa-aware, since the code is needed
> anyway.

I don't believe that kmalloc_percpu() itself needs to be fast, as you say.

The code is _not_ NUMA-aware.  Is it?

> How about a compromise like the following (scaled with mem)?
> Untested, but you get the idea...

> +	/* Plenty of memory?  1GB = 64k per-cpu. */
> +	pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
> +			(long long)pool_size);

On 64GB 32-way that's 128MB of lowmem.  Unpopular.  I'd settle for a __setup
thingy here, and a printk when the memory runs out.
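
(A minimal sketch of such a __setup override, with an illustrative
parameter name, and ignoring for now that __setup currently runs too
late for bootmem sizing, as Rusty notes below:)

	static int __init percpu_pool_setup(char *str)
	{
		pool_size = simple_strtoul(str, NULL, 0);
		return 1;
	}
	__setup("percpu_pool=", percpu_pool_setup);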

btw, what's wrong with leaving kmalloc_percpu() as-is, and only using this
allocator for DEFINE_PER_CPU()?




* Re: [PATCH] kmalloc_percpu
  2003-05-06  8:03     ` Rusty Russell
@ 2003-05-06  9:23       ` David S. Miller
  2003-05-06  9:34       ` Ravikiran G Thirumalai
  1 sibling, 0 replies; 57+ messages in thread
From: David S. Miller @ 2003-05-06  9:23 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ravikiran G Thirumalai, dipankar, linux-kernel, akpm

On Tue, 2003-05-06 at 01:03, Rusty Russell wrote:
> And #3 is, in fact, the one I care about.  The extra memory reference
> is already probably cache-hot (all static per-cpu accesses use it), and might be
> in a register on some archs.
> 
> Doesn't break with sparse CPU #s, but yes, it is inefficient.

Maybe we should treat this stuff like architecture ABIs that
have a "small data" section?

Before elaborating, let me state that if we are going to use this
for things like disk stats and per-interface ipv6 ICMP stats, a
fixed size pool is absolutely unacceptable.  People are configuring up
thousands upon thousands of VLAN net devices today, and if IPV6 is
enabled each of those will get a per-cpu ICMP stats block allocated.
And as Andrew said, there can be thousands of block devices.

Therefore, for these per-cpu applications there must be no limits built
into the mechanism.

Now, what I propose is that kmalloc_percpu() keeps its current
implementation.  Then we have a kmalloc_percpu_small() that must only
be invoked during module init and we limit the size to some reasonable
amount.  This kmalloc_percpu_small() uses the 32K pool or whatever, as
does DECLARE_PER_CPU in a module.

How about this?
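
(Roughly, the split being proposed; the signatures are illustrative and
nothing below exists in the tree:)

	/* Unbounded: keeps the current array-of-pointers implementation,
	 * callable at any time, e.g. for per-VLAN or per-disk objects. */
	void *kmalloc_percpu(size_t size, int flags);

	/* Small, offset-based pool; only legal from mod->init(), so the
	 * fixed reservation cannot be exhausted at fops->open() time. */
	void *kmalloc_percpu_small(size_t size, size_t align);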

-- 
David S. Miller <davem@redhat.com>


* Re: [PATCH] kmalloc_percpu
  2003-05-06  8:03     ` Rusty Russell
  2003-05-06  9:23       ` David S. Miller
@ 2003-05-06  9:34       ` Ravikiran G Thirumalai
  2003-05-06  9:38         ` Dipankar Sarma
  2003-05-07  2:14         ` Rusty Russell
  1 sibling, 2 replies; 57+ messages in thread
From: Ravikiran G Thirumalai @ 2003-05-06  9:34 UTC (permalink / raw)
  To: Rusty Russell; +Cc: dipankar, linux-kernel, akpm

On Tue, May 06, 2003 at 06:03:15PM +1000, Rusty Russell wrote:
> In message <20030506050744.GA29352@in.ibm.com> you write:
> .. 
> Doesn't break with sparse CPU #s, but yes, it is inefficient.
> 

If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32-bit
folks won't like it), and if you change CONFIG_NR_CPUS to, say, 2 and we
have cpuid 4 on a 2-way, you break, right?  If we have to address these
issues at all, why can't we use the simpler kmalloc_percpu patch which I
posted in the morning and avoid so much complexity and arch dependency?

Thanks,
Kiran



* Re: [PATCH] kmalloc_percpu
  2003-05-06  9:34       ` Ravikiran G Thirumalai
@ 2003-05-06  9:38         ` Dipankar Sarma
  2003-05-07  2:14         ` Rusty Russell
  1 sibling, 0 replies; 57+ messages in thread
From: Dipankar Sarma @ 2003-05-06  9:38 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Rusty Russell, linux-kernel, akpm

On Tue, May 06, 2003 at 03:04:11PM +0530, Ravikiran G Thirumalai wrote:
> > Doesn't break with sparse CPU #s, but yes, it is inefficient.
> > 
> 
> If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32-bit
> folks won't like it), and if you change CONFIG_NR_CPUS to, say, 2 and we
> have cpuid 4 on a 2-way, you break, right?  If we have to address these
> issues at all, why can't we use the simpler kmalloc_percpu patch which I
> posted in the morning and avoid so much complexity and arch dependency?

We can have something like that for !CONFIG_NUMA and a NUMA-aware
allocator with additional dereferencing cost for CONFIG_NUMA.
Hopefully gains from numa-awareness will more than offset
dereferencing costs.

Thanks
Dipankar


* Re: [PATCH] kmalloc_percpu
  2003-05-06  7:25                         ` Jens Axboe
@ 2003-05-06 10:41                           ` Ingo Oeser
  0 siblings, 0 replies; 57+ messages in thread
From: Ingo Oeser @ 2003-05-06 10:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, David S. Miller, rusty, dipankar, linux-kernel

On Tue, May 06, 2003 at 09:25:00AM +0200, Jens Axboe wrote:
> On Mon, May 05 2003, Andrew Morton wrote:
> > "David S. Miller" <davem@redhat.com> wrote:
> > The disk_stats structure has an "in flight" member.  If we don't have proper
> > locking around that, disks will appear to have -3 requests in flight for all
> > time, which would look a tad odd.
> 
> So check for < 0 in flight? I totally agree with davem here.

If the disk_stats structure will never be accurate, it will show
nonsense values. If it shows nonsense values, the values have no
meaning anymore and we could remove them altogether.

If some additions/subtractions come in later it will not
matter, but if the value we add to or subtract from is already
wrong, then the error will propagate too much.

Has somebody analyzed this? Can we tell userspace a maximum error
value? Is the maximum error value acceptable?

If the above questions are all answered with "no", then I (and
several sysadmins) would prefer to just rip out the stats or make
them a compile-time option and be exact, if we really want them.

What about that?

Most people just want them per system to measure their
IO bandwidth and they want the spots (=per disk stats), just
to analyze the problematic cases.

For the real performance, they could always compile out the per
disk stats, once they are satisfied with the overall bandwidth.

Regards

Ingo Oeser
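
(A sketch of the compile-time option idea; the config symbol and stats
member are made up for illustration:)

	#ifdef CONFIG_DISK_STATS
	#define disk_stat_inc(d, field)	((d)->stats.field++)
	#else
	#define disk_stat_inc(d, field)	do { } while (0)
	#endif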


* Re: [PATCH] kmalloc_percpu
  2003-05-06  7:34                             ` Andrew Morton
  2003-05-06  8:42                               ` William Lee Irwin III
@ 2003-05-06 14:38                               ` Martin J. Bligh
  1 sibling, 0 replies; 57+ messages in thread
From: Martin J. Bligh @ 2003-05-06 14:38 UTC (permalink / raw)
  To: Andrew Morton, David S. Miller; +Cc: rusty, dipankar, linux-kernel

>> As just pointed out by dipankar the only issue is NUMA...
>> so it has to be something more sophisticated than simply
>> kmalloc()[smp_processor_id];
> 
> The proposed patch doesn't do anything about that either.
> 
> +	ptr = alloc_bootmem(PERCPU_POOL_SIZE * NR_CPUS);
> 
> So yes, we need an api which could be extended to use node-affine memory at
> some time in the future.  I think we have that.

You can just call alloc_bootmem_node for each CPU instead. It doesn't 
work on i386 at the moment (well, it'll work but come out of node 0), 
but it probably ought to.

M.
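
A sketch of that, assuming the 2.5-era alloc_bootmem_node() and
cpu_to_node() interfaces (pool_size, __per_cpu_offset and the copy loop
are as in Rusty's patch; error handling omitted):

	static void __init setup_per_cpu_areas(void)
	{
		unsigned int i;
		char *ptr;

		for (i = 0; i < NR_CPUS; i++) {
			ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)),
						 pool_size);
			__per_cpu_offset[i] = ptr - __per_cpu_start;
			/* every cpu starts from the static section's data */
			memcpy(ptr, __per_cpu_start,
			       __per_cpu_end - __per_cpu_start);
		}
	}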



* Re: [PATCH] kmalloc_percpu
  2003-05-06  8:28                       ` Rusty Russell
  2003-05-06  8:47                         ` Andrew Morton
@ 2003-05-06 14:41                         ` Martin J. Bligh
  1 sibling, 0 replies; 57+ messages in thread
From: Martin J. Bligh @ 2003-05-06 14:41 UTC (permalink / raw)
  To: Rusty Russell, Andrew Morton; +Cc: dipankar, linux-kernel

--On Tuesday, May 06, 2003 18:28:24 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> +	pool_size = max(((long long)num_physpages << PAGE_SHIFT) / 16384,
> +			(long long)pool_size);

I like the idea of scaling it with the machine, but can you scale this off 
lowmem instead? Something like max_low_pfn or similar.

Thanks,

M.




* Re: [PATCH] kmalloc_percpu
  2003-05-06  6:57                       ` Andrew Morton
  2003-05-06  7:25                         ` Jens Axboe
@ 2003-05-06 16:05                         ` Bryan O'Sullivan
  1 sibling, 0 replies; 57+ messages in thread
From: Bryan O'Sullivan @ 2003-05-06 16:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: David S. Miller, rusty, dipankar, linux-kernel

On Mon, 2003-05-05 at 23:57, Andrew Morton wrote:

> The disk_stats structure has an "in flight" member.  If we don't have proper
> locking around that, disks will appear to have -3 requests in flight for all
> time, which would look a tad odd.

For what it's worth, the disk stats patch that vendors have been
shipping forever with 2.4 (and that Christoph pushed into 2.4.20) has
this problem, so at least people will be used to it.

	<b



* Re: [PATCH] kmalloc_percpu
  2003-05-06  8:47                         ` Andrew Morton
@ 2003-05-07  1:57                           ` Rusty Russell
  2003-05-07  2:41                             ` William Lee Irwin III
  2003-05-07  5:37                             ` Andrew Morton
  0 siblings, 2 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-07  1:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel, Bartlomiej Zolnierkiewicz, paulus

In message <20030506014745.02508f0d.akpm@digeo.com> you write:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > It's a tradeoff, but I think it's worth it for a kmalloc_percpu which
> > is fast, space-efficient and numa-aware, since the code is needed
> > anyway.
> 
> I don't believe that kmalloc_percpu() itself needs to be fast, as you say.
> 
> The code is _not_ NUMA-aware.  Is it?

No, it's arch-overridable.  You know me, I don't mess with
arch-specific stuff. 8)

> On 64GB 32-way that's 128MB of lowmem.  Unpopular.  I'd settle for a __setup
> thingy here, and a printk when the memory runs out.

Currently __setup is run far too late.  Fortunately Bartlomiej has
been working on moving __setup earlier, for other reasons (two-stage
parameter parsing).  In this case, it'd need to be before arch setup,
hmm...

> btw, what's wrong with leaving kmalloc_percpu() as-is, and only using this
> allocator for DEFINE_PER_CPU()?

I figured that since the allocator is going to be there anyway, it
made sense to express kmalloc_percpu() in those terms.  If you think
the price is too high, I can respect that.

The more people use the __per_cpu_offset instead of
smp_processor_id(), the cheaper it gets, even to the stage where some
archs might want to express smp_processor_id() in terms of
__per_cpu_offset, rather than vice-versa.  Given that it's also more
space efficient (each var doesn't take one cacheline) too, I'd
recommend __alloc_percpu() of eg. single ints for long-lived objects
where I wouldn't recommend the current kmalloc_percpu().
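
For instance, an arch that keeps this cpu's __per_cpu_offset entry in a
reserved register could derive the cpu number from a per-cpu variable
rather than the other way around.  A purely hypothetical sketch, using
the RELOC_HIDE/name##__per_cpu conventions from the patches in this
thread (__my_cpu_offset stands for that arch register):

	DEFINE_PER_CPU(int, cpu_number);	/* set once at boot */

	#define smp_processor_id() \
		(*RELOC_HIDE(&cpu_number__per_cpu, __my_cpu_offset))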

Paul Mackerras points out that we could get the numa-aware allocation
plus "one big alloc" properties by playing with page mappings: reserve
1MB of virtual address, and map more pages as required.  I didn't
think that we'd need that yet, though.

So, numerous options, and you're smarter than me, so you can decide 8)
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


* Re: [PATCH] kmalloc_percpu
  2003-05-06  9:34       ` Ravikiran G Thirumalai
  2003-05-06  9:38         ` Dipankar Sarma
@ 2003-05-07  2:14         ` Rusty Russell
  1 sibling, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-07  2:14 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: dipankar, linux-kernel, akpm

In message <20030506093411.GB29352@in.ibm.com> you write:
> On Tue, May 06, 2003 at 06:03:15PM +1000, Rusty Russell wrote:
> > In message <20030506050744.GA29352@in.ibm.com> you write:
> > .. 
> > Doesn't break with sparse CPU #s, but yes, it is inefficient.
> > 
> 
> If you don't reduce NR_CPUS with CONFIG_NR_CPUS, you waste space (32-bit
> folks won't like it), and if you change CONFIG_NR_CPUS to, say, 2 and we
> have cpuid 4 on a 2-way, you break, right?

You misunderstand, I think: on *all* archs, smp_processor_id() <
NR_CPUS is an axiom.  Always has been.

The generic solution involved cpu_possible(), but it's too early in
boot for that: attempts to make it a requirement have to change
numerous archs.  Not that I'm against that change...

Of course, an arch can override this default allocator, and may have
more info with which to do optimal allocation.

> If we have to address these issues at all, why can't we use the
> simpler kmalloc_percpu patch which I posted in the morning and avoid
> so much complexity and arch dependency?

Because we still need the __alloc_percpu for DECLARE_PER_CPU support
in modules 8(

Hope that clarifies,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


* Re: [PATCH] kmalloc_percpu
  2003-05-07  1:57                           ` Rusty Russell
@ 2003-05-07  2:41                             ` William Lee Irwin III
  2003-05-07  4:03                               ` Paul Mackerras
  2003-05-07  4:15                               ` Rusty Russell
  2003-05-07  5:37                             ` Andrew Morton
  1 sibling, 2 replies; 57+ messages in thread
From: William Lee Irwin III @ 2003-05-07  2:41 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, dipankar, linux-kernel, Bartlomiej Zolnierkiewicz, paulus

On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> Paul Mackerras points out that we could get the numa-aware allocation
> plus "one big alloc" properties by playing with page mappings: reserve
> 1MB of virtual address, and map more pages as required.  I didn't
> think that we'd need that yet, though.

This is somewhat painful to do (though possible) on i386. The cost of
task migration would increase at the very least.


-- wli


* Re: [PATCH] kmalloc_percpu
  2003-05-07  2:41                             ` William Lee Irwin III
@ 2003-05-07  4:03                               ` Paul Mackerras
  2003-05-07  4:22                                 ` William Lee Irwin III
  2003-05-07  4:15                               ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: Paul Mackerras @ 2003-05-07  4:03 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rusty Russell, Andrew Morton, dipankar, linux-kernel,
	Bartlomiej Zolnierkiewicz

William Lee Irwin III writes:
> On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> > Paul Mackerras points out that we could get the numa-aware allocation
> > plus "one big alloc" properties by playing with page mappings: reserve
> > 1MB of virtual address, and map more pages as required.  I didn't
> > think that we'd need that yet, though.
> 
> This is somewhat painful to do (though possible) on i386. The cost of
> task migration would increase at the very least.

Explain?  We're not talking about having the same address mapped
differently on different CPUs, we're just talking about something
which is the equivalent of a sparsely-instantiated vmalloc.

Paul.


* Re: [PATCH] kmalloc_percpu
  2003-05-07  5:19                                     ` William Lee Irwin III
@ 2003-05-07  4:10                                       ` Martin J. Bligh
  2003-05-07 12:13                                         ` William Lee Irwin III
  0 siblings, 1 reply; 57+ messages in thread
From: Martin J. Bligh @ 2003-05-07  4:10 UTC (permalink / raw)
  To: William Lee Irwin III, Paul Mackerras
  Cc: Rusty Russell, Andrew Morton, dipankar, linux-kernel,
	Bartlomiej Zolnierkiewicz

>>> Same address mapped differently on different cpus is what I thought
>>> you meant. It does make sense, and besides, it only really matters
>>> when the thing is being switched in, so I think it's not such a big
>>> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
>>> if they don't match at load-time then reset the kernel pmd's pgd entry
>>> in the per-thread pgd at the top level. x86 blows away the TLB at the
> 
> On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
>> Having to have a pgdir per thread would be a bit sucky, wouldn't it?
> 
> Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
> hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
> Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
> faults (which are already handled properly for non-PAE vmallocspace).
> There might be other reasons to do it, like reducing the virtualspace
> overhead of the atomic kmap area, but it's not really time yet.

Even if the space isn't a problem, having a full TLB flush on thread 
context switch sounds sucky. Per-node is sufficient for most things,
and is much cheaper (update on cross-node migrate). 

M.



* Re: [PATCH] kmalloc_percpu
  2003-05-07  2:41                             ` William Lee Irwin III
  2003-05-07  4:03                               ` Paul Mackerras
@ 2003-05-07  4:15                               ` Rusty Russell
  1 sibling, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-07  4:15 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Andrew Morton, dipankar, linux-kernel, Bartlomiej Zolnierkiewicz, paulus

In message <20030507024135.GW8978@holomorphy.com> you write:
> On Wed, May 07, 2003 at 11:57:13AM +1000, Rusty Russell wrote:
> > Paul Mackerras points out that we could get the numa-aware allocation
> > plus "one big alloc" properties by playing with page mappings: reserve
> > 1MB of virtual address, and map more pages as required.  I didn't
> > think that we'd need that yet, though.
> 
> This is somewhat painful to do (though possible) on i386. The cost of
> task migration would increase at the very least.

You misunderstand I think.  All cpus would have the same mappings.

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.


* Re: [PATCH] kmalloc_percpu
  2003-05-07  4:03                               ` Paul Mackerras
@ 2003-05-07  4:22                                 ` William Lee Irwin III
  2003-05-07  4:56                                   ` Paul Mackerras
  0 siblings, 1 reply; 57+ messages in thread
From: William Lee Irwin III @ 2003-05-07  4:22 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Rusty Russell, Andrew Morton, dipankar, linux-kernel,
	Bartlomiej Zolnierkiewicz

William Lee Irwin III writes:
>> This is somewhat painful to do (though possible) on i386. The cost of
>> task migration would increase at the very least.

On Wed, May 07, 2003 at 02:03:46PM +1000, Paul Mackerras wrote:
> Explain?  We're not talking about having the same address mapped
> differently on different CPUs, we're just talking about something
> which is the equivalent of a sparsely-instantiated vmalloc.

Same address mapped differently on different cpus is what I thought
you meant. It does make sense, and besides, it only really matters
when the thing is being switched in, so I think it's not such a big
deal. e.g. mark per-thread mm context with the cpu it was prepped for,
if they don't match at load-time then reset the kernel pmd's pgd entry
in the per-thread pgd at the top level. x86 blows away the TLB at the
drop of a hat anyway, and it wouldn't even introduce an extra whole-TLB
flush. That said, I'm not terribly interested in hacking all that up
even though the proper hooks to make it pure arch code appear to exist.

The vmallocspace bit is easier, though the virtualspace reservation
could get uncomfortably large depending on how much is crammed in there.
That can go node-local also. I guess it has some runtime arithmetic
overhead vs. the per-cpu TLB entries in exchange for less complex code.

-- wli


* Re: [PATCH] kmalloc_percpu
  2003-05-07  4:22                                 ` William Lee Irwin III
@ 2003-05-07  4:56                                   ` Paul Mackerras
  2003-05-07  5:19                                     ` William Lee Irwin III
  0 siblings, 1 reply; 57+ messages in thread
From: Paul Mackerras @ 2003-05-07  4:56 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Rusty Russell, Andrew Morton, dipankar, linux-kernel,
	Bartlomiej Zolnierkiewicz

William Lee Irwin III writes:

> Same address mapped differently on different cpus is what I thought
> you meant. It does make sense, and besides, it only really matters
> when the thing is being switched in, so I think it's not such a big
> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
> if they don't match at load-time then reset the kernel pmd's pgd entry
> in the per-thread pgd at the top level. x86 blows away the TLB at the

Having to have a pgdir per thread would be a bit sucky, wouldn't it?

On PPCs with the hash-table based MMU, if we wanted to do different
mappings of the same address on different CPUs, we would have to have
a separate hash table for each CPU, which would chew up a lot of
memory.  On PPC64 machines with logical partitioning, I don't think
the hypervisor would let you have a separate hash table for each CPU.
On the flip side, PPC can afford a register to point to a per-cpu data
area more easily than x86 can.

> The vmallocspace bit is easier, though the virtualspace reservation
> could get uncomfortably large depending on how much is crammed in there.
> That can go node-local also. I guess it has some runtime arithmetic
> overhead vs. the per-cpu TLB entries in exchange for less complex code.

I was thinking of something like 64kB per cpu times 32 cpus = 2MB.
Anyway, 32-bit machines with > 8 cpus are a pretty rare corner case.
On 64-bit machines we have enough virtual space to give each cpu
gigabytes of per-cpu data if we want to.

Paul.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  4:56                                   ` Paul Mackerras
@ 2003-05-07  5:19                                     ` William Lee Irwin III
  2003-05-07  4:10                                       ` Martin J. Bligh
  0 siblings, 1 reply; 57+ messages in thread
From: William Lee Irwin III @ 2003-05-07  5:19 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Rusty Russell, Andrew Morton, dipankar, linux-kernel,
	Bartlomiej Zolnierkiewicz

William Lee Irwin III writes:
>> Same address mapped differently on different cpus is what I thought
>> you meant. It does make sense, and besides, it only really matters
>> when the thing is being switched in, so I think it's not such a big
>> deal. e.g. mark per-thread mm context with the cpu it was prepped for,
>> if they don't match at load-time then reset the kernel pmd's pgd entry
>> in the per-thread pgd at the top level. x86 blows away the TLB at the

On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> Having to have a pgdir per thread would be a bit sucky, wouldn't it?

Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
faults (which are already handled properly for non-PAE vmallocspace).
There might be other reasons to do it, like reducing the virtualspace
overhead of the atomic kmap area, but it's not really time yet.
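
Concretely, something like this sketch (per_cpu_kernel_pmd() is
hypothetical; PAE's top level is four 8-byte entries, hence 32B):

struct pae_pgd {
	u64 entry[4];		/* 32B, small enough for thread_info */
	int prepped_cpu;	/* cpu whose kernel pmd is installed */
};

/* at mm load time: */
static void fixup_kernel_pgd(struct pae_pgd *pgd, int cpu)
{
	if (pgd->prepped_cpu != cpu) {
		/* repoint the kernel entry at this cpu's kernel pmd */
		pgd->entry[3] = __pa(per_cpu_kernel_pmd(cpu))
					| _PAGE_PRESENT;
		pgd->prepped_cpu = cpu;
	}
}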


On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> On PPCs with the hash-table based MMU, if we wanted to do different
> mappings of the same address on different CPUs, we would have to have
> a separate hash table for each CPU, which would chew up a lot of
> memory.  On PPC64 machines with logical partitioning, I don't think
> the hypervisor would let you have a separate hash table for each CPU.
> On the flip side, PPC can afford a register to point to a per-cpu data
> area more easily than x86 can.

Well, presumably it'd have to be abstracted so the mechanism isn't
exposed to core code if ever done. The arch code insulation appears to
be there to keep one going, though not necessarily accessors. Probably
the only reason to seriously think about it is that the arithmetic
shows up as a disincentive on the register-starved FPOS's I'm stuck on.


William Lee Irwin III writes:
>> The vmallocspace bit is easier, though the virtualspace reservation
>> could get uncomfortably large depending on how much is crammed in there.
>> That can go node-local also. I guess it has some runtime arithmetic
>> overhead vs. the per-cpu TLB entries in exchange for less complex code.

On Wed, May 07, 2003 at 02:56:53PM +1000, Paul Mackerras wrote:
> I was thinking of something like 64kB per cpu times 32 cpus = 2MB.
> Anyway, 32-bit machines with > 8 cpus are a pretty rare corner case.
> On 64-bit machines we have enough virtual space to give each cpu
> gigabytes of per-cpu data if we want to.

2MB vmallocspace is doable; it'd need to be bigger or per-something
besides cpus to hurt.


-- wli

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  1:57                           ` Rusty Russell
  2003-05-07  2:41                             ` William Lee Irwin III
@ 2003-05-07  5:37                             ` Andrew Morton
  2003-05-08  0:53                               ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: Andrew Morton @ 2003-05-07  5:37 UTC (permalink / raw)
  To: Rusty Russell; +Cc: dipankar, linux-kernel, B.Zolnierkiewicz, paulus

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> I figured that since the allocator is going to be there anyway, it
>  made sense to express kmalloc_percpu() in those terms.  If you think
>  the price is too high, I can respect that.

I suggest that we use your new mechanism to fix DEFINE_PERCPU() in modules,
and leave kmalloc_percpu() as it is for now.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-05  8:08 [PATCH] kmalloc_percpu Rusty Russell
  2003-05-05  8:47 ` Andrew Morton
@ 2003-05-07  5:51 ` Ravikiran G Thirumalai
  2003-05-07  6:16   ` Rusty Russell
  1 sibling, 1 reply; 57+ messages in thread
From: Ravikiran G Thirumalai @ 2003-05-07  5:51 UTC (permalink / raw)
  To: Rusty Russell; +Cc: akpm, Dipankar Sarma, linux-kernel

On Mon, May 05, 2003 at 08:17:07AM +0000, Rusty Russell wrote:
> Hi Andrew,
> 
> 	This is the kmalloc_percpu patch.  I have another patch which
> tests the allocator if you want to see that to.  This is the precursor
> to per-cpu stuff in modules, but also allows other space gains for
> structures which currently embed per-cpu arrays (eg. your fuzzy counters).
>

I tried to run a test to compare this implementation, but got an oops.
Here is the oops and the patch I was trying...  

btw, why the change from kmalloc_percpu(size) to kmalloc_percpu(type)?
You do kmalloc(sizeof (long)) for the usual kmalloc, but 
kmalloc_percpu(long) for percpu data... looks strange, no?

Thanks,
Kiran


diff -ruN -X dontdiff linux-2.5.69/mm/swap.c rusty-1-2.5.69/mm/swap.c
--- linux-2.5.69/mm/swap.c	Mon May  5 05:23:13 2003
+++ rusty-1-2.5.69/mm/swap.c	Wed May  7 10:01:00 2003
@@ -356,14 +356,14 @@
  */
 #define ACCT_THRESHOLD	max(16, NR_CPUS * 2)
 
-static DEFINE_PER_CPU(long, committed_space) = 0;
+static long *committed_space;
 
 void vm_acct_memory(long pages)
 {
 	long *local;
 
 	preempt_disable();
-	local = &__get_cpu_var(committed_space);
+	local = per_cpu_ptr(committed_space, smp_processor_id());
 	*local += pages;
 	if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
 		atomic_add(*local, &vm_committed_space);
@@ -371,6 +371,12 @@
 	}
 	preempt_enable();
 }
+
+static int init_committed_space(void)
+{
+	committed_space = kmalloc_percpu(long);
+	return committed_space;
+}
 #endif
 
 
@@ -390,4 +396,6 @@
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+	if (!init_committed_space)
+		panic("swap_setup\n");
 }



Unable to handle kernel paging request at virtual address 0115d040
 printing eip:
c01392ae
*pde = 1fccf001
*pte = 00000000
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<c01392ae>]    Not tainted
EFLAGS: 00010202
EIP is at vm_acct_memory+0x1e/0x60
eax: 00000000   ebx: dfcca4e0   ecx: 0115d040   edx: 00000001
esi: 00000001   edi: dff7e000   ebp: dff7fe3c   esp: dff7fc7c
ds: 007b   es: 007b   ss: 0068
Process init (pid: 1, threadinfo=dff7e000 task=dff7c040)
Stack: c013fa8f 00000001 dffe4120 00000000 dfcd8720 dff7fe3c dfcca4e0 dfcd3dc0
       c0155966 00000001 dff7e000 bffe0000 dff7e000 dff7fe3c c02ba305 00000000
       c0171013 dff7fe3c 00000000 00000000 00000000 ffffffff 00000003 00000000
Call Trace:
 [<c013fa8f>] vm_enough_memory+0xf/0xc0
 [<c0155966>] setup_arg_pages+0x96/0x190
 [<c0171013>] load_elf_binary+0x4c3/0x9f0
 [<c01376ea>] cache_grow+0x1fa/0x2d0
 [<c0134ab9>] __alloc_pages+0x89/0x2f0
 [<c01556c8>] copy_strings+0x238/0x250
 [<c0170b50>] load_elf_binary+0x0/0x9f0
 [<c0156975>] search_binary_handler+0xa5/0x1f0
 [<c0155707>] copy_strings_kernel+0x27/0x30
 [<c0156c3f>] do_execve+0x17f/0x200
 [<c015821e>] getname+0x5e/0xa0
 [<c01078f0>] sys_execve+0x30/0x70
 [<c0109087>] syscall_call+0x7/0xb
 [<c01051b1>] init+0x151/0x1d0
 [<c0105060>] init+0x0/0x1d0
 [<c01070f5>] kernel_thread_helper+0x5/0x10

Code: 8b 01 01 c2 89 11 83 fa 40 7f 09 89 d0 83 f8 c0 7d 11 eb 02
 <0>Kernel panic: Attempted to kill init!


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  5:51 ` Ravikiran G Thirumalai
@ 2003-05-07  6:16   ` Rusty Russell
  2003-05-08  7:42     ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 57+ messages in thread
From: Rusty Russell @ 2003-05-07  6:16 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: akpm, Dipankar Sarma, linux-kernel

In message <20030507055103.GA31797@in.ibm.com> you write:
> I tried to run a test to compare this implementation, but got an oops.
> Here is the oops and the patch I was trying...  

> +	if (!init_committed_space)

init_committed_space is a function.  You meant to call it 8)

> btw, why the change from kmalloc_percpu(size) to kmalloc_percpu(type)?
> You do kmalloc(sizeof (long)) for the usual kmalloc, but 
> kmalloc_percpu(long) for percpu data...looks strange no?

Yes, I'd probably want to change the name, if Andrew had agreed to the
concept.  But the type is very convenient, because you want to know
the alignment (kmalloc doesn't care, it just pads to a cacheline).
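
i.e. something like this (a sketch; assuming the underlying
__alloc_percpu takes a size and an alignment):

/* the type argument gives both size and alignment for free */
#define kmalloc_percpu(type) \
	((type *)__alloc_percpu(sizeof(type), __alignof__(type)))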

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  4:10                                       ` Martin J. Bligh
@ 2003-05-07 12:13                                         ` William Lee Irwin III
  0 siblings, 0 replies; 57+ messages in thread
From: William Lee Irwin III @ 2003-05-07 12:13 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Paul Mackerras, Rusty Russell, Andrew Morton, dipankar,
	linux-kernel, Bartlomiej Zolnierkiewicz

At some point in the past, my attribution was stripped from:
>> Not as bad as it initially sounds; on non-PAE i386, it's 4KB and would
>> hurt. On PAE i386, it's 32B and can be shoehorned, say, in thread_info.
>> Then the rest is just a per-cpu kernel pmd and properly handling vmalloc
>> faults (which are already handled properly for non-PAE vmallocspace).
>> There might be other reasons to do it, like reducing the virtualspace
>> overhead of the atomic kmap area, but it's not really time yet.

On Tue, May 06, 2003 at 09:10:24PM -0700, Martin J. Bligh wrote:
> Even if the space isn't a problem, having a full TLB flush on thread 
> context switch sounds sucky. Per-node is sufficient for most things,
> and is much cheaper (update on cross-node migrate). 

The scheme described requires no TLB flushing or pgd switching when
switching between threads in the same mm. Providing a true per-thread
mapping, like one for the stack, might require an invlpg or two at each
switch, but that wasn't mentioned here.


-- wli

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  5:37                             ` Andrew Morton
@ 2003-05-08  0:53                               ` Rusty Russell
  0 siblings, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-08  0:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel, B.Zolnierkiewicz, paulus

In message <20030506223751.2b49024a.akpm@digeo.com> you write:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> >
> > I figured that since the allocator is going to be there anyway, it
> >  made sense to express kmalloc_percpu() in those terms.  If you think
> >  the price is too high, I can respect that.
> 
> I suggest that we use your new mechanism to fix DEFINE_PERCPU() in modules,
> and leave kmalloc_percpu() as it is for now.

Sure, we can revisit it later if it makes sense.  After all, there's
supposed to be this freeze thing...

I'll prepare a new, minimal patch.

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-07  6:16   ` Rusty Russell
@ 2003-05-08  7:42     ` Ravikiran G Thirumalai
  2003-05-08  7:47       ` Rusty Russell
  0 siblings, 1 reply; 57+ messages in thread
From: Ravikiran G Thirumalai @ 2003-05-08  7:42 UTC (permalink / raw)
  To: Rusty Russell; +Cc: akpm, Dipankar Sarma, linux-kernel

On Wed, May 07, 2003 at 04:16:06PM +1000, Rusty Russell wrote:
> In message <20030507055103.GA31797@in.ibm.com> you write:
> > I tried to run a test to compare this implementation, but got an oops.
> > Here is the oops and the patch I was trying...  
> 
> > +	if (!init_committed_space)
> 
> init_committed_space is a function.  You meant to call it 8)

Yeah :( ... sorry about that.  I'd actually tested an earlier patch of
yours (one you sent to Dipankar) and I'd got an oops in
__alloc_percpu (same application, but there I had actually called
init_committed_space).  So when I got an oops again
with the newer patch, I just mailed it.
But still, even after actually calling init_committed_space
with the newer patch (the first one which you mailed to Andrew in this
thread) I get an oops at __alloc_percpu.

Here's the oops report... looks like
struct pcpu_block *pcpu is NULL...

Thanks,
Kiran

Unable to handle kernel NULL pointer dereference at virtual address 00000000
 printing eip:
c014a5de
*pde = 00104001
*pte = 00000000
Oops: 0000 [#1]
CPU:    0
EIP:    0060:[<c014a5de>]    Not tainted
EFLAGS: 00010246
EIP is at __alloc_percpu+0x4e/0x190
eax: c02f9cbc   ebx: c03a8328   ecx: 00000000   edx: 00000000
esi: 00000001   edi: 00000000   ebp: 00000000   esp: dff7ff94
ds: 007b   es: 007b   ss: 0068
Process swapper (pid: 1, threadinfo=dff7e000 task=dff7c040)
Stack: c01070f0 00000003 c03cfe20 c03ad080 c03a8328 00000001 00000020 00000000
       c01392f9 00000004 00000004 c036ccda c036cd26 c03a8328 c03607eb dff7e000
       00000000 c01050c2 00000020 c0105060 00000000 00000000 00000000 c01070f5
Call Trace:
 [<c01070f0>] kernel_thread_helper+0x0/0x10
 [<c01392f9>] init_committed_space+0x9/0x11
 [<c036ccda>] swap_setup+0x1a/0x30
 [<c036cd26>] kswapd_init+0x6/0x40
 [<c03607eb>] do_initcalls+0x3b/0x90
 [<c01050c2>] init+0x62/0x1d0
 [<c0105060>] init+0x0/0x1d0
 [<c01070f5>] kernel_thread_helper+0x5/0x10

Code: 66 83 39 00 0f 84 08 01 00 00 c7 44 24 04 fc ff ff ff 31 f6
 <0>Kernel panic: Attempted to kill init!


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH] kmalloc_percpu
  2003-05-08  7:42     ` Ravikiran G Thirumalai
@ 2003-05-08  7:47       ` Rusty Russell
  0 siblings, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2003-05-08  7:47 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: akpm, Dipankar Sarma, linux-kernel

In message <20030508074237.GA2760@in.ibm.com> you write:
> But still, even after actually calling init_committed_space
> with the newer patch (The first one which you mailed to Andrew in this
> thread) I get an oops at __alloc_percpu.
> 
> Here's the oops report...looks like the 
> struct pcpu_block *pcpu has NULL...

Damn initialization order.

Change:
	__initcall(init_alloc_percpu);
to
	core_initcall(init_alloc_percpu);
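
core_initcall() runs at an earlier level than plain __initcall()
(which is device level), so the allocator is initialized before the
kswapd_init() initcall reaches init_committed_space().  Roughly:

/* initcall levels, earliest first (include/linux/init.h) */
core_initcall(init_alloc_percpu);	/* level 1 */
/* ... postcore, arch, subsys, fs ... */
__initcall(kswapd_init);		/* == device_initcall, level 6 */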

Anyway, patch has been scrapped, at least for now.
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [patch] kmalloc_percpu
  2002-11-01  8:33 ` Rusty Russell
@ 2002-11-05 16:00   ` Dipankar Sarma
  0 siblings, 0 replies; 57+ messages in thread
From: Dipankar Sarma @ 2002-11-05 16:00 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Ravikiran G Thirumalai, akpm, linux-kernel

On Fri, Nov 01, 2002 at 07:33:54PM +1100, Rusty Russell wrote:
> On Thu, 31 Oct 2002 21:36:40 +0530
> Ravikiran G Thirumalai <kiran@in.ibm.com> wrote:
> 
> > Here's  kmalloc_percpu interfaces ported (...mostly rediffed) to 
> > 2.5.45.  (One last try for 2.6).
> 
> IMHO this is a "when we need it" patch.  Baby steps...

Agreed. It could be useful if we want to use per-CPU statistics
for things like disk I/O (now in struct gendisk).

> 
> > +#define this_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
> 
> Probably want a get_cpu_ptr() & put_cpu_ptr() using get_cpu() and put_cpu()
> (and this would become __get_cpu_ptr()).
> 
> And probably move them all to linux/percpu.h.

Yes, it needs to sync up with its static twin ;-)

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [patch] kmalloc_percpu
  2002-10-31 16:06 [patch] kmalloc_percpu Ravikiran G Thirumalai
@ 2002-11-01  8:33 ` Rusty Russell
  2002-11-05 16:00   ` Dipankar Sarma
  0 siblings, 1 reply; 57+ messages in thread
From: Rusty Russell @ 2002-11-01  8:33 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: akpm, dipankar, linux-kernel

On Thu, 31 Oct 2002 21:36:40 +0530
Ravikiran G Thirumalai <kiran@in.ibm.com> wrote:

> Here's  kmalloc_percpu interfaces ported (...mostly rediffed) to 
> 2.5.45.  (One last try for 2.6).

IMHO this is a "when we need it" patch.  Baby steps...

> +#define this_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())

Probably want a get_cpu_ptr() & put_cpu_ptr() using get_cpu() and put_cpu()
(and this would become __get_cpu_ptr()).

And probably move them all to linux/percpu.h.
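
Something like (untested):

/* pin the cpu (disable preemption) around the pointer use */
#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
#define put_cpu_ptr(ptr) put_cpu()
#define __get_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())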

> +extern void *kmalloc_percpu(size_t size, int flags);
> +extern void kfree_percpu(const void *);
> +extern int percpu_interlaced_alloc(struct percpu_data *, size_t, int);
> +extern void percpu_interlaced_free(struct percpu_data *);
> +extern void percpu_data_sys_init(void);

Hmm... It'd be nice to remove these three from the header so that the
interface is clear (which may mean exposing some slab.c internals so
you can access them in mm/percpu_data.c).
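
i.e. the header would ideally show just (sketch):

extern void *kmalloc_percpu(size_t size, int flags);
extern void kfree_percpu(const void *);
/* per_cpu_ptr()/this_cpu_ptr() as above; the interlaced helpers
   stay private to mm/percpu_data.c */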

> +	if(!blklist->cachep)

A space between the "if" and the "(" is traditional.

Cheers,
Rusty.
-- 
   there are those who do and those who hang on and you don't see too
   many doers quoting their contemporaries.  -- Larry McVoy

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [patch] kmalloc_percpu
@ 2002-10-31 16:06 Ravikiran G Thirumalai
  2002-11-01  8:33 ` Rusty Russell
  0 siblings, 1 reply; 57+ messages in thread
From: Ravikiran G Thirumalai @ 2002-10-31 16:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel

Here's  kmalloc_percpu interfaces ported (...mostly rediffed) to 
2.5.45.  (One last try for 2.6).  I guess we can use  
DECLARE_PER_CPU interfaces in most places; but for per_cpu stuff
which could possibly be defined in modules, kmalloc_percpu might be useful
(SNMP stats, for example... ipv6 can be compiled as a module).

-Kiran

Name: kmalloc_percpu
Description: Dynamic percpu allocator
Author: Dipankar Sarma & Ravikiran Thirumalai

diff -X dontdiff -ruN linux-2.5.45/include/linux/slab.h kmalloc_percpu-2.5.45/include/linux/slab.h
--- linux-2.5.45/include/linux/slab.h	Thu Oct 31 06:13:45 2002
+++ kmalloc_percpu-2.5.45/include/linux/slab.h	Thu Oct 31 18:26:24 2002
@@ -13,6 +13,7 @@
 
 #include	<linux/gfp.h>
 #include	<linux/types.h>
+#include 	<linux/smp.h>
 
 /* flags for kmem_cache_alloc() */
 #define	SLAB_NOFS		GFP_NOFS
@@ -63,6 +64,42 @@
 
 extern int FASTCALL(kmem_cache_reap(int));
 
+#ifdef CONFIG_SMP
+
+struct percpu_data {
+	void *ptrs[NR_CPUS];
+	void *blkp;
+};
+
+#define per_cpu_ptr(ptr, cpu)                   \
+({                                              \
+        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
+        (__typeof__(ptr))__p->ptrs[(cpu)];	\
+})
+#define this_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id())
+extern void *kmalloc_percpu(size_t size, int flags);
+extern void kfree_percpu(const void *);
+extern int percpu_interlaced_alloc(struct percpu_data *, size_t, int);
+extern void percpu_interlaced_free(struct percpu_data *);
+extern void percpu_data_sys_init(void);
+
+#else
+
+#define per_cpu_ptr(ptr, cpu) (ptr)
+#define this_cpu_ptr(ptr) (ptr)
+static inline void *kmalloc_percpu(size_t size, int flags)
+{
+	return(kmalloc(size, flags));
+}
+static inline void kfree_percpu(const void *ptr)
+{	
+	kfree(ptr);
+}
+static inline void percpu_data_sys_init(void) { }
+
+#endif
+
+
 /* System wide caches */
 extern kmem_cache_t	*vm_area_cachep;
 extern kmem_cache_t	*mm_cachep;
diff -X dontdiff -ruN linux-2.5.45/init/main.c kmalloc_percpu-2.5.45/init/main.c
--- linux-2.5.45/init/main.c	Thu Oct 31 06:11:58 2002
+++ kmalloc_percpu-2.5.45/init/main.c	Thu Oct 31 17:48:09 2002
@@ -423,6 +423,7 @@
 	page_address_init();
 	mem_init();
 	kmem_cache_sizes_init();
+	percpu_data_sys_init();
 	pidhash_init();
 	pgtable_cache_init();
 	pte_chain_init();
diff -X dontdiff -ruN linux-2.5.45/kernel/ksyms.c kmalloc_percpu-2.5.45/kernel/ksyms.c
--- linux-2.5.45/kernel/ksyms.c	Thu Oct 31 06:11:34 2002
+++ kmalloc_percpu-2.5.45/kernel/ksyms.c	Thu Oct 31 17:48:09 2002
@@ -104,6 +104,10 @@
 EXPORT_SYMBOL(remove_shrinker);
 EXPORT_SYMBOL(kmalloc);
 EXPORT_SYMBOL(kfree);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(kfree_percpu);
+#endif
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(__vmalloc);
 EXPORT_SYMBOL(vmalloc);
diff -X dontdiff -ruN linux-2.5.45/mm/Makefile kmalloc_percpu-2.5.45/mm/Makefile
--- linux-2.5.45/mm/Makefile	Thu Oct 31 06:13:03 2002
+++ kmalloc_percpu-2.5.45/mm/Makefile	Thu Oct 31 17:52:55 2002
@@ -2,7 +2,8 @@
 # Makefile for the linux memory manager.
 #
 
-export-objs := shmem.o filemap.o mempool.o page_alloc.o page-writeback.o
+export-objs := shmem.o filemap.o mempool.o page_alloc.o page-writeback.o \
+	       percpu_data.o
 
 obj-y	 := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
@@ -11,4 +12,5 @@
 	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o \
 	    truncate.o
 
+obj-$(CONFIG_SMP) += percpu_data.o
 include $(TOPDIR)/Rules.make
diff -X dontdiff -ruN linux-2.5.45/mm/percpu_data.c kmalloc_percpu-2.5.45/mm/percpu_data.c
--- linux-2.5.45/mm/percpu_data.c	Thu Jan  1 05:30:00 1970
+++ kmalloc_percpu-2.5.45/mm/percpu_data.c	Thu Oct 31 17:48:09 2002
@@ -0,0 +1,376 @@
+/*
+ * Dynamic Per-CPU Data Allocator.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) IBM Corporation, 2002
+ *
+ * Author:              Dipankar Sarma <dipankar@in.ibm.com>
+ * 			Ravikiran G. Thirumalai <kiran@in.ibm.com>
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+
+struct percpu_data_blklist {
+	struct list_head        blks;
+	struct list_head        *firstnotfull;
+	spinlock_t              lock;
+	size_t			objsize;
+	size_t			blksize;
+	kmem_cache_t		*cachep;
+	char			*cachename;
+};
+
+struct percpu_data_blk {
+	struct list_head        linkage; 
+	void             	*blkaddr[NR_CPUS]; 
+	unsigned int            usecount; 
+	int			*freearr;  
+	int			freehead;
+	struct percpu_data_blklist *blklist;
+};
+
+static struct percpu_data_blklist data_blklist[] = {
+	{
+		.blks =	LIST_HEAD_INIT(data_blklist[0].blks),
+		.firstnotfull =	&data_blklist[0].blks,
+		.lock =	SPIN_LOCK_UNLOCKED,
+		.objsize = 4,
+		.blksize = ((4 + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1)),
+		.cachep = NULL,
+		.cachename = "percpu_data_4"
+	},
+	{
+		.blks =	LIST_HEAD_INIT(data_blklist[1].blks),
+		.firstnotfull =	&data_blklist[1].blks,
+		.lock =	SPIN_LOCK_UNLOCKED,
+		.objsize = 8,
+		.blksize = ((8 + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1)),
+		.cachep = NULL,
+		.cachename = "percpu_data_8"
+	},
+	{
+		.blks =	LIST_HEAD_INIT(data_blklist[2].blks),
+		.firstnotfull =	&data_blklist[2].blks,
+		.lock =	SPIN_LOCK_UNLOCKED,
+		.objsize = 16,
+		.blksize = ((16 + SMP_CACHE_BYTES - 1) &
+					~(SMP_CACHE_BYTES - 1)),
+		.cachep = NULL,
+		.cachename = "percpu_data_16"
+	},
+#if PAGE_SIZE != 4096
+	{
+		.blks =	LIST_HEAD_INIT(data_blklist[3].blks),
+		.firstnotfull =	&data_blklist[3].blks,
+		.lock =	SPIN_LOCK_UNLOCKED,
+		.objsize = 32,
+		.blksize = ((32 + SMP_CACHE_BYTES - 1) &
+					~(SMP_CACHE_BYTES - 1)),
+		.cachep = NULL,
+		.cachename = "percpu_data_32"
+	}
+#endif
+};
+
+static int data_blklist_count = 
+	sizeof(data_blklist)/sizeof(struct percpu_data_blklist);
+
+/*
+ * Allocate a block descriptor structure and initialize it.  
+ * Returns the address of the block descriptor or NULL on failure.
+ */
+static struct percpu_data_blk *percpu_data_blk_alloc(
+		struct percpu_data_blklist *blklist, int flags)
+{
+        struct percpu_data_blk *blkp;
+        int i;
+	int count;
+
+        if (!(blkp = kmalloc(sizeof(struct percpu_data_blk), flags)))
+                goto out1;
+	INIT_LIST_HEAD(&blkp->linkage);
+        blkp->usecount = 0;
+	count = blklist->blksize / blklist->objsize;	
+	blkp->freearr = kmalloc(count * sizeof(int), flags); 
+	if (!blkp->freearr)
+		goto out;
+        blkp->freehead = 0;
+        for (i = 0; i < count; i++)
+                blkp->freearr[i] = i+1;
+        blkp->freearr[i-1] = -1;	/* Marks the end of the array */
+	blkp->blklist = blklist;
+        return blkp;
+out:
+	kfree(blkp);
+out1:
+	return NULL;
+}
+
+/*
+ * Frees the block descriptor structure
+ */
+static void percpu_data_blk_free(struct percpu_data_blk *blkp)
+{
+	kfree(blkp);
+}
+
+/*
+ * Add a block to the percpu data object memory pool.
+ * Returns 0 on failure and 1 on success
+ */
+static int percpu_data_mem_grow(struct percpu_data_blklist *blklist, int flags)
+{
+        struct percpu_data_blk *blkp;
+        unsigned long save_flags;
+        int i;
+ 
+        if (!(blkp = percpu_data_blk_alloc(blklist, flags)))
+                goto out;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		blkp->blkaddr[i] = kmem_cache_alloc(blklist->cachep, flags);
+		if(!(blkp->blkaddr[i])) 
+			goto out1;
+		memset(blkp->blkaddr[i], 0, blklist->blksize);
+        }
+ 
+        /* 
+         * Now that we have the block successfully allocated 
+         * and instantiated..  add it.....
+         */
+        spin_lock_irqsave(&blklist->lock, save_flags);
+        list_add_tail(&blkp->linkage, &blklist->blks);
+        if (blklist->firstnotfull == &blklist->blks)
+                blklist->firstnotfull = &blkp->linkage;
+        spin_unlock_irqrestore(&blklist->lock, save_flags);
+        return 1;
+ 
+out1:
+	i--;
+	for (; i >= 0; i--) {
+		if (!cpu_possible(i))
+			continue;
+		kmem_cache_free(blklist->cachep, blkp->blkaddr[i]);
+	}
+	percpu_data_blk_free(blkp);
+out:
+        return 0;
+}
+
+/*
+ * Initialise the main percpu data control structure.
+ */
+static void percpu_data_blklist_init(struct percpu_data_blklist *blklist)
+{
+	blklist->cachep = kmem_cache_create(blklist->cachename,
+			blklist->blksize, 0, SLAB_HWCACHE_ALIGN, 
+			NULL, NULL);
+	if(!blklist->cachep)
+		BUG();
+}
+
+
+static struct percpu_data_blklist *percpu_data_get_blklist(size_t size,
+					int flags)
+{
+	int i;
+	for (i = 0; i < data_blklist_count; i++) {
+		if (size > data_blklist[i].objsize)
+			continue;
+		return &data_blklist[i];
+	}
+	return NULL;
+}
+
+	
+/*
+ * Initialize the percpu_data subsystem.
+ */
+void __init percpu_data_sys_init(void)
+{
+	int i;
+	for (i = 0; i < data_blklist_count; i++) {
+		percpu_data_blklist_init(&data_blklist[i]);
+	}
+}
+
+/*
+ * Allocs an object from the block.  Returns back the object index.
+ */
+static int __percpu_interlaced_alloc_one(struct percpu_data_blklist *blklist, 
+				struct percpu_data_blk *blkp)
+{
+        unsigned int objidx;
+ 
+        objidx = blkp->freehead;
+        blkp->freehead = blkp->freearr[objidx];
+        blkp->usecount++;
+        if(blkp->freehead < 0) {
+                blklist->firstnotfull = blkp->linkage.next;
+        }
+        return objidx;
+}
+
+/*
+ * Allocate a per cpu data object and return a pointer to it.
+ */
+static int __percpu_interlaced_alloc(struct percpu_data *percpu,
+		struct percpu_data_blklist *blklist, int flags)
+{
+        struct percpu_data_blk *blkp;
+        unsigned long save_flags;
+        struct list_head *l;
+	int objidx;
+	int i;
+
+tryagain:
+ 
+        spin_lock_irqsave(&blklist->lock, save_flags);
+        l = blklist->firstnotfull;
+        if (l == &blklist->blks)
+                goto unlock_and_get_mem;
+        blkp = list_entry(l, struct percpu_data_blk, linkage);
+ 
+        objidx = __percpu_interlaced_alloc_one(blklist, blkp);
+        spin_unlock_irqrestore(&blklist->lock, save_flags);
+        /* 
+         * Since we hold the lock and firstnotfull is not the
+         * head list, we should be getting an object alloc here. firstnotfull 
+	 * can be pointing to head of the list when all the blks are 
+	 * full or when there're no blocks left 
+         */
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		percpu->ptrs[i] = blkp->blkaddr[i] + objidx * blklist->objsize;
+	}
+	percpu->blkp = (void *)blkp;
+        return 1;
+ 
+unlock_and_get_mem:
+ 
+        spin_unlock_irqrestore(&blklist->lock, save_flags);
+        if(percpu_data_mem_grow(blklist, flags))
+                goto tryagain;  /* added another block..try allocing obj ..*/
+        
+        return 0;
+}
+
+/*
+ * Allocate a per-cpu data object and return a pointer to it.
+ * Returns NULL on failure. 
+ */
+int percpu_interlaced_alloc(struct percpu_data *pdata, size_t size, int flags)
+{
+	struct percpu_data_blklist *blklist;
+	
+	blklist = percpu_data_get_blklist(size, flags);
+	if (blklist == NULL)
+		return 0;
+	return __percpu_interlaced_alloc(pdata, blklist, flags);
+}
+
+/*
+ * Frees memory associated with a percpu data block
+ */
+static void percpu_data_blk_destroy(struct percpu_data_blklist *blklist, 
+			struct percpu_data_blk *blkp)
+{
+	int i;
+	
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		kmem_cache_free(blklist->cachep, blkp->blkaddr[i]);
+	}
+	percpu_data_blk_free(blkp);
+}
+
+/*
+ * Frees an object from a block and fixes the freelist accdly.
+ * Frees the slab cache memory if a block gets empty during free.
+ */
+static void __percpu_interlaced_free(struct percpu_data_blklist *blklist,
+		struct percpu_data *percpu)
+{
+        struct percpu_data_blk *blkp;
+        int objidx;
+        int objoffset;
+        struct list_head *t;
+        unsigned long save_flags;
+	
+        spin_lock_irqsave(&blklist->lock, save_flags);
+        blkp = (struct percpu_data_blk *)percpu->blkp;
+        objoffset = percpu->ptrs[0] - blkp->blkaddr[0];
+        objidx = objoffset / blklist->objsize;
+
+	kfree(percpu);
+
+        blkp->freearr[objidx] = blkp->freehead;
+        blkp->freehead = objidx;
+        blkp->usecount--;
+ 
+        if (blkp->freearr[objidx] < 0)  {
+                /* 
+                 * block was previously full and is now just partially full ..
+                 * so make firstnotfull pt to this block and fix list accdly 
+                 */
+                t = blklist->firstnotfull;
+                blklist->firstnotfull = &blkp->linkage;
+                if (blkp->linkage.next == t) {
+			spin_unlock_irqrestore(&blklist->lock, save_flags);
+                        return;
+		}
+                list_del(&blkp->linkage);
+                list_add_tail(&blkp->linkage, t);
+        	
+		spin_unlock_irqrestore(&blklist->lock, save_flags);
+                return;
+        }
+ 
+        if (blkp->usecount == 0) {
+                t = blklist->firstnotfull->prev;
+ 
+                list_del(&blkp->linkage);
+                if (blklist->firstnotfull == &blkp->linkage)
+                        blklist->firstnotfull = t->next;
+        	
+		spin_unlock_irqrestore(&blklist->lock, save_flags);
+		percpu_data_blk_destroy(blklist, blkp);
+                return;
+        }
+ 
+        spin_unlock_irqrestore(&blklist->lock, save_flags);
+        return;
+}
+
+/*
+ * Frees up a percpu data object
+ */ 
+void percpu_interlaced_free(struct percpu_data *percpu)
+{
+	struct percpu_data_blk *blkp = percpu->blkp;
+	__percpu_interlaced_free(blkp->blklist, percpu);
+}
diff -X dontdiff -ruN linux-2.5.45/mm/slab.c kmalloc_percpu-2.5.45/mm/slab.c
--- linux-2.5.45/mm/slab.c	Thu Oct 31 06:13:36 2002
+++ kmalloc_percpu-2.5.45/mm/slab.c	Thu Oct 31 18:44:11 2002
@@ -1824,6 +1824,58 @@
 	return NULL;
 }
 
+#ifdef CONFIG_SMP
+/**
+ * kmalloc_percpu - allocate one copy of the object for every present
+ * cpu in the system.
+ *
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ * The @flags argument may be one of:
+ *
+ * %GFP_USER - Allocate memory on behalf of user.  May sleep.
+ *
+ * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
+ *
+ * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
+ */
+void *kmalloc_percpu(size_t size, int flags)
+{
+        int i;
+        struct percpu_data *pdata = kmalloc(sizeof(*pdata), flags);
+ 
+        if (!pdata) 
+		goto out_done;
+	pdata->blkp = NULL;
+	if (size <= (malloc_sizes[0].cs_size << 1)) {
+		if (!percpu_interlaced_alloc(pdata, size, flags))
+			goto out;
+	} else {
+		for (i = 0; i < NR_CPUS; i++) {
+			if (!cpu_possible(i))
+				continue;
+			pdata->ptrs[i] = kmalloc(size, flags);
+			if (!pdata->ptrs[i])
+				goto unwind_oom;
+		}
+	}
+        /* Catch derefs w/o wrappers */
+        return (void *)(~(unsigned long)pdata);
+ 
+unwind_oom:
+        while (--i >= 0) {
+		if (!cpu_possible(i))
+			continue;
+                kfree(pdata->ptrs[i]);
+	}
+out:
+        kfree(pdata);
+out_done:
+	return NULL;
+}
+#endif
+
+
 /**
  * kmem_cache_free - Deallocate an object
  * @cachep: The cache the allocation was from.
@@ -1871,6 +1923,32 @@
 	return cachep->objsize;
 }
 
+#ifdef CONFIG_SMP
+/**
+ * kfree_percpu - free previously allocated percpu memory
+ * @objp: pointer returned by kmalloc_percpu.
+ *
+ * Don't free memory not originally allocated by kmalloc_percpu()
+ * The complemented objp is to check for that.
+ */
+void kfree_percpu(const void *objp)
+{
+	int i;
+        struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
+
+	if (p->blkp) {
+		percpu_interlaced_free(p);
+	} else {
+		for (i = 0; i < NR_CPUS; i++) {
+			if (!cpu_possible(i))
+				continue;
+			kfree(p->ptrs[i]);
+		}
+	}
+}
+#endif
+
+
 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
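
For reference, typical usage of the above looks like this (a sketch,
not part of the patch):

static long *counters;		/* one long per possible cpu */

int counters_init(void)
{
	counters = kmalloc_percpu(sizeof(long), GFP_KERNEL);
	return counters ? 0 : -ENOMEM;
}

void counters_bump(void)
{
	int cpu = get_cpu();	/* disables preemption */
	(*per_cpu_ptr(counters, cpu))++;
	put_cpu();
}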

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2003-05-08  7:36 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-05  8:08 [PATCH] kmalloc_percpu Rusty Russell
2003-05-05  8:47 ` Andrew Morton
2003-05-06  0:47   ` Rusty Russell
2003-05-06  1:52     ` Andrew Morton
2003-05-06  2:11       ` David S. Miller
2003-05-06  4:08         ` Rusty Russell
2003-05-06  3:40           ` David S. Miller
2003-05-06  5:02             ` Andrew Morton
2003-05-06  4:16               ` David S. Miller
2003-05-06  5:48                 ` Andrew Morton
2003-05-06  5:35                   ` David S. Miller
2003-05-06  6:55                     ` Andrew Morton
2003-05-06  5:57                       ` David S. Miller
2003-05-06  7:22                         ` Andrew Morton
2003-05-06  6:15                           ` David S. Miller
2003-05-06  7:34                             ` Andrew Morton
2003-05-06  8:42                               ` William Lee Irwin III
2003-05-06 14:38                               ` Martin J. Bligh
2003-05-06  7:20                       ` Dipankar Sarma
2003-05-06  8:28                       ` Rusty Russell
2003-05-06  8:47                         ` Andrew Morton
2003-05-07  1:57                           ` Rusty Russell
2003-05-07  2:41                             ` William Lee Irwin III
2003-05-07  4:03                               ` Paul Mackerras
2003-05-07  4:22                                 ` William Lee Irwin III
2003-05-07  4:56                                   ` Paul Mackerras
2003-05-07  5:19                                     ` William Lee Irwin III
2003-05-07  4:10                                       ` Martin J. Bligh
2003-05-07 12:13                                         ` William Lee Irwin III
2003-05-07  4:15                               ` Rusty Russell
2003-05-07  5:37                             ` Andrew Morton
2003-05-08  0:53                               ` Rusty Russell
2003-05-06 14:41                         ` Martin J. Bligh
2003-05-06  6:42                   ` Andrew Morton
2003-05-06  5:39                     ` David S. Miller
2003-05-06  6:57                       ` Andrew Morton
2003-05-06  7:25                         ` Jens Axboe
2003-05-06 10:41                           ` Ingo Oeser
2003-05-06 16:05                         ` Bryan O'Sullivan
2003-05-06  8:06               ` Rusty Russell
2003-05-06  5:03             ` Dipankar Sarma
2003-05-06  4:28           ` Andrew Morton
2003-05-06  3:37             ` David S. Miller
2003-05-06  4:11       ` Rusty Russell
2003-05-06  5:07   ` Ravikiran G Thirumalai
2003-05-06  8:03     ` Rusty Russell
2003-05-06  9:23       ` David S. Miller
2003-05-06  9:34       ` Ravikiran G Thirumalai
2003-05-06  9:38         ` Dipankar Sarma
2003-05-07  2:14         ` Rusty Russell
2003-05-07  5:51 ` Ravikiran G Thirumalai
2003-05-07  6:16   ` Rusty Russell
2003-05-08  7:42     ` Ravikiran G Thirumalai
2003-05-08  7:47       ` Rusty Russell
  -- strict thread matches above, loose matches on Subject: below --
2002-10-31 16:06 [patch] kmalloc_percpu Ravikiran G Thirumalai
2002-11-01  8:33 ` Rusty Russell
2002-11-05 16:00   ` Dipankar Sarma

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).