linux-kernel.vger.kernel.org archive mirror
* [patch 1/2] mm: Reimplementation of alloc_percpu - move qsort from xfs/support to linux/lib
@ 2005-01-17 18:30 Ravikiran G Thirumalai
  2005-01-17 18:36 ` [patch 2/2] mm: Reimplementation of alloc_percpu Ravikiran G Thirumalai
  0 siblings, 1 reply; 5+ messages in thread
From: Ravikiran G Thirumalai @ 2005-01-17 18:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, nathans

Here's a patch to move qsort.c from fs/xfs/support/ to linux/lib so that the
alloc_percpu reimplementation can use it.
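
For reference, a caller would use the routine roughly like this (illustrative
sketch only; the comparator and array below are made up, the prototype is the
one from include/linux/qsort.h in this patch):

	#include <linux/kernel.h>
	#include <linux/qsort.h>

	/* made-up comparator: sort an array of ints in ascending order */
	static int cmp_int(const void *a, const void *b)
	{
		const int x = *(const int *)a, y = *(const int *)b;
		return (x > y) - (x < y);
	}

	int vals[] = { 42, 7, 19 };
	qsort(vals, ARRAY_SIZE(vals), sizeof(vals[0]), cmp_int);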

Thanks,
Kiran


Patch to move the qsort routine from the XFS filesystem to linux/lib so that
other subsystems can make use of it.

Signed-off-by: Ravikiran Thirumalai <kiran@in.ibm.com>
---

 fs/xfs/Makefile              |    1 
 fs/xfs/linux-2.6/xfs_linux.h |    2 
 fs/xfs/support/qsort.c       |  155 -------------------------------------------
 fs/xfs/support/qsort.h       |   41 -----------
 include/linux/qsort.h        |   41 +++++++++++
 lib/Makefile                 |    3 
 lib/qsort.c                  |  155 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 199 insertions(+), 199 deletions(-)

diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/fs/xfs/linux-2.6/xfs_linux.h alloc_percpu-2.6.11-rc1mm1/fs/xfs/linux-2.6/xfs_linux.h
--- linux-2.6.11-rc1mm1/fs/xfs/linux-2.6/xfs_linux.h	2004-12-25 03:05:50.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/fs/xfs/linux-2.6/xfs_linux.h	2005-01-15 21:02:17.000000000 +0530
@@ -64,7 +64,6 @@
 #include <sema.h>
 #include <time.h>
 
-#include <support/qsort.h>
 #include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/move.h>
@@ -88,6 +87,7 @@
 #include <linux/list.h>
 #include <linux/proc_fs.h>
 #include <linux/version.h>
+#include <linux/qsort.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/fs/xfs/Makefile alloc_percpu-2.6.11-rc1mm1/fs/xfs/Makefile
--- linux-2.6.11-rc1mm1/fs/xfs/Makefile	2005-01-15 21:26:56.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/fs/xfs/Makefile	2005-01-15 21:03:40.000000000 +0530
@@ -143,7 +143,6 @@
 xfs-y				+= $(addprefix support/, \
 				   debug.o \
 				   move.o \
-				   qsort.o \
 				   uuid.o)
 
 xfs-$(CONFIG_XFS_TRACE)		+= support/ktrace.o
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/fs/xfs/support/qsort.c alloc_percpu-2.6.11-rc1mm1/fs/xfs/support/qsort.c
--- linux-2.6.11-rc1mm1/fs/xfs/support/qsort.c	2004-12-25 03:05:28.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/fs/xfs/support/qsort.c	1970-01-01 05:30:00.000000000 +0530
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 1992, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-/*
- * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
- */
-#define swapcode(TYPE, parmi, parmj, n) { 		\
-	long i = (n) / sizeof (TYPE); 			\
-	register TYPE *pi = (TYPE *) (parmi); 		\
-	register TYPE *pj = (TYPE *) (parmj); 		\
-	do { 						\
-		register TYPE	t = *pi;		\
-		*pi++ = *pj;				\
-		*pj++ = t;				\
-        } while (--i > 0);				\
-}
-
-#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
-	es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
-
-static __inline void
-swapfunc(char *a, char *b, int n, int swaptype)
-{
-	if (swaptype <= 1) 
-		swapcode(long, a, b, n)
-	else
-		swapcode(char, a, b, n)
-}
-
-#define swap(a, b)					\
-	if (swaptype == 0) {				\
-		long t = *(long *)(a);			\
-		*(long *)(a) = *(long *)(b);		\
-		*(long *)(b) = t;			\
-	} else						\
-		swapfunc(a, b, es, swaptype)
-
-#define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
-
-static __inline char *
-med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *))
-{
-	return cmp(a, b) < 0 ?
-	       (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
-              :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
-}
-
-void
-qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
-{
-	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
-	int d, r, swaptype, swap_cnt;
-	register char *a = aa;
-
-loop:	SWAPINIT(a, es);
-	swap_cnt = 0;
-	if (n < 7) {
-		for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es)
-			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
-			     pl -= es)
-				swap(pl, pl - es);
-		return;
-	}
-	pm = (char *)a + (n / 2) * es;
-	if (n > 7) {
-		pl = (char *)a;
-		pn = (char *)a + (n - 1) * es;
-		if (n > 40) {
-			d = (n / 8) * es;
-			pl = med3(pl, pl + d, pl + 2 * d, cmp);
-			pm = med3(pm - d, pm, pm + d, cmp);
-			pn = med3(pn - 2 * d, pn - d, pn, cmp);
-		}
-		pm = med3(pl, pm, pn, cmp);
-	}
-	swap(a, pm);
-	pa = pb = (char *)a + es;
-
-	pc = pd = (char *)a + (n - 1) * es;
-	for (;;) {
-		while (pb <= pc && (r = cmp(pb, a)) <= 0) {
-			if (r == 0) {
-				swap_cnt = 1;
-				swap(pa, pb);
-				pa += es;
-			}
-			pb += es;
-		}
-		while (pb <= pc && (r = cmp(pc, a)) >= 0) {
-			if (r == 0) {
-				swap_cnt = 1;
-				swap(pc, pd);
-				pd -= es;
-			}
-			pc -= es;
-		}
-		if (pb > pc)
-			break;
-		swap(pb, pc);
-		swap_cnt = 1;
-		pb += es;
-		pc -= es;
-	}
-	if (swap_cnt == 0) {  /* Switch to insertion sort */
-		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
-			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; 
-			     pl -= es)
-				swap(pl, pl - es);
-		return;
-	}
-
-	pn = (char *)a + n * es;
-	r = min(pa - (char *)a, pb - pa);
-	vecswap(a, pb - r, r);
-	r = min((long)(pd - pc), (long)(pn - pd - es));
-	vecswap(pb, pn - r, r);
-	if ((r = pb - pa) > es)
-		qsort(a, r / es, es, cmp);
-	if ((r = pd - pc) > es) { 
-		/* Iterate rather than recurse to save stack space */
-		a = pn - r;
-		n = r / es;
-		goto loop;
-	}
-/*		qsort(pn - r, r / es, es, cmp);*/
-}
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/fs/xfs/support/qsort.h alloc_percpu-2.6.11-rc1mm1/fs/xfs/support/qsort.h
--- linux-2.6.11-rc1mm1/fs/xfs/support/qsort.h	2004-12-25 03:05:23.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/fs/xfs/support/qsort.h	1970-01-01 05:30:00.000000000 +0530
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * Further, this software is distributed without any warranty that it is
- * free of the rightful claim of any third person regarding infringement
- * or the like.  Any license provided herein, whether implied or
- * otherwise, applies only to this software file.  Patent licenses, if
- * any, provided herein do not apply to combinations of this program with
- * other software, or any other product whatsoever.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
- * Mountain View, CA  94043, or:
- *
- * http://www.sgi.com
- *
- * For further information regarding this notice, see:
- *
- * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
- */
-
-#ifndef QSORT_H
-#define QSORT_H
-
-extern void qsort (void *const pbase,
-		    size_t total_elems,
-		    size_t size,
-		    int (*cmp)(const void *, const void *));
-
-#endif
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/include/linux/qsort.h alloc_percpu-2.6.11-rc1mm1/include/linux/qsort.h
--- linux-2.6.11-rc1mm1/include/linux/qsort.h	1970-01-01 05:30:00.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/include/linux/qsort.h	2004-12-25 03:05:23.000000000 +0530
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef QSORT_H
+#define QSORT_H
+
+extern void qsort (void *const pbase,
+		    size_t total_elems,
+		    size_t size,
+		    int (*cmp)(const void *, const void *));
+
+#endif
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/lib/Makefile alloc_percpu-2.6.11-rc1mm1/lib/Makefile
--- linux-2.6.11-rc1mm1/lib/Makefile	2005-01-15 21:26:39.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/lib/Makefile	2005-01-15 21:00:11.000000000 +0530
@@ -5,7 +5,8 @@
 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
 	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
 	 kobject.o kref.o idr.o div64.o parser.o int_sqrt.o \
-	 bitmap.o extable.o kobject_uevent.o prio_tree.o
+	 bitmap.o extable.o kobject_uevent.o prio_tree.o \
+	 qsort.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/lib/qsort.c alloc_percpu-2.6.11-rc1mm1/lib/qsort.c
--- linux-2.6.11-rc1mm1/lib/qsort.c	1970-01-01 05:30:00.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/lib/qsort.c	2004-12-25 03:05:28.000000000 +0530
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+#define swapcode(TYPE, parmi, parmj, n) { 		\
+	long i = (n) / sizeof (TYPE); 			\
+	register TYPE *pi = (TYPE *) (parmi); 		\
+	register TYPE *pj = (TYPE *) (parmj); 		\
+	do { 						\
+		register TYPE	t = *pi;		\
+		*pi++ = *pj;				\
+		*pj++ = t;				\
+        } while (--i > 0);				\
+}
+
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+	es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+static __inline void
+swapfunc(char *a, char *b, int n, int swaptype)
+{
+	if (swaptype <= 1) 
+		swapcode(long, a, b, n)
+	else
+		swapcode(char, a, b, n)
+}
+
+#define swap(a, b)					\
+	if (swaptype == 0) {				\
+		long t = *(long *)(a);			\
+		*(long *)(a) = *(long *)(b);		\
+		*(long *)(b) = t;			\
+	} else						\
+		swapfunc(a, b, es, swaptype)
+
+#define vecswap(a, b, n) 	if ((n) > 0) swapfunc(a, b, n, swaptype)
+
+static __inline char *
+med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *))
+{
+	return cmp(a, b) < 0 ?
+	       (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
+              :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
+}
+
+void
+qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
+{
+	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+	int d, r, swaptype, swap_cnt;
+	register char *a = aa;
+
+loop:	SWAPINIT(a, es);
+	swap_cnt = 0;
+	if (n < 7) {
+		for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es)
+			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
+			     pl -= es)
+				swap(pl, pl - es);
+		return;
+	}
+	pm = (char *)a + (n / 2) * es;
+	if (n > 7) {
+		pl = (char *)a;
+		pn = (char *)a + (n - 1) * es;
+		if (n > 40) {
+			d = (n / 8) * es;
+			pl = med3(pl, pl + d, pl + 2 * d, cmp);
+			pm = med3(pm - d, pm, pm + d, cmp);
+			pn = med3(pn - 2 * d, pn - d, pn, cmp);
+		}
+		pm = med3(pl, pm, pn, cmp);
+	}
+	swap(a, pm);
+	pa = pb = (char *)a + es;
+
+	pc = pd = (char *)a + (n - 1) * es;
+	for (;;) {
+		while (pb <= pc && (r = cmp(pb, a)) <= 0) {
+			if (r == 0) {
+				swap_cnt = 1;
+				swap(pa, pb);
+				pa += es;
+			}
+			pb += es;
+		}
+		while (pb <= pc && (r = cmp(pc, a)) >= 0) {
+			if (r == 0) {
+				swap_cnt = 1;
+				swap(pc, pd);
+				pd -= es;
+			}
+			pc -= es;
+		}
+		if (pb > pc)
+			break;
+		swap(pb, pc);
+		swap_cnt = 1;
+		pb += es;
+		pc -= es;
+	}
+	if (swap_cnt == 0) {  /* Switch to insertion sort */
+		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+			for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; 
+			     pl -= es)
+				swap(pl, pl - es);
+		return;
+	}
+
+	pn = (char *)a + n * es;
+	r = min(pa - (char *)a, pb - pa);
+	vecswap(a, pb - r, r);
+	r = min((long)(pd - pc), (long)(pn - pd - es));
+	vecswap(pb, pn - r, r);
+	if ((r = pb - pa) > es)
+		qsort(a, r / es, es, cmp);
+	if ((r = pd - pc) > es) { 
+		/* Iterate rather than recurse to save stack space */
+		a = pn - r;
+		n = r / es;
+		goto loop;
+	}
+/*		qsort(pn - r, r / es, es, cmp);*/
+}


* Re: [patch 2/2] mm: Reimplementation of alloc_percpu
  2005-01-17 18:30 [patch 1/2] mm: Reimplementation of alloc_percpu - move qsort from xfs/support to linux/lib Ravikiran G Thirumalai
@ 2005-01-17 18:36 ` Ravikiran G Thirumalai
  2005-01-18  1:30   ` Rusty Russell
  0 siblings, 1 reply; 5+ messages in thread
From: Ravikiran G Thirumalai @ 2005-01-17 18:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, manfred, rusty, dipankar

Here's the alloc_percpu reimplementation changed to
- Use qsort 
- Use GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO for BLOCK_MANAGEMENT_PAGES
  (GFP_HIGHZERO would have been ideal)
- Changed the currency size to sizeof (int) from sizeof (void *) for better
  utilization with small objects

The allocator can easily be modified to use the __per_cpu_offset[] table at a
later stage by:
1. Allocating ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE) for the
   static percpu areas and populating the __per_cpu_offset[] offset table
2. Making PCPU_BLKSIZE the same as the static per-cpu area size above
3. Serving dynamic percpu requests from modules etc. from blocks by
   returning ret -= __per_cpu_offset[0] for an object in a percpu block
   (a rough sketch of the arithmetic follows this list).  This way modules
   need not have a limit on static percpu areas.
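
To make point (3) concrete, here is a rough sketch of the arithmetic (purely
illustrative, not part of this patch; it assumes blocks are laid out exactly
like the static per-cpu area, and the helper names are made up):

	/* sketch: what __alloc_percpu() would return for an object whose
	 * cpu0 copy lives at cpu0_obj inside a percpu block */
	static void *percpu_cookie(void *cpu0_obj)
	{
		return cpu0_obj - __per_cpu_offset[0];
	}

	/* sketch: dereference then reuses the static per-cpu arithmetic,
	 * the same as for compile-time per-cpu variables */
	static void *percpu_deref(void *cookie, int cpu)
	{
		return cookie + __per_cpu_offset[cpu];
	}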

Thanks,
Kiran

---

The following patch re-implements the Linux dynamic percpu memory allocator
so that:
1. Percpu memory dereference is faster
	- One memory reference fewer than the existing simple alloc_percpu
	- As fast as dereferencing static percpu areas; in fact one memory
	  reference fewer.
2. Better memory usage
	- Doesn't need an NR_CPUS-sized pointer array for each allocation
	- Interleaves objects, making better use of memory/cachelines
	- Userspace tests show 98% utilization with random-sized allocations
	  after repeated random frees.  Utilization with small allocations
	  (counters) is expected to be better than with random-sized
	  allocations.
3. Provides truly node-local allocation
	- The percpu memory allocated by the existing alloc_percpu is node
	  local, but the NR_CPUS pointer array holding it is not.  This
	  problem doesn't exist with the new implementation.

Design:
We have "blocks" of memory akin to slabs.  Each block has
(percpu blocksize) * NR_CPUS + (block management data structures) of kernel
VA space allocated to it.  Node-local pages are allocated and mapped into
the corresponding cpu's portion of that VA space.  Pages are also allocated
for the block management data structures and mapped to their corresponding
VA; these reside at an offset of (percpu blocksize) * NR_CPUS from the
beginning of the block.
The allocator maintains a circular linked list of blocks, sorted in
descending order of block utilization.  On a request for an object, the
allocator tries to serve it from the most utilized block.
The allocator allocates memory in multiples of a fixed currency size for
a request, and returns the address of the percpu object corresponding to
cpu0.  The cpu-local variable for any given cpu can then be obtained by
simple arithmetic:
	obj_address + cpu_id * PCPU_BLKSIZE
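
In code, that is exactly what the new per_cpu_ptr() in this patch does; a
short made-up usage example, following the get_cpu()/put_cpu() rule noted in
the header (error handling omitted):

	/* as defined by include/linux/percpu.h in this patch (reformatted) */
	#define per_cpu_ptr(ptr, cpu) \
		((__typeof__(ptr))(RELOC_HIDE(ptr, PCPU_BLKSIZE * cpu)))

	/* hypothetical per-cpu counter */
	long *ctr = alloc_percpu(long);
	int cpu = get_cpu();
	(*per_cpu_ptr(ctr, cpu))++;
	put_cpu();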

Testing:
The block allocator has undergone some userspace stress testing with small
counter-sized objects as well as a large number of random-sized objects.

Changes:
- The allocator now uses qsort
- Block management pages are allocated with GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO
- The currency size is now sizeof (int) instead of sizeof (void *)

Signed-off-by: Ravikiran Thirumalai <kiran@in.ibm.com>
---

 include/linux/kernel.h |    2 
 include/linux/percpu.h |   18 -
 mm/Makefile            |    1 
 mm/percpu.c            |  686 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/slab.c              |   70 -----
 5 files changed, 696 insertions(+), 81 deletions(-)

diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/include/linux/kernel.h alloc_percpu-2.6.11-rc1mm1/include/linux/kernel.h
--- linux-2.6.11-rc1mm1/include/linux/kernel.h	2005-01-15 21:26:39.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/include/linux/kernel.h	2005-01-15 21:41:52.000000000 +0530
@@ -29,6 +29,8 @@
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+#define IS_ALIGNED(x,a) (!(((a) - 1) & (x)))
+#define IS_POWEROFTWO(x) (!(((x) - 1) & (x)))
 
 #define	KERN_EMERG	"<0>"	/* system is unusable			*/
 #define	KERN_ALERT	"<1>"	/* action must be taken immediately	*/
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/include/linux/percpu.h alloc_percpu-2.6.11-rc1mm1/include/linux/percpu.h
--- linux-2.6.11-rc1mm1/include/linux/percpu.h	2004-12-25 03:05:28.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/include/linux/percpu.h	2005-01-15 21:41:52.000000000 +0530
@@ -15,23 +15,19 @@
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
-#ifdef CONFIG_SMP
-
-struct percpu_data {
-	void *ptrs[NR_CPUS];
-	void *blkp;
-};
+/* This is the upper bound for an object using alloc_percpu */
+#define PCPU_BLKSIZE (PAGE_SIZE << 1)
+#define PCPU_CURR_SIZE        (sizeof (int))
 
+#ifdef CONFIG_SMP
 /* 
  * Use this to get to a cpu's version of the per-cpu object allocated using
  * alloc_percpu.  Non-atomic access to the current CPU's version should
  * probably be combined with get_cpu()/put_cpu().
  */ 
 #define per_cpu_ptr(ptr, cpu)                   \
-({                                              \
-        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	\
-})
+	((__typeof__(ptr))                      \
+	(RELOC_HIDE(ptr,  PCPU_BLKSIZE * cpu)))
 
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(const void *);
@@ -56,6 +52,6 @@
 
 /* Simple wrapper for the common case: zeros memory. */
 #define alloc_percpu(type) \
-	((type *)(__alloc_percpu(sizeof(type), __alignof__(type))))
+	((type *)(__alloc_percpu(ALIGN(sizeof (type), PCPU_CURR_SIZE),  __alignof__(type))))
 
 #endif /* __LINUX_PERCPU_H */
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/mm/Makefile alloc_percpu-2.6.11-rc1mm1/mm/Makefile
--- linux-2.6.11-rc1mm1/mm/Makefile	2005-01-15 21:26:39.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/mm/Makefile	2005-01-15 21:41:52.000000000 +0530
@@ -17,4 +17,5 @@
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
+obj-$(CONFIG_SMP)	+= percpu.o
 
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/mm/percpu.c alloc_percpu-2.6.11-rc1mm1/mm/percpu.c
--- linux-2.6.11-rc1mm1/mm/percpu.c	1970-01-01 05:30:00.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/mm/percpu.c	2005-01-15 22:05:30.000000000 +0530
@@ -0,0 +1,686 @@
+/*
+ * Dynamic percpu memory allocator.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2003
+ *
+ * Author: Ravikiran Thirumalai <kiran@in.ibm.com>
+ * 
+ * Originally by Dipankar Sarma and Ravikiran Thirumalai,
+ * This reimplements alloc_percpu to make it 
+ * 1. Independent of slab/kmalloc
+ * 2. Use node local memory
+ * 3. Use simple pointer arithmetic 
+ * 4. Minimise fragmentation.
+ *
+ * Allocator is slow -- expected to be called during module/subsytem
+ * init. alloc_percpu can block.
+ *
+ */
+
+#include <linux/percpu.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/qsort.h>
+#include <asm/semaphore.h>
+#include <asm/pgtable.h>
+#include <asm/hardirq.h>
+
+#define MAX_OBJSIZE	PCPU_BLKSIZE
+#define OBJS_PER_BLOCK	(PCPU_BLKSIZE/PCPU_CURR_SIZE)
+#define	BITMAP_ARR_SIZE (OBJS_PER_BLOCK/(sizeof (unsigned long) * 8))
+#define MAX_NR_BITS	(OBJS_PER_BLOCK)
+#define PCPUPAGES_PER_BLOCK ((PCPU_BLKSIZE >> PAGE_SHIFT) * NR_CPUS)
+
+/* Block descriptor */
+struct pcpu_block {
+	void *start_addr;
+	struct page *pages[PCPUPAGES_PER_BLOCK * 2];	/* Extra for block mgt */
+	struct list_head blklist;
+	unsigned long bitmap[BITMAP_ARR_SIZE];	/* Object Freelist */
+	int bufctl_fl[OBJS_PER_BLOCK];	/* bufctl_fl freelist */
+	int bufctl_fl_head;
+	unsigned int size_used;
+};
+
+#define BLK_SIZE_USED(listpos) (list_entry(listpos, 		 	      \
+					struct pcpu_block, blklist)->size_used)
+
+/* Block list maintanance */
+
+/* Ordered list of pcpu_blocks -- Full, partial first */
+static struct list_head blkhead = LIST_HEAD_INIT(blkhead);
+static struct list_head *firstnotfull = &blkhead;
+static DECLARE_MUTEX(blklist_lock);
+
+/* 
+ * Bufctl descriptor and bufctl list for all allocated objs...
+ * Having one list for all buffers in the allocater might not be very efficient
+ * but we are not expecting allocs and frees in fast path (only during module
+ * load and unload hopefully
+ */
+struct buf_ctl {
+	void *addr;
+	size_t size;
+	struct buf_ctl *next;
+};
+
+static struct buf_ctl *buf_head = NULL;
+
+#define BLOCK_MANAGEMENT_SIZE						\
+({									\
+	int extra = sizeof (struct buf_ctl)*OBJS_PER_BLOCK 		\
+				+ sizeof (struct pcpu_block); 		\
+	ALIGN(extra, PAGE_SIZE);					\
+})
+
+#define BLOCK_MANAGEMENT_PAGES (BLOCK_MANAGEMENT_SIZE >> PAGE_SHIFT)
+
+void init_pcpu_block(struct pcpu_block *blkp)
+{
+	int i;
+
+	/* Setup the freelist */
+	blkp->bufctl_fl_head = 0;
+	for (i = 0; i < OBJS_PER_BLOCK - 1; i++)
+		blkp->bufctl_fl[i] = i + 1;
+	blkp->bufctl_fl[i] = -1;	/* Sentinel to mark End of list */
+}
+
+/*
+ * Allocate PCPU_BLKSIZE * NR_CPUS + BLOCK_MANAGEMENT_SIZE  worth of 
+ * contiguous kva space, and PCPU_BLKSIZE amount of node local 
+ * memory (pages) for all cpus possible + BLOCK_MANAGEMENT_SIZE pages
+ */
+static void *valloc_percpu(void)
+{
+	int i, j = 0;
+	unsigned int nr_pages;
+	struct vm_struct *area, tmp;
+	struct page **tmppage;
+	struct page *pages[BLOCK_MANAGEMENT_PAGES];
+	unsigned int cpu_pages = PCPU_BLKSIZE >> PAGE_SHIFT;
+	struct pcpu_block *blkp = NULL;
+
+	BUG_ON(!IS_ALIGNED(PCPU_BLKSIZE, PAGE_SIZE));
+	BUG_ON(!PCPU_BLKSIZE);
+	nr_pages = PCPUPAGES_PER_BLOCK + BLOCK_MANAGEMENT_PAGES;
+
+	/* Alloc Managent block pages */
+	for (i = 0; i < BLOCK_MANAGEMENT_PAGES; i++) {
+		pages[i] = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO, 0);
+		if (!pages[i]) {
+			while (--i >= 0)
+				__free_pages(pages[i], 0);
+			return NULL;
+		}
+	}
+
+	/* Get the contiguous VA space for this block */
+	area = get_vm_area(nr_pages << PAGE_SHIFT, VM_MAP);
+	if (!area)
+		goto rollback_mgt;
+
+	/* Map pages for the block management pages */
+	tmppage = pages;
+	tmp.addr = area->addr + NR_CPUS * PCPU_BLKSIZE;
+	tmp.size = BLOCK_MANAGEMENT_SIZE + PAGE_SIZE;
+	if (map_vm_area(&tmp, PAGE_KERNEL, &tmppage))
+		goto rollback_vm_area;
+
+	/* Init the block descriptor */
+	blkp = area->addr + NR_CPUS * PCPU_BLKSIZE;
+	init_pcpu_block(blkp);
+	for (i = 0; i < BLOCK_MANAGEMENT_PAGES; i++)
+		blkp->pages[i + PCPUPAGES_PER_BLOCK] = pages[i];
+
+	/* Alloc node local pages for all cpus possible */
+	for_each_cpu(i) {
+		int start_idx = i * cpu_pages;
+		for (j = start_idx; j < start_idx + cpu_pages; j++) {
+			blkp->pages[j] = alloc_pages_node(cpu_to_node(i)
+							  ,
+							  GFP_KERNEL |
+							  __GFP_HIGHMEM,
+							  0);
+			if (unlikely(!blkp->pages[j]))
+				goto rollback_pages;
+		}
+	}
+
+	/* Map pages for each cpu by splitting vm_struct for each cpu */
+	for_each_cpu(i) {
+		tmppage = &blkp->pages[i * cpu_pages];
+		tmp.addr = area->addr + i * PCPU_BLKSIZE;
+		/* map_vm_area assumes a guard page of size PAGE_SIZE */
+		tmp.size = PCPU_BLKSIZE + PAGE_SIZE;
+		if (map_vm_area(&tmp, PAGE_KERNEL, &tmppage))
+			goto fail_map;
+	}
+
+	return area->addr;
+
+fail_map:
+	i--;
+	for (; i >= 0; i--) {
+		if (cpu_possible(i)) {
+			tmp.addr = area->addr + i * PCPU_BLKSIZE;
+			/* we've mapped a guard page extra earlier... */
+			tmp.size = PCPU_BLKSIZE + PAGE_SIZE;
+			unmap_vm_area(&tmp);
+		}
+	}
+
+	/* set i and j with proper values for the roll back at fail: */
+	i = NR_CPUS - 1;
+	j = PCPUPAGES_PER_BLOCK;
+
+rollback_pages:
+	j--;
+	for (; j >= 0; j--)
+		if (cpu_possible(j / cpu_pages))
+			__free_pages(blkp->pages[j], 0);
+
+	/* Unmap  block management */
+	tmp.addr = area->addr + NR_CPUS * PCPU_BLKSIZE;
+	tmp.size = BLOCK_MANAGEMENT_SIZE + PAGE_SIZE;
+	unmap_vm_area(&tmp);
+
+rollback_vm_area:
+	/* Give back the contiguous mem area */
+	area = remove_vm_area(area->addr);
+	BUG_ON(!area);
+
+rollback_mgt:
+
+	/* Free the block management pages */
+	for (i = 0; i < BLOCK_MANAGEMENT_PAGES; i++)
+		__free_pages(pages[i], 0);
+
+	return NULL;
+}
+
+/* Free memory block allocated by valloc_percpu */
+static void vfree_percpu(void *addr)
+{
+	int i;
+	struct pcpu_block *blkp = addr + PCPUPAGES_PER_BLOCK * PAGE_SIZE;
+	struct vm_struct *area, tmp;
+	unsigned int cpu_pages = PCPU_BLKSIZE >> PAGE_SHIFT;
+	struct page *pages[BLOCK_MANAGEMENT_PAGES];
+
+	/* Backup the block management struct pages */
+	for (i = 0; i < BLOCK_MANAGEMENT_PAGES; i++)
+		pages[i] = blkp->pages[i + PCPUPAGES_PER_BLOCK];
+
+	/* Unmap all cpu_pages from the block's vm space */
+	for_each_cpu(i) {
+		tmp.addr = addr + i * PCPU_BLKSIZE;
+		/* We've mapped a guard page extra earlier */
+		tmp.size = PCPU_BLKSIZE + PAGE_SIZE;
+		unmap_vm_area(&tmp);
+	}
+
+	/* Give back all allocated pages */
+	for (i = 0; i < PCPUPAGES_PER_BLOCK; i++) {
+		if (cpu_possible(i / cpu_pages))
+			__free_pages(blkp->pages[i], 0);
+	}
+
+	/* Unmap block management pages */
+	tmp.addr = addr + NR_CPUS * PCPU_BLKSIZE;
+	tmp.size = BLOCK_MANAGEMENT_SIZE + PAGE_SIZE;
+	unmap_vm_area(&tmp);
+
+	/* Free block management pages */
+	for (i = 0; i < BLOCK_MANAGEMENT_PAGES; i++)
+		__free_pages(pages[i], 0);
+
+	/* Give back vm area for this block */
+	area = remove_vm_area(addr);
+	BUG_ON(!area);
+
+}
+
+static int add_percpu_block(void)
+{
+	struct pcpu_block *blkp;
+	void *start_addr;
+
+	start_addr = valloc_percpu();
+	if (!start_addr)
+		return 0;
+	blkp = start_addr + PCPUPAGES_PER_BLOCK * PAGE_SIZE;
+	blkp->start_addr = start_addr;
+	down(&blklist_lock);
+	list_add_tail(&blkp->blklist, &blkhead);
+	if (firstnotfull == &blkhead)
+		firstnotfull = &blkp->blklist;
+	up(&blklist_lock);
+
+	return 1;
+}
+
+struct obj_map_elmt {
+	int startbit;
+	int obj_size;
+};
+
+/* Fill the array with obj map info and return no of elements in the array */
+static int
+make_obj_map(struct obj_map_elmt arr[], struct pcpu_block *blkp)
+{
+	int nr_elements = 0;
+	int i, j, obj_size;
+
+	for (i = 0, j = 0; i < MAX_NR_BITS; i++) {
+		if (!test_bit(i, blkp->bitmap)) {
+			/* Free block start */
+			arr[j].startbit = i;
+			nr_elements++;
+			obj_size = 1;
+			i++;
+			while (i < MAX_NR_BITS && (!test_bit(i, blkp->bitmap))) {
+				i++;
+				obj_size++;
+			}
+			arr[j].obj_size = obj_size * PCPU_CURR_SIZE;
+			j++;
+		}
+	}
+
+	return nr_elements;
+}
+
+/* Compare routine for qsort -- for asceding order */
+static int obj_map_cmp(const void *a, const void *b)
+{
+	struct obj_map_elmt *sa, *sb;
+
+	sa = (struct obj_map_elmt *) a;
+	sb = (struct obj_map_elmt *) b;
+	return sa->obj_size - sb->obj_size;
+}
+
+/* Add bufctl to list of bufctl */
+static void add_bufctl(struct buf_ctl *bufp)
+{
+	if (buf_head == NULL)
+		buf_head = bufp;
+	else {
+		bufp->next = buf_head;
+		buf_head = bufp;
+	}
+}
+
+/* After you alloc from a block, It can only go up the ordered list */
+static void sort_blk_list_up(struct pcpu_block *blkp)
+{
+	struct list_head *pos;
+
+	for (pos = blkp->blklist.prev; pos != &blkhead; pos = pos->prev) {
+		if (BLK_SIZE_USED(pos) < blkp->size_used) {
+			/* Move blkp up */
+			list_del(&blkp->blklist);
+			list_add_tail(&blkp->blklist, pos);
+			pos = &blkp->blklist;
+		} else
+			break;
+	}
+	/* Fix firstnotfull if needed */
+	if (blkp->size_used == PCPU_BLKSIZE) {
+		firstnotfull = blkp->blklist.next;
+		return;
+	}
+	if (blkp->size_used > BLK_SIZE_USED(firstnotfull)) {
+		firstnotfull = &blkp->blklist;
+		return;
+	}
+}
+
+struct buf_ctl *alloc_bufctl(struct pcpu_block *blkp)
+{
+	void *bufctl;
+	int head = blkp->bufctl_fl_head;
+	BUG_ON(head == -1);	/* If bufctls for this block has exhausted */
+	blkp->bufctl_fl_head = blkp->bufctl_fl[blkp->bufctl_fl_head];
+	BUG_ON(head >= OBJS_PER_BLOCK);
+	bufctl = (void *) blkp + sizeof (struct pcpu_block) +
+	    sizeof (struct buf_ctl) * head;
+	return bufctl;
+}
+
+/* Don't want to kmalloc this -- to avoid dependence on slab for future */
+static struct obj_map_elmt obj_map[OBJS_PER_BLOCK];
+
+/* Scan the freelist and return suitable obj if found */
+static void *
+get_obj_from_block(size_t size, size_t align, struct pcpu_block *blkp)
+{
+	int nr_elements, nr_currency, obj_startbit, obj_endbit;
+	int i, j;
+	void *objp;
+	struct buf_ctl *bufctl;
+
+	nr_elements = make_obj_map(obj_map, blkp);
+	if (!nr_elements)
+		return NULL;
+
+	/* Sort list in ascending order */
+	qsort(obj_map, nr_elements, sizeof (obj_map[0]), obj_map_cmp);
+
+	/* Get the smallest obj_sized chunk for this size */
+	i = 0;
+	while (i < nr_elements - 1 && size > obj_map[i].obj_size)
+		i++;
+	if (obj_map[i].obj_size < size)	/* No suitable obj_size found */
+		return NULL;
+
+	/* chunk of obj_size >= size is found, check for suitability (align) 
+	 * and alloc 
+	 */
+	nr_currency = size / PCPU_CURR_SIZE;
+	obj_startbit = obj_map[i].startbit;
+
+try_again_for_align:
+
+	obj_endbit = obj_map[i].startbit + obj_map[i].obj_size / PCPU_CURR_SIZE
+	    - 1;
+	objp = obj_startbit * PCPU_CURR_SIZE + blkp->start_addr;
+
+	if (IS_ALIGNED((unsigned long) objp, align)) {
+		/* Alignment is ok so alloc this chunk */
+		bufctl = alloc_bufctl(blkp);
+		if (!bufctl)
+			return NULL;
+		bufctl->addr = objp;
+		bufctl->size = size;
+		bufctl->next = NULL;
+
+		/* Mark the bitmap as allocated */
+		for (j = obj_startbit; j < nr_currency + obj_startbit; j++)
+			set_bit(j, blkp->bitmap);
+		blkp->size_used += size;
+		/* Re-arrange list to preserve full, partial and free order */
+		sort_blk_list_up(blkp);
+		/* Add to the allocated buffers list and return */
+		add_bufctl(bufctl);
+		return objp;
+	} else {
+		/* Alignment is not ok */
+		int obj_size = (obj_endbit - obj_startbit + 1) * PCPU_CURR_SIZE;
+		if (obj_size > size && obj_startbit <= obj_endbit) {
+			/* Since obj_size is bigger than requested, check if
+			   alignment can be met by changing startbit */
+			obj_startbit++;
+			goto try_again_for_align;
+		} else {
+			/* Try in the next chunk */
+			if (++i < nr_elements) {
+				/* Reset start bit and try again */
+				obj_startbit = obj_map[i].startbit;
+				goto try_again_for_align;
+			}
+		}
+	}
+
+	/* Everything failed so return NULL */
+	return NULL;
+}
+
+static void zero_obj(void *obj, size_t size)
+{
+	int cpu;
+	for_each_cpu(cpu)
+		memset(per_cpu_ptr(obj, cpu), 0, size);
+}
+
+/* 
+ * __alloc_percpu - allocate one copy of the object for every present
+ * cpu in the system, zeroing them.
+ * Objects should be dereferenced using per_cpu_ptr/get_cpu_ptr
+ * macros only
+ *
+ * This allocator is slow as we assume allocs to come
+ * by during boot/module init.
+ * Should not be called from interrupt context 
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	struct pcpu_block *blkp;
+	struct list_head *l;
+	void *obj;
+
+	if (!size)
+		return NULL;
+
+	if (size < PCPU_CURR_SIZE)
+		size = PCPU_CURR_SIZE;
+
+	if (align == 0)
+		align = PCPU_CURR_SIZE;
+
+	if (size > MAX_OBJSIZE) {
+		printk("alloc_percpu: ");
+		printk("size %d requested is more than I can handle\n", size);
+		return NULL;
+	}
+
+	BUG_ON(!IS_ALIGNED(size, PCPU_CURR_SIZE));
+
+try_after_refill:
+
+	/* Get the block to allocate from */
+	down(&blklist_lock);
+	l = firstnotfull;
+
+try_next_block:
+
+	/* If you have reached end of list, add another block and try */
+	if (l == &blkhead)
+		goto unlock_and_get_mem;
+	blkp = list_entry(l, struct pcpu_block, blklist);
+	obj = get_obj_from_block(size, align, blkp);
+	if (!obj) {
+		l = l->next;
+		goto try_next_block;
+	}
+	up(&blklist_lock);
+	zero_obj(obj, size);
+	return obj;
+
+unlock_and_get_mem:
+
+	up(&blklist_lock);
+	if (add_percpu_block())
+		goto try_after_refill;
+	return NULL;
+
+}
+
+EXPORT_SYMBOL(__alloc_percpu);
+
+/* After you free from a block, It can only go down the ordered list */
+static void sort_blk_list_down(struct pcpu_block *blkp)
+{
+	struct list_head *pos, *prev, *next;
+	/* Store the actual prev and next pointers for fnof fixing later */
+	prev = blkp->blklist.prev;
+	next = blkp->blklist.next;
+
+	/* Fix the ordering on the list */
+	for (pos = blkp->blklist.next; pos != &blkhead; pos = pos->next) {
+		if (BLK_SIZE_USED(pos) > blkp->size_used) {
+			/* Move blkp down */
+			list_del(&blkp->blklist);
+			list_add(&blkp->blklist, pos);
+			pos = &blkp->blklist;
+		} else
+			break;
+	}
+	/* Fix firstnotfull if needed and return */
+	if (firstnotfull == &blkhead) {
+		/* There was no block free, so now this block is fnotfull */
+		firstnotfull = &blkp->blklist;
+		return;
+	}
+
+	if (firstnotfull == &blkp->blklist) {
+		/* This was firstnotfull so fix fnof pointer accdly */
+		if (prev != &blkhead && BLK_SIZE_USED(prev) != PCPU_BLKSIZE) {
+			/* Move fnof pointer up */
+			firstnotfull = prev;
+			prev = prev->prev;
+			/* If size_used of prev is same as fnof, fix fnof to 
+			   point to topmost of the equal sized blocks */
+			while (prev != &blkhead &&
+			       BLK_SIZE_USED(prev) != PCPU_BLKSIZE) {
+				if (BLK_SIZE_USED(prev) !=
+				    BLK_SIZE_USED(firstnotfull))
+					return;
+				firstnotfull = prev;
+				prev = prev->prev;
+			}
+		} else if (next != &blkhead) {
+			/* Move fnof pointer down */
+			firstnotfull = next;
+			next = next->next;
+			if (BLK_SIZE_USED(firstnotfull) != PCPU_BLKSIZE)
+				return;
+			/* fnof is pointing to block which is full...fix it */
+			while (next != &blkhead &&
+			       BLK_SIZE_USED(next) == PCPU_BLKSIZE) {
+				firstnotfull = next;
+				next = next->next;
+			}
+		}
+
+	}
+
+}
+
+void free_bufctl(struct pcpu_block *blkp, struct buf_ctl *bufp)
+{
+	int idx = ((void *) bufp - (void *) blkp - sizeof (struct pcpu_block))
+	    / sizeof (struct buf_ctl);
+	blkp->bufctl_fl[idx] = blkp->bufctl_fl_head;
+	blkp->bufctl_fl_head = idx;
+}
+
+/*
+ * Free the percpu obj and whatever memory can be freed
+ */
+static void free_percpu_obj(struct list_head *pos, struct buf_ctl *bufp)
+{
+	struct pcpu_block *blkp;
+	blkp = list_entry(pos, struct pcpu_block, blklist);
+
+	/* Update blkp->size_used and free if size_used is 0 */
+	blkp->size_used -= bufp->size;
+	if (blkp->size_used) {
+		/* Mark the bitmap corresponding to this object free */
+		int i, obj_startbit;
+		int nr_currency = bufp->size / PCPU_CURR_SIZE;
+		obj_startbit = (bufp->addr - blkp->start_addr) / PCPU_CURR_SIZE;
+		for (i = obj_startbit; i < obj_startbit + nr_currency; i++)
+			clear_bit(i, blkp->bitmap);
+		sort_blk_list_down(blkp);
+	} else {
+		/* Usecount is zero, so prepare to give this block back to vm */
+		/* Fix firstnotfull if freeing block was firstnotfull 
+		 * If there are more blocks with the same usecount as fnof,
+		 * point to the first block from the head */
+		if (firstnotfull == pos) {
+			firstnotfull = pos->prev;
+			while (firstnotfull != &blkhead) {
+				unsigned int fnf_size_used;
+				fnf_size_used = BLK_SIZE_USED(firstnotfull);
+
+				if (fnf_size_used == PCPU_BLKSIZE)
+					firstnotfull = &blkhead;
+				else if (firstnotfull->prev == &blkhead)
+					break;
+				else if (BLK_SIZE_USED(firstnotfull->prev)
+					 == fnf_size_used)
+					firstnotfull = firstnotfull->prev;
+				else
+					break;
+			}
+		}
+		list_del(pos);
+	}
+
+	/* Free bufctl after fixing the bufctl list */
+	if (bufp == buf_head) {
+		buf_head = bufp->next;
+	} else {
+		struct buf_ctl *tmp = buf_head;
+		while (tmp && tmp->next != bufp)
+			tmp = tmp->next;
+		BUG_ON(!tmp || tmp->next != bufp);
+		tmp->next = bufp->next;
+	}
+	free_bufctl(blkp, bufp);
+	/* If usecount is zero, give this block back to vm */
+	if (!blkp->size_used)
+		vfree_percpu(blkp->start_addr);
+	return;
+}
+
+/*
+ * Free memory allocated using alloc_percpu.
+ */
+
+void free_percpu(const void *objp)
+{
+	struct buf_ctl *bufp;
+	struct pcpu_block *blkp;
+	struct list_head *pos;
+	if (!objp)
+		return;
+
+	/* Find block from which obj was allocated by scanning  bufctl list */
+	down(&blklist_lock);
+	bufp = buf_head;
+	while (bufp) {
+		if (bufp->addr == objp)
+			break;
+		bufp = bufp->next;
+	}
+	BUG_ON(!bufp);
+
+	/* We have the bufctl for the obj here, Now get the block */
+	list_for_each(pos, &blkhead) {
+		blkp = list_entry(pos, struct pcpu_block, blklist);
+		if (objp >= blkp->start_addr &&
+		    objp < blkp->start_addr + PCPU_BLKSIZE)
+			break;
+	}
+
+	BUG_ON(pos == &blkhead);	/* Couldn't find obj in block list */
+
+	/* 
+	 * Mark the bitmap free, Update use count, fix the ordered 
+	 * blklist, free the obj bufctl. 
+	 */
+	free_percpu_obj(pos, bufp);
+
+	up(&blklist_lock);
+	return;
+}
+
+EXPORT_SYMBOL(free_percpu);
diff -ruN -X dontdiff2 linux-2.6.11-rc1mm1/mm/slab.c alloc_percpu-2.6.11-rc1mm1/mm/slab.c
--- linux-2.6.11-rc1mm1/mm/slab.c	2005-01-15 21:26:56.000000000 +0530
+++ alloc_percpu-2.6.11-rc1mm1/mm/slab.c	2005-01-15 21:41:52.000000000 +0530
@@ -2482,51 +2482,6 @@
 
 EXPORT_SYMBOL(__kmalloc);
 
-#ifdef CONFIG_SMP
-/**
- * __alloc_percpu - allocate one copy of the object for every present
- * cpu in the system, zeroing them.
- * Objects should be dereferenced using the per_cpu_ptr macro only.
- *
- * @size: how many bytes of memory are required.
- * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
- */
-void *__alloc_percpu(size_t size, size_t align)
-{
-	int i;
-	struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
-
-	if (!pdata)
-		return NULL;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		pdata->ptrs[i] = kmem_cache_alloc_node(
-				kmem_find_general_cachep(size, GFP_KERNEL),
-				cpu_to_node(i));
-
-		if (!pdata->ptrs[i])
-			goto unwind_oom;
-		memset(pdata->ptrs[i], 0, size);
-	}
-
-	/* Catch derefs w/o wrappers */
-	return (void *) (~(unsigned long) pdata);
-
-unwind_oom:
-	while (--i >= 0) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(pdata->ptrs[i]);
-	}
-	kfree(pdata);
-	return NULL;
-}
-
-EXPORT_SYMBOL(__alloc_percpu);
-#endif
-
 /**
  * kmem_cache_free - Deallocate an object
  * @cachep: The cache the allocation was from.
@@ -2590,31 +2545,6 @@
 
 EXPORT_SYMBOL(kfree);
 
-#ifdef CONFIG_SMP
-/**
- * free_percpu - free previously allocated percpu memory
- * @objp: pointer returned by alloc_percpu.
- *
- * Don't free memory not originally allocated by alloc_percpu()
- * The complemented objp is to check for that.
- */
-void
-free_percpu(const void *objp)
-{
-	int i;
-	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		kfree(p->ptrs[i]);
-	}
-	kfree(p);
-}
-
-EXPORT_SYMBOL(free_percpu);
-#endif
-
 unsigned int kmem_cache_size(kmem_cache_t *cachep)
 {
 	return obj_reallen(cachep);


* Re: [patch 2/2] mm: Reimplementation of alloc_percpu
  2005-01-17 18:36 ` [patch 2/2] mm: Reimplementation of alloc_percpu Ravikiran G Thirumalai
@ 2005-01-18  1:30   ` Rusty Russell
  2005-01-18 15:15     ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 5+ messages in thread
From: Rusty Russell @ 2005-01-18  1:30 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, lkml - Kernel Mailing List, Manfred Spraul,
	Dipankar Sarma

On Tue, 2005-01-18 at 00:06 +0530, Ravikiran G Thirumalai wrote:
> Here's the alloc_percpu reimplementation changed to
> - Use qsort 
> - Use GFP_KERNEL|__GFP_HIGHMEM|__GFP_ZERO for BLOCK_MANAGEMENT_PAGES
>   (GFP_HIGHZERO would have been ideal)
> - Changed currency size to sizeof (int) from sizeof (void *) for better
>   utilization for small objects
> 
> The allocator can be easily modified to use __per_cpu_offset[] table at a later
> stage by: 
> 1. Allocating ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE) for the
>    static percpu areas and populating __per_cpu_offset[] offset table
> 2. Making PCPU_BLKSIZE same as the static per cpu area size above
> 3. Serving dynamic percpu requests from modules etc from blocks by
>    returning ret -= __per_cpu_offset[0] from a percpu block.  This way
>    modules need not have a limit on static percpu areas.

Unfortunately ia64 breaks (3).  They have pinned TLB entries covering
64k, which they put the static per-cpu data into.  This is used for
local_inc, etc, and David Mosberger loved that trick (this is why my
version allocated from that first reserved block for modules' static
per-cpu vars).

Hope that clarifies,
Rusty.
-- 
A bad analogy is like a leaky screwdriver -- Richard Braakman



* Re: [patch 2/2] mm: Reimplementation of alloc_percpu
  2005-01-18  1:30   ` Rusty Russell
@ 2005-01-18 15:15     ` Ravikiran G Thirumalai
  2005-01-18 23:30       ` Rusty Russell
  0 siblings, 1 reply; 5+ messages in thread
From: Ravikiran G Thirumalai @ 2005-01-18 15:15 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, lkml - Kernel Mailing List, Manfred Spraul,
	Dipankar Sarma

On Tue, Jan 18, 2005 at 12:30:32PM +1100, Rusty Russell wrote:
> On Tue, 2005-01-18 at 00:06 +0530, Ravikiran G Thirumalai wrote:
> > ...
> > The allocator can be easily modified to use __per_cpu_offset[] table at a later
> > stage by: 
> > 1. Allocating ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE) for the
> >    static percpu areas and populating __per_cpu_offset[] offset table
> > 2. Making PCPU_BLKSIZE same as the static per cpu area size above
> > 3. Serving dynamic percpu requests from modules etc from blocks by
> >    returning ret -= __per_cpu_offset[0] from a percpu block.  This way
> >    modules need not have a limit on static percpu areas.
> 
> Unfortunately ia64 breaks (3).  They have pinned TLB entries covering
> 64k, which they put the static per-cpu data into.  This is used for
> local_inc, etc, and David Mosberger loved that trick (this is why my
> version allocated from that first reserved block for modules' static
> per-cpu vars).

Hmmm... then if we change (1) to allocate PERCPU_ENOUGH_ROOM, the math
will work out?  We will still have a limit on static per-cpu areas in
modules, but alloc_percpu can use the same __per_cpu_offset[] table.
Will this work?

But what I am concerned about is arches like x86_64, which currently
do not maintain the relation:
	__per_cpu_offset[n] = __per_cpu_offset[0] + static_percpu_size * n   ---> (A)
Correct me if I am wrong, but both our methods for alloc_percpu to use
per_cpu_offset depend on the static per-cpu areas being virtually
contiguous (with relation (A) above being maintained).
If arches cannot sew up node-local pages to form a virtually contiguous
block, maybe because setup_per_cpu_areas() happens early during boot,
then we will have a problem.

So a common solution could be:
Declare a dynamic percpu offset table 'alloc_percpu_offset' or
something like that, and make it a static per-cpu variable.  Then the
blocks can be of any uniform size; we just have to fill in
alloc_percpu_offset and use
	RELOC_HIDE(ptr, per_cpu(alloc_percpu_offset, cpu))
to get to the cpu-local versions.  I think dereferences can be fast
with this method too, since we use __per_cpu_offset[] indirectly.
Of course this is not needed if all arches can do node-local allocation
and maintain relation (A) for the static per-cpu areas.
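
A minimal sketch of that idea (illustrative only, nothing below exists in the
patch; the dyn_per_cpu_ptr() name is made up):

	/* per-cpu table: offset from the cpu0 copy to this cpu's copy */
	static DEFINE_PER_CPU(unsigned long, alloc_percpu_offset);

	/* the dereference would then be: */
	#define dyn_per_cpu_ptr(ptr, cpu) \
		((__typeof__(ptr))(RELOC_HIDE(ptr, per_cpu(alloc_percpu_offset, cpu))))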

Thanks,
Kiran


* Re: [patch 2/2] mm: Reimplementation of alloc_percpu
  2005-01-18 15:15     ` Ravikiran G Thirumalai
@ 2005-01-18 23:30       ` Rusty Russell
  0 siblings, 0 replies; 5+ messages in thread
From: Rusty Russell @ 2005-01-18 23:30 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, lkml - Kernel Mailing List, Manfred Spraul,
	Dipankar Sarma

On Tue, 2005-01-18 at 20:45 +0530, Ravikiran G Thirumalai wrote:
> On Tue, Jan 18, 2005 at 12:30:32PM +1100, Rusty Russell wrote:
> > On Tue, 2005-01-18 at 00:06 +0530, Ravikiran G Thirumalai wrote:
> > > ...
> > > The allocator can be easily modified to use __per_cpu_offset[] table at a later
> > > stage by: 
> > > 1. Allocating ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE) for the
> > >    static percpu areas and populating __per_cpu_offset[] offset table
> > > 2. Making PCPU_BLKSIZE same as the static per cpu area size above
> > > 3. Serving dynamic percpu requests from modules etc from blocks by
> > >    returning ret -= __per_cpu_offset[0] from a percpu block.  This way
> > >    modules need not have a limit on static percpu areas.
> > 
> > Unfortunately ia64 breaks (3).  They have pinned TLB entries covering
> > 64k, which they put the static per-cpu data into.  This is used for
> > local_inc, etc, and David Mosberger loved that trick (this is why my
> > version allocated from that first reserved block for modules' static
> > per-cpu vars).
> 
> Hmmm... then if we change (1) to allocate PERCPU_ENOUGH_ROOM, then the math
> will work out?  We will still have a limit on static per-cpu areas in
> modules, but alloc_percpu can use the same __per_cpu_offset table[].
> Will this work?

I think so.

> But, what I am concerned is about arches like x86_64 which currently
> do not maintain the relation:
> __per_cpu_offset[n] = __per_cpu_offset[0] + static_percpu_size * n  ---> (A)
> correct me if I am wrong, but both our methods for alloc_percpu to use
> per_cpu_offset depend on the static per-cpu areas being virtually
> contiguous (with relation (A) above being maintained).
> If arches cannot sew up node local pages to form a virtually contiguous
> block, maybe because setup_per_cpu_areas happens early during boot, 
> then we will have a problem.

They don't actually have to be contiguous, although that makes it
easier.  They can reserve virtual address space to extend their per-cpu
areas.  I think this is a worthwhile tradeoff if they want to do this.

Cheers,
Rusty.
-- 
A bad analogy is like a leaky screwdriver -- Richard Braakman

