* [patch 00/12] Slab defragmentation V3
@ 2007-06-07 21:55 clameter
  2007-06-07 21:55 ` [patch 01/12] SLUB: Add support for kmem_cache_ops clameter
                   ` (13 more replies)
  0 siblings, 14 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

Will show up shortly at http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/

Test results (see appended scripts / user space code for more data)

(3-level tree with 10 entries at the first level, 20 at the second and 30 files
at the third level. Files at the lowest level were removed to create inode fragmentation.)

%Ra is the allocation ratio (need to apply the slabinfo patch to get those numbers)

inode reclaim in reiserfs

Name                   Objects Objsize    Space Slabs/Part/Cpu  O/S O %Ra %Ef Flg
dentry                   14660     200     3.0M        733/0/1   20 0 100  97 Da
reiser_inode_cache        1596     640     4.1M      256/201/1   25 2  24  24 DCa

Status after defrag

Name                   Objects Objsize    Space Slabs/Part/Cpu  O/S O %Ra %Ef Flg
dentry                    8849     200     1.8M       454/17/1   20 0  97  95 Da
reiser_inode_cache        1381     640     1.0M        65/11/0   25 2  84  82 DCa



Slab defragmentation can be triggered in two ways:

1. Manually by running

slabinfo -s <slabs-to-shrink>

or by the kernel calling

kmem_cache_shrink(slab)

(Currently only ACPI does such a call, on a slab cache that has no
defragmentation support. In that case we simply do what SLAB does:
drop the per-cpu caches and sift through the partial lists for free slabs.)

2. Automatically if defragmentable slabs reach a certain degree of
   fragmentation.

The point at which slab defragmentation occurs can be set via

/proc/sys/vm/slab_defrag_ratio

Slab fragmentation is measured by how many of the possible objects in a
slab are actually in use. The default setting for slab_defrag_ratio is 30%.
This means that slab defragmentation is triggered once no more than 30% of
the object slots are in use, i.e. roughly 7 free object slots for every
3 allocated objects.

Setting the slab_defrag_ratio higher will cause more defragmentation runs.
If slab_defrag_ratio is set to 0 then no slab defragmentation occurs.

Slabs are checked for their fragmentation levels after they have been shrunk
by running the shrinkers in mm/vmscan.c during memory reclaim. This means that
slab defragmentation is only triggered if we are under memory pressure and if
there is significant slab fragmentation.
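
As a rough sketch of that decision (this mirrors the per-node check added in
patch 02 of this series; the function and parameter names here are
illustrative, not actual kernel symbols):

/*
 * A node's partial slabs are considered for defragmentation only if the
 * percentage of objects in use is at or below slab_defrag_ratio.
 */
static int node_worth_defragmenting(unsigned long nr_slabs,
		unsigned long objects_per_slab,
		unsigned long objects_in_use,
		int slab_defrag_ratio)
{
	unsigned long capacity = nr_slabs * objects_per_slab;
	unsigned long ratio = objects_in_use * 100 / capacity;

	return ratio <= slab_defrag_ratio;	/* default 30 */
}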

V1->V2
- Clean up control flow using a state variable. Simplify API. Back to 2
  functions that now take arrays of objects.
- Inode defrag support for a set of filesystems
- Fix up dentry defrag support to work on negative dentries by adding
  a new dentry flag that indicates that a dentry is not in the process
  of being freed or allocated.

V2->V3
- Support directory reclaim
- Add infrastructure to trigger slab defrag after slab shrinking if we
  have slabs with a high degree of fragmentation.



Test script:

#!/bin/sh

echo 30 >/proc/sys/vm/slab_defrag_ratio

./gazfiles c 3 10 20 30
echo "Status before"
slabinfo -D
./gazfiles d 2
echo "Status after removing files"
slabinfo -D
slabinfo -s
echo "Status after defrag"
slabinfo -D
./gazfiles d 0


gazfiles.c :

/*
 * Create a gazillion files in order to create slab fragmentation
 *
 * (C) 2007 sgi, Christoph Lameter <clameter@sgi.com>
 *
 * Create an n-level hierarchy of empty files
 *
 * gazfiles <action> <levels> <n1> <n2> ...
 *
 * gazfiles c[reate] 3 50 50 50
 *
 * gazfiles s[hrink] <levels>
 *
 * gazfiles r[andomkill] <nr to kill>
 *
 * gazfiles d[elete] <levels>
 */

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <string.h>
#include <unistd.h>
#include <stdarg.h>
#include <getopt.h>
#include <regex.h>
#include <errno.h>

#define MAXIMUM_LEVELS 10

int level;
int sizes[MAXIMUM_LEVELS];

void fatal(const char *x, ...)
{
        va_list ap;

        va_start(ap, x);
        vfprintf(stderr, x, ap);
        va_end(ap);
        exit(1);
}

int read_gaz(void)
{
	FILE *f = fopen(".gazinfo", "r");
	int rc = 0;
	int i;

	if (!f)
		return 0;

	if (!fscanf(f, "%d", &level))
		goto out;

	if (level >= MAXIMUM_LEVELS)
		goto out;

	for (i = 0; i < level; i++)
		if (!fscanf(f, " %d", &sizes[i]))
			goto out;
	rc = 1;
out:
	fclose(f);
	return rc;
}

void write_gaz(void)
{
	FILE *f = fopen(".gazinfo","w");
	int i;

	fprintf(f, "%d",level);
	for (i = 0; i < level; i++)
		fprintf(f," %d", sizes[i]);
	fprintf(f, "\n");
	fclose(f);
}

void cre(int l)
{
	int i;

	for (i = 0; i < sizes[l - 1]; i++) {
		char name[20];

		sprintf(name, "%03d", i);

		if (l < level) {
			mkdir(name, 0775);
			chdir(name);
			cre(l + 1);
			chdir("..");
		} else {
			FILE *f;

			f = fopen(name,"w");
			fprintf(f, "Test");
			fclose(f);
		}
	}
}

void create(int l, char **sz)
{
	int i;

	level = l;
	for (i = 0; i < level; i++)
		sizes[i] = atoi(sz[i]);

	if (mkdir("gazf", 0775))
		fatal("Cannot create gazf here\n");
	chdir("gazf");
	write_gaz();
	cre(1);
	chdir("..");
}

void shrink(int level)
{
	/* Not implemented yet: only verifies that a gazf tree exists. */
	if (chdir("gazf"))
		fatal("No gazfiles in this directory\n");
	read_gaz();
	chdir("..");
}

void scand(int l, void (*func)(int, int, char *, unsigned long),
			unsigned long level)
{
	DIR *dir;
	struct dirent *de;

	dir = opendir(".");
	if (!dir)
		fatal("Cannot open directory");
	while ((de = readdir(dir))) {
		struct stat s;

		if (de->d_name[0] == '.')
			continue;

		/*
		 * Some idiot broke the glibc library or made it impossible
		 * to figure out how to make readdir work right
		 */

		stat(de->d_name, &s);
		if (S_ISDIR(s.st_mode))
			de->d_type = DT_DIR;

		if (de->d_type == DT_DIR) {
			if (chdir(de->d_name))
				fatal("Cannot enter %s", de->d_name);
			scand(l + 1, func, level);
			chdir("..");
			func(l, 1, de->d_name, level);
		} else {
			func(l, 0, de->d_name, level);
		}
	}
	closedir(dir);
}

void traverse(void (*func)(int, int, char *, unsigned long),
		unsigned long level)
{
	if (chdir("gazf"))
		fatal("No gazfiles in this directory");
	scand(1, func, level);
	chdir("..");
}

void randomkill(int nr)
{
	/* Not implemented yet: only verifies that a gazf tree exists. */
	if (chdir("gazf"))
		fatal("No gazfiles in this directory\n");
	read_gaz();
	chdir("..");
}

void del_func(int l, int dir, char *name, unsigned long level)
{
	if (l <= level)
		return;
	if (dir) {
		if (rmdir(name))
			fatal("Cannot remove directory %s\n", name);
	} else {
		if (unlink(name))
			fatal("Cannot unlink file %s\n", name);
	}
}

void delete(int l)
{
	if (l == 0) {
		system("rm -rf gazf");
		return;
	}
	traverse(del_func, l);
}

void usage(void)
{
	printf("gazfiles: Tool to manage gazillions of files\n\n");
	printf("gazfiles create <levels> <#l1> <#l2> ...\n");
	printf("gazfiles delete <levels>\n");
	printf("gazfiles shrink <levels>\n");
	printf("gazfiles randomkill <nr>\n\n");
	printf("(C) 2007 sgi, Christoph Lameter <clameter@sgi.com>\n");
	exit(0);
}

int main(int argc, char *argv[])
{
	if (argc  <  2)
		usage();

	switch (argv[1][0]) {
		case 'c' :
			create(atoi(argv[2]), argv + 3);
			break;
		case 's' :
			if (argc != 3)
				usage();

			shrink(atoi(argv[2]));
			break;
		case 'r' :
			if (argc != 3)
				usage();

			randomkill(atoi(argv[2]));
			break;
		case 'd':
			if (argc != 3)
				usage();
			delete(atoi(argv[2]));
			break;

		default:
			usage();
	}
	return 0;
}
-- 


* [patch 01/12] SLUB: Add support for kmem_cache_ops
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 02/12] SLUB: Slab defragmentation core functionality clameter
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slab_defrag_kmem_cache_ops --]
[-- Type: text/plain, Size: 8319 bytes --]

We use the parameter formerly used by the destructor to pass an optional
pointer to a kmem_cache_ops structure to kmem_cache_create.

kmem_cache_ops is created as empty. Later patches populate kmem_cache_ops.

Create a KMEM_CACHE_OPS macro that allows the specification of a
kmem_cache_ops structure.

Code to handle kmem_cache_ops is added to SLUB. SLAB and SLOB are updated
to be able to accept a kmem_cache_ops structure but will ignore it.
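
As a usage sketch (the "widget" cache below is hypothetical and only serves
to illustrate the new macro; it is not part of this patch):

#include <linux/slab.h>
#include <linux/init.h>

struct widget {
	int id;
};

/* Empty for now; later patches in this series add defrag methods here. */
static struct kmem_cache_ops widget_kmem_cache_ops = {
};

static struct kmem_cache *widget_cachep;

static int __init widget_cache_init(void)
{
	widget_cachep = KMEM_CACHE_OPS(widget, SLAB_RECLAIM_ACCOUNT,
					&widget_kmem_cache_ops);
	return widget_cachep ? 0 : -ENOMEM;
}

Existing users of KMEM_CACHE() are unchanged, and callers passing NULL for
the ops argument keep the default behavior.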

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/slab.h     |   13 +++++++++----
 include/linux/slub_def.h |    1 +
 mm/slab.c                |    6 +++---
 mm/slob.c                |    2 +-
 mm/slub.c                |   44 ++++++++++++++++++++++++++++++--------------
 5 files changed, 44 insertions(+), 22 deletions(-)

Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h	2007-06-04 20:12:56.000000000 -0700
+++ slub/include/linux/slab.h	2007-06-04 20:13:58.000000000 -0700
@@ -38,10 +38,13 @@
 void __init kmem_cache_init(void);
 int slab_is_available(void);
 
+struct kmem_cache_ops {
+};
+
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *, struct kmem_cache *, unsigned long),
-			void (*)(void *, struct kmem_cache *, unsigned long));
+			const struct kmem_cache_ops *s);
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
@@ -59,9 +62,11 @@ int kmem_ptr_validate(struct kmem_cache 
  * f.e. add ____cacheline_aligned_in_smp to the struct declaration
  * then the objects will be properly aligned in SMP configurations.
  */
-#define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
-		sizeof(struct __struct), __alignof__(struct __struct),\
-		(__flags), NULL, NULL)
+#define KMEM_CACHE_OPS(__struct, __flags, __ops) \
+	kmem_cache_create(#__struct, sizeof(struct __struct), \
+	__alignof__(struct __struct), (__flags), NULL, (__ops))
+
+#define KMEM_CACHE(__struct, __flags) KMEM_CACHE_OPS(__struct, __flags, NULL)
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-04 20:13:58.000000000 -0700
+++ slub/mm/slub.c	2007-06-04 20:13:58.000000000 -0700
@@ -294,6 +294,9 @@ static inline int check_valid_pointer(st
 	return 1;
 }
 
+struct kmem_cache_ops slub_default_ops = {
+};
+
 /*
  * Slow version of get and set free pointer.
  *
@@ -2057,11 +2060,13 @@ static int calculate_sizes(struct kmem_c
 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 		const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(void *, struct kmem_cache *, unsigned long),
+		const struct kmem_cache_ops *ops)
 {
 	memset(s, 0, kmem_size);
 	s->name = name;
 	s->ctor = ctor;
+	s->ops = ops;
 	s->objsize = size;
 	s->flags = flags;
 	s->align = align;
@@ -2244,7 +2249,7 @@ static struct kmem_cache *create_kmalloc
 
 	down_write(&slub_lock);
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
-			flags, NULL))
+			flags, NULL, &slub_default_ops))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
@@ -2575,12 +2580,16 @@ static int slab_unmergeable(struct kmem_
 	if (s->refcount < 0)
 		return 1;
 
+	if (s->ops != &slub_default_ops)
+		return 1;
+
 	return 0;
 }
 
 static struct kmem_cache *find_mergeable(size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(void *, struct kmem_cache *, unsigned long),
+		const struct kmem_cache_ops *ops)
 {
 	struct kmem_cache *s;
 
@@ -2590,6 +2599,9 @@ static struct kmem_cache *find_mergeable
 	if (ctor)
 		return NULL;
 
+	if (ops != &slub_default_ops)
+		return NULL;
+
 	size = ALIGN(size, sizeof(void *));
 	align = calculate_alignment(flags, align, size);
 	size = ALIGN(size, align);
@@ -2622,13 +2634,15 @@ static struct kmem_cache *find_mergeable
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 		size_t align, unsigned long flags,
 		void (*ctor)(void *, struct kmem_cache *, unsigned long),
-		void (*dtor)(void *, struct kmem_cache *, unsigned long))
+		const struct kmem_cache_ops *ops)
 {
 	struct kmem_cache *s;
 
-	BUG_ON(dtor);
+	if (!ops)
+		ops = &slub_default_ops;
+
 	down_write(&slub_lock);
-	s = find_mergeable(size, align, flags, ctor);
+	s = find_mergeable(size, align, flags, ctor, ops);
 	if (s) {
 		s->refcount++;
 		/*
@@ -2642,7 +2656,7 @@ struct kmem_cache *kmem_cache_create(con
 	} else {
 		s = kmalloc(kmem_size, GFP_KERNEL);
 		if (s && kmem_cache_open(s, GFP_KERNEL, name,
-				size, align, flags, ctor)) {
+				size, align, flags, ctor, ops)) {
 			if (sysfs_slab_add(s)) {
 				kfree(s);
 				goto err;
@@ -3267,16 +3281,18 @@ static ssize_t order_show(struct kmem_ca
 }
 SLAB_ATTR_RO(order);
 
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
 {
-	if (s->ctor) {
-		int n = sprint_symbol(buf, (unsigned long)s->ctor);
+	int x = 0;
 
-		return n + sprintf(buf + n, "\n");
+	if (s->ctor) {
+		x += sprintf(buf + x, "ctor : ");
+		x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+		x += sprintf(buf + x, "\n");
 	}
-	return 0;
+	return x;
 }
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
 
 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 {
@@ -3508,7 +3524,7 @@ static struct attribute * slab_attrs[] =
 	&slabs_attr.attr,
 	&partial_attr.attr,
 	&cpu_slabs_attr.attr,
-	&ctor_attr.attr,
+	&ops_attr.attr,
 	&aliases_attr.attr,
 	&align_attr.attr,
 	&sanity_checks_attr.attr,
Index: slub/include/linux/slub_def.h
===================================================================
--- slub.orig/include/linux/slub_def.h	2007-06-04 20:13:53.000000000 -0700
+++ slub/include/linux/slub_def.h	2007-06-04 20:13:58.000000000 -0700
@@ -41,6 +41,7 @@ struct kmem_cache {
 	int objects;		/* Number of objects in slab */
 	int refcount;		/* Refcount for slab cache destroy */
 	void (*ctor)(void *, struct kmem_cache *, unsigned long);
+	const struct kmem_cache_ops *ops;
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
 	const char *name;	/* Name (only for display!) */
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c	2007-06-04 20:12:56.000000000 -0700
+++ slub/mm/slab.c	2007-06-04 20:13:58.000000000 -0700
@@ -2100,7 +2100,7 @@ static int __init_refok setup_cpu_cache(
  * @align: The required alignment for the objects.
  * @flags: SLAB flags
  * @ctor: A constructor for the objects.
- * @dtor: A destructor for the objects (not implemented anymore).
+ * @ops: A kmem_cache_ops structure (ignored).
  *
  * Returns a ptr to the cache on success, NULL on failure.
  * Cannot be called within a int, but can be interrupted.
@@ -2126,7 +2126,7 @@ struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags,
 	void (*ctor)(void*, struct kmem_cache *, unsigned long),
-	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+	const struct kmem_cache_ops *ops)
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
@@ -2135,7 +2135,7 @@ kmem_cache_create (const char *name, siz
 	 * Sanity checks... these are all serious usage bugs.
 	 */
 	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
-	    size > KMALLOC_MAX_SIZE || dtor) {
+	    size > KMALLOC_MAX_SIZE) {
 		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
 				name);
 		BUG();
Index: slub/mm/slob.c
===================================================================
--- slub.orig/mm/slob.c	2007-06-04 20:12:56.000000000 -0700
+++ slub/mm/slob.c	2007-06-04 20:13:58.000000000 -0700
@@ -483,7 +483,7 @@ struct kmem_cache {
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags,
 	void (*ctor)(void*, struct kmem_cache *, unsigned long),
-	void (*dtor)(void*, struct kmem_cache *, unsigned long))
+	const struct kmem_cache_ops *o)
 {
 	struct kmem_cache *c;
 

-- 


* [patch 02/12] SLUB: Slab defragmentation core functionality
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
  2007-06-07 21:55 ` [patch 01/12] SLUB: Add support for kmem_cache_ops clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 03/12] SLUB: Extend slabinfo to support -D and -C options clameter
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slab_defrag_core --]
[-- Type: text/plain, Size: 15946 bytes --]

Slab defragmentation occurs either

1. Unconditionally when kmem_cache_shrink() is called on a slab cache,
either directly by the kernel or via slabinfo triggering slab shrinking.
This form performs defragmentation on all nodes of a NUMA system.

2. Conditionally when kmem_cache_defrag(<percentage>, <node>) is called.

   The defragmentation is only performed if the fragmentation of the slab
   is higher than the specified percentage. Fragmentation ratios are measured
   by calculating the percentage of objects in use compared to the total
   number of objects that the slab cache could hold.

   kmem_cache_defrag takes a node parameter. This can either be -1 if
   defragmentation should be performed on all nodes, or a node number.
   If a node number was specified then defragmentation is only performed
   on a specific node.

   Slab defragmentation is a memory-intensive operation that can be
   sped up on a NUMA system if mostly node-local memory is accessed.

For defragmentation SLUB first generates a sorted list of partial slabs.
Sorting is performed according to the number of objects allocated.
Thus the slabs with the least objects will be at the end.

We extract slabs off the tail of that list until we have either reached a
minimum number of slabs or until we encounter a slab that has more than a
quarter of its objects allocated. Then we attempt to remove the objects
from each of the slabs taken.

In order for a slab cache to support defragmentation, a couple of functions
must be defined via kmem_cache_ops. These are:

void *get(struct kmem_cache *s, int nr, void **objects)

	Must obtain a reference to the listed objects. SLUB guarantees that
	the objects are still allocated. However, other threads may be blocked
	in slab_free attempting to free objects in the slab. These may succeed
	as soon as get() returns to the slab allocator. The function must
	be able to detect the situation and void the attempts to handle such
	objects (by for example voiding the corresponding entry in the objects
	array).

	No slab operations may be performed in get(). Interrupts
	are disabled. What can be done is very limited. The slab lock
	for the page with the object is taken. Any attempt to perform a slab
	operation may lead to a deadlock.

	get() returns a private pointer that is passed to kick(). Should we
	be unable to obtain all references then that pointer may indicate
	to the kick() function that it should not attempt to remove or move
	any objects but simply drop the references that were obtained.

void kick(struct kmem_cache *, int nr, void **objects, void *get_result)

	After SLUB has established references to the objects in a
	slab it will drop all locks and then use kick() to move objects out
	of the slab. The existence of the object is guaranteed by virtue of
	the earlier obtained references via get(). The callback may perform
	any slab operation since no locks are held at the time of call.

	The callback should remove the object from the slab in some way. This
	may be accomplished by reclaiming the object and then running
	kmem_cache_free() or reallocating it and then running
	kmem_cache_free(). Reallocation is advantageous because the partial
	list was just sorted to put the slabs with the most objects first.
	A fresh allocation is then likely to fill up a slab so that

	Kick() does not return a result. SLUB will check the number of
	remaining objects in the slab. If all objects were removed then
	we know that the operation was successful.
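
To make the contract concrete, here is a minimal, hypothetical pair of
callbacks for a cache of refcounted "widget" objects (a sketch only; the
real users added later in this series are the inode and dentry caches, and
widget_evict()/widget_put() are assumed helpers, not existing kernel
functions):

struct widget {
	atomic_t refcount;
	/* ... payload ... */
};

/* Called with the slab lock held and interrupts disabled: only take refs. */
static void *widget_get(struct kmem_cache *s, int nr, void **v)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct widget *w = v[i];

		if (!atomic_inc_not_zero(&w->refcount))
			v[i] = NULL;	/* already being freed, skip it */
	}
	return NULL;			/* no private state for kick() */
}

/* Called with no locks held: evict each object and drop the reference
 * taken in widget_get(). The final put frees the object and thereby
 * empties the slab. */
static void widget_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct widget *w = v[i];

		if (!w)
			continue;
		widget_evict(w);	/* assumed: unhash/remove all users */
		widget_put(w);		/* assumed: kmem_cache_free on last put */
	}
}

static struct kmem_cache_ops widget_kmem_cache_ops = {
	.get = widget_get,
	.kick = widget_kick,
};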

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/slab.h |   32 ++++
 mm/slab.c            |    5 
 mm/slob.c            |    5 
 mm/slub.c            |  335 +++++++++++++++++++++++++++++++++++++++++----------
 4 files changed, 313 insertions(+), 64 deletions(-)

Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h	2007-06-07 14:09:42.000000000 -0700
+++ slub/include/linux/slab.h	2007-06-07 14:10:59.000000000 -0700
@@ -38,7 +38,39 @@
 void __init kmem_cache_init(void);
 int slab_is_available(void);
 
+struct kmem_cache;
+
 struct kmem_cache_ops {
+	/*
+	 * Called with slab lock held and interrupts disabled.
+	 * No slab operation may be performed.
+	 *
+	 * Parameters passed are the number of objects to process
+	 * and an array of pointers to objects for which we
+	 * need references.
+	 *
+	 * Returns a pointer that is passed to the kick function.
+	 * If all objects cannot be moved then the pointer may
+	 * indicate that this wont work and then kick can simply
+	 * remove the references that were already obtained.
+	 *
+	 * The array passed to get() is also passed to kick(). The
+	 * function may remove objects by setting array elements to NULL.
+	 */
+	void *(*get)(struct kmem_cache *, int nr, void **);
+
+	/*
+	 * Called with no locks held and interrupts enabled.
+	 * Any operation may be performed in kick().
+	 *
+	 * Parameters passed are the number of objects in the array,
+	 * the array of pointers to the objects and the pointer
+	 * returned by get().
+	 *
+	 * Success is checked by examining the number of remaining
+	 * objects in the slab.
+	 */
+	void (*kick)(struct kmem_cache *, int nr, void **, void *private);
 };
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-07 14:09:42.000000000 -0700
+++ slub/mm/slub.c	2007-06-07 14:11:29.000000000 -0700
@@ -2385,6 +2385,195 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+static unsigned long count_partial(struct kmem_cache_node *n)
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct page *page;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry(page, &n->partial, lru)
+		x += page->inuse;
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+
+/*
+ * Vacate all objects in the given slab.
+ *
+ * Slab must be locked and frozen. Interrupts are disabled (flags must
+ * be passed).
+ *
+ * Will drop and regain and drop the slab lock. At the end the slab will
+ * either be freed or returned to the partial lists.
+ *
+ * Returns the number of remaining objects
+ */
+static int __kmem_cache_vacate(struct kmem_cache *s,
+		struct page *page, unsigned long flags, void *scratch)
+{
+	void **vector = scratch;
+	void *p;
+	void *addr = page_address(page);
+	DECLARE_BITMAP(map, s->objects);
+	int leftover;
+	int objects;
+	void *private;
+
+	if (!page->inuse)
+		goto out;
+
+	/* Determine used objects */
+	bitmap_fill(map, s->objects);
+	for_each_free_object(p, s, page->freelist)
+		__clear_bit(slab_index(p, s, addr), map);
+
+	objects = 0;
+	memset(vector, 0, s->objects * sizeof(void **));
+	for_each_object(p, s, addr) {
+		if (test_bit(slab_index(p, s, addr), map))
+			vector[objects++] = p;
+	}
+
+	private = s->ops->get(s, objects, vector);
+
+	/*
+	 * Got references. Now we can drop the slab lock. The slab
+	 * is frozen so it cannot vanish from under us nor will
+	 * allocations be performed on the slab. However, unlocking the
+	 * slab will allow concurrent slab_frees to proceed.
+	 */
+	slab_unlock(page);
+	local_irq_restore(flags);
+
+	/*
+	 * Perform the KICK callbacks to remove the objects.
+	 */
+	s->ops->kick(s, objects, vector, private);
+
+	local_irq_save(flags);
+	slab_lock(page);
+out:
+	/*
+	 * Check the result and unfreeze the slab
+	 */
+	leftover = page->inuse;
+	unfreeze_slab(s, page);
+	local_irq_restore(flags);
+	return leftover;
+}
+
+/*
+ * Sort the partial slabs by the number of items allocated.
+ * The slabs with the least objects come last.
+ */
+static unsigned long sort_partial_list(struct kmem_cache *s,
+	struct kmem_cache_node *n, void *scratch)
+{
+	struct list_head *slabs_by_inuse = scratch;
+	int i;
+ 	struct page *page;
+	struct page *t;
+	unsigned long freed = 0;
+
+	for (i = 0; i < s->objects; i++)
+		INIT_LIST_HEAD(slabs_by_inuse + i);
+
+	/*
+	 * Build lists indexed by the items in use in each slab.
+	 *
+	 * Note that concurrent frees may occur while we hold the
+	 * list_lock. page->inuse here is the upper limit.
+	 */
+	list_for_each_entry_safe(page, t, &n->partial, lru) {
+		if (!page->inuse && slab_trylock(page)) {
+			/*
+			 * Must hold slab lock here because slab_free
+			 * may have freed the last object and be
+			 * waiting to release the slab.
+			 */
+			list_del(&page->lru);
+			n->nr_partial--;
+			slab_unlock(page);
+			discard_slab(s, page);
+			freed++;
+		} else {
+			list_move(&page->lru,
+			slabs_by_inuse + page->inuse);
+		}
+	}
+
+	/*
+	 * Rebuild the partial list with the slabs filled up most
+	 * first and the least used slabs at the end.
+	 */
+	for (i = s->objects - 1; i >= 0; i--)
+		list_splice(slabs_by_inuse + i, n->partial.prev);
+
+	return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s,
+	struct kmem_cache_node *n, void *scratch)
+{
+	unsigned long flags;
+	struct page *page, *page2;
+ 	LIST_HEAD(zaplist);
+	int freed;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	freed = sort_partial_list(s, n, scratch);
+
+  	/*
+	 * If we have no functions available to defragment the slabs
+ 	 * then we are done.
+ 	 */
+ 	if (!s->ops->get || !s->ops->kick) {
+		spin_unlock_irqrestore(&n->list_lock, flags);
+ 		return freed;
+	}
+
+	/*
+	 * Take slabs with just a few objects off the tail of the now
+	 * ordered list. These are the slabs with the least objects
+	 * and those are likely easy to reclaim.
+	 */
+ 	while (n->nr_partial > MAX_PARTIAL) {
+ 		page = container_of(n->partial.prev, struct page, lru);
+
+		/*
+		 * We are holding the list_lock so we can only
+		 * trylock the slab
+		 */
+		if (page->inuse > s->objects / 4)
+			break;
+
+		if (!slab_trylock(page))
+			break;
+
+		list_move_tail(&page->lru, &zaplist);
+		n->nr_partial--;
+		SetSlabFrozen(page);
+		slab_unlock(page);
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	/* Now we can free objects in the slabs on the zaplist */
+	list_for_each_entry_safe(page, page2, &zaplist, lru) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		slab_lock(page);
+		if (__kmem_cache_vacate(s, page, flags, scratch) == 0)
+			freed++;
+	}
+	return freed;
+}
+
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
@@ -2398,71 +2587,88 @@ EXPORT_SYMBOL(kfree);
 int kmem_cache_shrink(struct kmem_cache *s)
 {
 	int node;
-	int i;
-	struct kmem_cache_node *n;
-	struct page *page;
-	struct page *t;
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
-	unsigned long flags;
+	void *scratch;
 
-	if (!slabs_by_inuse)
+	flush_all(s);
+
+	scratch = kmalloc(sizeof(struct list_head) * s->objects,
+								GFP_KERNEL);
+	if (!scratch)
 		return -ENOMEM;
 
-	flush_all(s);
-	for_each_online_node(node) {
-		n = get_node(s, node);
+	for_each_online_node(node)
+		__kmem_cache_shrink(s, get_node(s, node), scratch);
 
-		if (!n->nr_partial)
-			continue;
+	kfree(scratch);
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
 
-		for (i = 0; i < s->objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
+static unsigned long __kmem_cache_defrag(struct kmem_cache *s,
+				int percent, int node, void *scratch)
+{
+	unsigned long capacity;
+	unsigned long objects;
+	unsigned long ratio;
+	struct kmem_cache_node *n = get_node(s, node);
 
-		spin_lock_irqsave(&n->list_lock, flags);
+	/*
+	 * An insignificant number of partial slabs makes
+	 * the slab not interesting.
+	 */
+	if (n->nr_partial <= MAX_PARTIAL)
+		return 0;
 
-		/*
-		 * Build lists indexed by the items in use in each slab.
-		 *
-		 * Note that concurrent frees may occur while we hold the
-		 * list_lock. page->inuse here is the upper limit.
-		 */
-		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			if (!page->inuse && slab_trylock(page)) {
-				/*
-				 * Must hold slab lock here because slab_free
-				 * may have freed the last object and be
-				 * waiting to release the slab.
-				 */
-				list_del(&page->lru);
-				n->nr_partial--;
-				slab_unlock(page);
-				discard_slab(s, page);
-			} else {
-				if (n->nr_partial > MAX_PARTIAL)
-					list_move(&page->lru,
-					slabs_by_inuse + page->inuse);
-			}
-		}
+ 	/*
+	 * Calculate usage ratio
+ 	 */
+	capacity = atomic_long_read(&n->nr_slabs) * s->objects;
+	objects = capacity - n->nr_partial * s->objects + count_partial(n);
+	ratio = objects * 100 / capacity;
 
-		if (n->nr_partial <= MAX_PARTIAL)
-			goto out;
+	/*
+	 * If usage ratio is more than required then no
+	 * defragmentation
+	 */
+	if (ratio > percent)
+		return 0;
+
+	return __kmem_cache_shrink(s, n, scratch) << s->order;
+}
+
+/*
+ * Defrag slabs on the local node if fragmentation is higher
+ * than the given percentage
+ */
+int kmem_cache_defrag(int percent, int node)
+{
+	struct kmem_cache *s;
+	unsigned long pages = 0;
+	void *scratch;
+
+	down_read(&slub_lock);
+	list_for_each_entry(s, &slab_caches, list) {
 
 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * The slab cache must have defrag methods.
 		 */
-		for (i = s->objects - 1; i >= 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		if (!s->ops || !s->ops->kick)
+			continue;
 
-	out:
-		spin_unlock_irqrestore(&n->list_lock, flags);
+		scratch = kmalloc(sizeof(struct list_head) * s->objects,
+								GFP_KERNEL);
+		if (node == -1) {
+			for_each_online_node(node)
+				pages += __kmem_cache_defrag(s, percent,
+							node, scratch);
+		} else
+			pages += __kmem_cache_defrag(s, percent, node, scratch);
+		kfree(scratch);
 	}
-
-	kfree(slabs_by_inuse);
-	return 0;
+	up_read(&slub_lock);
+	return pages;
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
+EXPORT_SYMBOL(kmem_cache_defrag);
 
 /**
  * krealloc - reallocate memory. The contents will remain unchanged.
@@ -3122,19 +3328,6 @@ static int list_locations(struct kmem_ca
 	return n;
 }
 
-static unsigned long count_partial(struct kmem_cache_node *n)
-{
-	unsigned long flags;
-	unsigned long x = 0;
-	struct page *page;
-
-	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(page, &n->partial, lru)
-		x += page->inuse;
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	return x;
-}
-
 enum slab_stat_type {
 	SL_FULL,
 	SL_PARTIAL,
@@ -3290,6 +3483,20 @@ static ssize_t ops_show(struct kmem_cach
 		x += sprint_symbol(buf + x, (unsigned long)s->ctor);
 		x += sprintf(buf + x, "\n");
 	}
+
+	if (s->ops->get) {
+		x += sprintf(buf + x, "get : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->ops->get);
+		x += sprintf(buf + x, "\n");
+	}
+
+	if (s->ops->kick) {
+		x += sprintf(buf + x, "kick : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->ops->kick);
+		x += sprintf(buf + x, "\n");
+	}
 	return x;
 }
 SLAB_ATTR_RO(ops);
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c	2007-06-07 14:09:42.000000000 -0700
+++ slub/mm/slab.c	2007-06-07 14:10:59.000000000 -0700
@@ -2516,6 +2516,11 @@ int kmem_cache_shrink(struct kmem_cache 
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
+int kmem_cache_defrag(int percent, int node)
+{
+	return 0;
+}
+
 /**
  * kmem_cache_destroy - delete a cache
  * @cachep: the cache to destroy
Index: slub/mm/slob.c
===================================================================
--- slub.orig/mm/slob.c	2007-06-07 14:09:42.000000000 -0700
+++ slub/mm/slob.c	2007-06-07 14:10:59.000000000 -0700
@@ -591,6 +591,11 @@ int kmem_cache_shrink(struct kmem_cache 
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
+int kmem_cache_defrag(int percentage, int node)
+{
+	return 0;
+}
+
 int kmem_ptr_validate(struct kmem_cache *a, const void *b)
 {
 	return 0;

-- 


* [patch 03/12] SLUB: Extend slabinfo to support -D and -C options
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
  2007-06-07 21:55 ` [patch 01/12] SLUB: Add support for kmem_cache_ops clameter
  2007-06-07 21:55 ` [patch 02/12] SLUB: Slab defragmentation core functionality clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 04/12] SLUB: Slab defragmentation trigger clameter
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slab_defrag_slabinfo_updates --]
[-- Type: text/plain, Size: 3900 bytes --]

-D lists caches that support defragmentation

-C lists caches that use a ctor.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 Documentation/vm/slabinfo.c |   39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

Index: slub/Documentation/vm/slabinfo.c
===================================================================
--- slub.orig/Documentation/vm/slabinfo.c	2007-06-07 14:09:37.000000000 -0700
+++ slub/Documentation/vm/slabinfo.c	2007-06-07 14:12:27.000000000 -0700
@@ -30,6 +30,7 @@ struct slabinfo {
 	int hwcache_align, object_size, objs_per_slab;
 	int sanity_checks, slab_size, store_user, trace;
 	int order, poison, reclaim_account, red_zone;
+	int defrag, ctor;
 	unsigned long partial, objects, slabs;
 	int numa[MAX_NODES];
 	int numa_partial[MAX_NODES];
@@ -56,6 +57,8 @@ int show_slab = 0;
 int skip_zero = 1;
 int show_numa = 0;
 int show_track = 0;
+int show_defrag = 0;
+int show_ctor = 0;
 int show_first_alias = 0;
 int validate = 0;
 int shrink = 0;
@@ -90,18 +93,20 @@ void fatal(const char *x, ...)
 void usage(void)
 {
 	printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
-		"slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+		"slabinfo [-aCDefhilnosSrtTvz1] [-d debugopts] [slab-regexp]\n"
 		"-a|--aliases           Show aliases\n"
+		"-C|--ctor              Show slabs with ctors\n"
 		"-d<options>|--debug=<options> Set/Clear Debug options\n"
-		"-e|--empty		Show empty slabs\n"
+		"-D|--defrag            Show defragmentable caches\n"
+		"-e|--empty             Show empty slabs\n"
 		"-f|--first-alias       Show first alias\n"
 		"-h|--help              Show usage information\n"
 		"-i|--inverted          Inverted list\n"
 		"-l|--slabs             Show slabs\n"
 		"-n|--numa              Show NUMA information\n"
-		"-o|--ops		Show kmem_cache_ops\n"
+		"-o|--ops               Show kmem_cache_ops\n"
 		"-s|--shrink            Shrink slabs\n"
-		"-r|--report		Detailed report on single slabs\n"
+		"-r|--report            Detailed report on single slabs\n"
 		"-S|--Size              Sort by size\n"
 		"-t|--tracking          Show alloc/free information\n"
 		"-T|--Totals            Show summary information\n"
@@ -452,6 +457,12 @@ void slabcache(struct slabinfo *s)
 	if (show_empty && s->slabs)
 		return;
 
+	if (show_defrag && !s->defrag)
+		return;
+
+	if (show_ctor && !s->ctor)
+		return;
+
 	store_size(size_str, slab_size(s));
 	sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
 
@@ -462,6 +473,10 @@ void slabcache(struct slabinfo *s)
 		*p++ = '*';
 	if (s->cache_dma)
 		*p++ = 'd';
+	if (s->defrag)
+		*p++ = 'D';
+	if (s->ctor)
+		*p++ = 'C';
 	if (s->hwcache_align)
 		*p++ = 'A';
 	if (s->poison)
@@ -1072,6 +1087,12 @@ void read_slab_dir(void)
 			slab->store_user = get_obj("store_user");
 			slab->trace = get_obj("trace");
 			chdir("..");
+			if (read_slab_obj(slab, "ops")) {
+				if (strstr(buffer, "ctor :"))
+					slab->ctor = 1;
+				if (strstr(buffer, "kick :"))
+					slab->defrag = 1;
+			}
 			if (slab->name[0] == ':')
 				alias_targets++;
 			slab++;
@@ -1121,7 +1142,9 @@ void output_slabs(void)
 
 struct option opts[] = {
 	{ "aliases", 0, NULL, 'a' },
+	{ "ctor", 0, NULL, 'C' },
 	{ "debug", 2, NULL, 'd' },
+	{ "defrag", 0, NULL, 'D' },
 	{ "empty", 0, NULL, 'e' },
 	{ "first-alias", 0, NULL, 'f' },
 	{ "help", 0, NULL, 'h' },
@@ -1146,7 +1169,7 @@ int main(int argc, char *argv[])
 
 	page_size = getpagesize();
 
-	while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzTS",
+	while ((c = getopt_long(argc, argv, "ad::efhil1noprstvzCDTS",
 						opts, NULL)) != -1)
 	switch(c) {
 		case '1':
@@ -1196,6 +1219,12 @@ int main(int argc, char *argv[])
 		case 'z':
 			skip_zero = 0;
 			break;
+		case 'C':
+			show_ctor = 1;
+			break;
+		case 'D':
+			show_defrag = 1;
+			break;
 		case 'T':
 			show_totals = 1;
 			break;

-- 


* [patch 04/12] SLUB: Slab defragmentation trigger
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (2 preceding siblings ...)
  2007-06-07 21:55 ` [patch 03/12] SLUB: Extend slabinfo to support -D and -C options clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 05/12] Generic inode defragmentation clameter
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slab_defrag_trigger --]
[-- Type: text/plain, Size: 9281 bytes --]

At some point slab defragmentation needs to be triggered. The logical
point for this is after slab shrinking was performed in vmscan.c. At
that point the fragmentation ratio of a slab was increased by objects
being freed. So we call kmem_cache_defrag from there.

kmem_cache_defrag takes the defrag ratio to make the decision to
defrag a slab or not. We define a new VM tunable

slab_defrag_ratio

that contains the limit to trigger slab defragmentation.

shrink_slab() from vmscan.c is called in some contexts to do
global shrinking of slabs and in others to do shrinking for
a particular zone. Pass the zone to shrink_slab(), so that
shrink_slab() can call kmem_cache_defrag() for the node of that
zone and restrict the defragmentation to the node that is under
memory pressure.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 Documentation/sysctl/vm.txt |   25 +++++++++++++++++++++++++
 fs/drop_caches.c            |    2 +-
 include/linux/mm.h          |    2 +-
 include/linux/slab.h        |    1 +
 kernel/sysctl.c             |   10 ++++++++++
 mm/vmscan.c                 |   34 +++++++++++++++++++++++++++-------
 6 files changed, 65 insertions(+), 9 deletions(-)

Index: slub/Documentation/sysctl/vm.txt
===================================================================
--- slub.orig/Documentation/sysctl/vm.txt	2007-06-07 14:20:48.000000000 -0700
+++ slub/Documentation/sysctl/vm.txt	2007-06-07 14:22:35.000000000 -0700
@@ -35,6 +35,7 @@ Currently, these files are in /proc/sys/
 - swap_prefetch
 - swap_prefetch_delay
 - swap_prefetch_sleep
+- slab_defrag_ratio
 
 ==============================================================
 
@@ -300,3 +301,27 @@ sleep for when the ram is found to be fu
 further.
 
 The default value is 5.
+
+==============================================================
+
+slab_defrag_ratio
+
+After shrinking the slabs the system checks if slabs have a lower usage
+ratio than the percentage given here. If so then slab defragmentation is
+activated to increase the usage ratio of the slab and in order to free
+memory.
+
+This is the percentage of objects allocated of the total possible number
+of objects in a slab. A lower percentage signifies more fragmentation.
+
+Note slab defragmentation only works on slabs that have the proper methods
+defined (see /sys/slab/<slabname>/ops). When this text was written slab
+defragmentation was only supported by the dentry cache and the inode cache.
+
+The main purpose of the slab defragmentation is to address pathological
+situations in which large amounts of inodes or dentries have been
+removed from the system. That may leave lots of slabs around with just
+a few objects. Slab defragmentation removes these slabs.
+
+The default value is 30% meaning for 3 items in use we have 7 free
+and unused items.
Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h	2007-06-07 14:20:48.000000000 -0700
+++ slub/include/linux/slab.h	2007-06-07 14:22:35.000000000 -0700
@@ -85,6 +85,7 @@ void kmem_cache_free(struct kmem_cache *
 unsigned int kmem_cache_size(struct kmem_cache *);
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+int kmem_cache_defrag(int percentage, int node);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
Index: slub/kernel/sysctl.c
===================================================================
--- slub.orig/kernel/sysctl.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/kernel/sysctl.c	2007-06-07 14:22:35.000000000 -0700
@@ -81,6 +81,7 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
+extern int sysctl_slab_defrag_ratio;
 extern int audit_argv_kb;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -928,6 +929,15 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_ratio",
+		.data		= &sysctl_slab_defrag_ratio,
+		.maxlen		= sizeof(sysctl_slab_defrag_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.ctl_name	= VM_LEGACY_VA_LAYOUT,
Index: slub/mm/vmscan.c
===================================================================
--- slub.orig/mm/vmscan.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/mm/vmscan.c	2007-06-07 14:25:25.000000000 -0700
@@ -135,6 +135,12 @@ void unregister_shrinker(struct shrinker
 EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
+
+/*
+ * Slabs should be defragmented if less than 30% of objects are allocated.
+ */
+int sysctl_slab_defrag_ratio = 30;
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -152,10 +158,19 @@ EXPORT_SYMBOL(unregister_shrinker);
  * are eligible for the caller's allocation attempt.  It is used for balancing
  * slab reclaim versus page reclaim.
  *
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone can be be NULL. This is currently
+ * only used to limit slab defragmentation to a NUMA node. The performace
+ * of shrink_slab would be better (in particular under NUMA) if it could
+ * be targeted as a whole to a zone that is under memory pressure but
+ * the VFS datastructures do not allow that at the present time. As a
+ * result zone_reclaim must perform global slab reclaim in order
+ * to free up memory in a zone.
+ *
  * Returns the number of slab objects which we shrunk.
  */
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+			unsigned long lru_pages, struct zone *zone)
 {
 	struct shrinker *shrinker;
 	unsigned long ret = 0;
@@ -218,6 +233,8 @@ unsigned long shrink_slab(unsigned long 
 		shrinker->nr += total_scan;
 	}
 	up_read(&shrinker_rwsem);
+	kmem_cache_defrag(sysctl_slab_defrag_ratio,
+		zone ? zone_to_nid(zone) : -1);
 	return ret;
 }
 
@@ -1163,7 +1180,8 @@ unsigned long try_to_free_pages(struct z
 		if (!priority)
 			disable_swap_token();
 		nr_reclaimed += shrink_zones(priority, zones, &sc);
-		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages,
+						NULL);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -1333,7 +1351,7 @@ loop_again:
 			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-						lru_pages);
+						lru_pages, zone);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
@@ -1601,7 +1619,7 @@ unsigned long shrink_all_memory(unsigned
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
-		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
@@ -1639,7 +1657,7 @@ unsigned long shrink_all_memory(unsigned
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1656,7 +1674,8 @@ unsigned long shrink_all_memory(unsigned
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask,
+					count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
@@ -1816,7 +1835,8 @@ static int __zone_reclaim(struct zone *z
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+						zone) &&
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
 				slab_reclaimable - nr_pages)
 			;
Index: slub/fs/drop_caches.c
===================================================================
--- slub.orig/fs/drop_caches.c	2007-06-07 14:20:48.000000000 -0700
+++ slub/fs/drop_caches.c	2007-06-07 14:22:35.000000000 -0700
@@ -52,7 +52,7 @@ void drop_slab(void)
 	int nr_objects;
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
 	} while (nr_objects > 10);
 }
 
Index: slub/include/linux/mm.h
===================================================================
--- slub.orig/include/linux/mm.h	2007-06-07 14:20:48.000000000 -0700
+++ slub/include/linux/mm.h	2007-06-07 14:22:35.000000000 -0700
@@ -1240,7 +1240,7 @@ int in_gate_area_no_task(unsigned long a
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages);
+			unsigned long lru_pages, struct zone *zone);
 extern void drop_pagecache_sb(struct super_block *);
 void drop_pagecache(void);
 void drop_slab(void);

-- 


* [patch 05/12] Generic inode defragmentation
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (3 preceding siblings ...)
  2007-06-07 21:55 ` [patch 04/12] SLUB: Slab defragmentation trigger clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 06/12] ext2 ext3 ext4: support inode slab defragmentation clameter
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_inode_generic --]
[-- Type: text/plain, Size: 4035 bytes --]

This implements the ability to remove the inodes in a particular slab
from the inode cache. In order to remove an inode we may have to write
out the pages of the inode and the inode itself, and remove the dentries
referring to the inode.

Provide generic functionality that can be used by filesystems that have
their own inode caches to also tie into the defragmentation functions
that are made available here.
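
For a filesystem whose private inode structure embeds the VFS inode, wiring
into these helpers takes only a few lines. A hypothetical "foofs" example
(this is the same pattern the ext2/3/4 and XFS patches later in this series
use):

struct foofs_inode_info {
	unsigned long foo_private;
	struct inode vfs_inode;
};

static void *foofs_get_inodes(struct kmem_cache *s, int nr, void **v)
{
	return fs_get_inodes(s, nr, v,
		offsetof(struct foofs_inode_info, vfs_inode));
}

static struct kmem_cache_ops foofs_kmem_cache_ops = {
	.get	= foofs_get_inodes,
	.kick	= kick_inodes,
};

The resulting ops structure is then passed to kmem_cache_create() (or
KMEM_CACHE_OPS()) when the filesystem sets up its inode cache.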

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/inode.c         |  100 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    5 ++
 2 files changed, 104 insertions(+), 1 deletion(-)

Index: slub/fs/inode.c
===================================================================
--- slub.orig/fs/inode.c	2007-06-07 14:24:33.000000000 -0700
+++ slub/fs/inode.c	2007-06-07 14:28:04.000000000 -0700
@@ -1351,6 +1351,104 @@ static int __init set_ihash_entries(char
 }
 __setup("ihash_entries=", set_ihash_entries);
 
+static void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	int i;
+
+	spin_lock(&inode_lock);
+	for (i = 0; i < nr; i++) {
+		struct inode *inode = v[i];
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			v[i] = NULL;
+		else
+			__iget(inode);
+	}
+	spin_unlock(&inode_lock);
+	return NULL;
+}
+
+/*
+ * Function for filesystems that embedd struct inode into their own
+ * structures. The offset is the offset of the struct inode in the fs inode.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v, unsigned long offset)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		v[i] += offset;
+
+	return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+	struct inode *inode;
+	int i;
+	int abort = 0;
+	LIST_HEAD(freeable);
+	struct super_block *sb;
+
+	for (i = 0; i < nr; i++) {
+		inode = v[i];
+		if (!inode)
+			continue;
+
+		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+			if (remove_inode_buffers(inode))
+				invalidate_mapping_pages(&inode->i_data,
+								0, -1);
+		}
+
+		/* Invalidate children and dentry */
+		if (S_ISDIR(inode->i_mode)) {
+			struct dentry *d = d_find_alias(inode);
+
+			if (d) {
+				d_invalidate(d);
+				dput(d);
+			}
+		}
+
+		if (inode->i_state & I_DIRTY)
+			write_inode_now(inode, 1);
+
+		d_prune_aliases(inode);
+	}
+
+	mutex_lock(&iprune_mutex);
+	for (i = 0; i < nr; i++) {
+		inode = v[i];
+		if (!inode)
+			continue;
+
+		sb = inode->i_sb;
+		iput(inode);
+		if (abort || !(sb->s_flags & MS_ACTIVE))
+			continue;
+
+		spin_lock(&inode_lock);
+		abort =  !can_unuse(inode);
+
+		if (!abort) {
+			list_move(&inode->i_list, &freeable);
+			inode->i_state |= I_FREEING;
+			inodes_stat.nr_unused--;
+		}
+		spin_unlock(&inode_lock);
+	}
+	dispose_list(&freeable);
+	mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
+static struct kmem_cache_ops inode_kmem_cache_ops = {
+	.get = get_inodes,
+	.kick = kick_inodes
+};
+
 /*
  * Initialize the waitqueues and inode hash table.
  */
@@ -1389,7 +1487,7 @@ void __init inode_init(unsigned long mem
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 					 SLAB_MEM_SPREAD),
 					 init_once,
-					 NULL);
+					 &inode_kmem_cache_ops);
 	register_shrinker(&icache_shrinker);
 
 	/* Hash may have been set up in inode_init_early */
Index: slub/include/linux/fs.h
===================================================================
--- slub.orig/include/linux/fs.h	2007-06-07 14:24:33.000000000 -0700
+++ slub/include/linux/fs.h	2007-06-07 14:28:44.000000000 -0700
@@ -1790,6 +1790,11 @@ static inline void insert_inode_hash(str
 	__insert_inode_hash(inode, inode->i_ino);
 }
 
+/* Helper functions for inode defragmentation support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+						unsigned long offset);
+
 extern struct file * get_empty_filp(void);
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);

-- 


* [patch 06/12] ext2 ext3 ext4: support inode slab defragmentation
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (4 preceding siblings ...)
  2007-06-07 21:55 ` [patch 05/12] Generic inode defragmentation clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 07/12] xfs: inode defragmentation support clameter
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_fs_ext234 --]
[-- Type: text/plain, Size: 3106 bytes --]

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/ext2/super.c |   16 ++++++++++++++--
 fs/ext3/super.c |   14 +++++++++++++-
 fs/ext4/super.c |   14 +++++++++++++-
 3 files changed, 40 insertions(+), 4 deletions(-)

Index: slub/fs/ext2/super.c
===================================================================
--- slub.orig/fs/ext2/super.c	2007-06-07 14:09:36.000000000 -0700
+++ slub/fs/ext2/super.c	2007-06-07 14:28:47.000000000 -0700
@@ -168,14 +168,26 @@ static void init_once(void * foo, struct
 	mutex_init(&ei->truncate_mutex);
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
+static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext2_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext2_kmem_cache_ops = {
+	.get = ext2_get_inodes,
+	.kick = kick_inodes
+};
+
 static int init_inodecache(void)
 {
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once,
+					     &ext2_kmem_cache_ops);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
Index: slub/fs/ext3/super.c
===================================================================
--- slub.orig/fs/ext3/super.c	2007-06-07 14:09:36.000000000 -0700
+++ slub/fs/ext3/super.c	2007-06-07 14:28:47.000000000 -0700
@@ -483,13 +483,25 @@ static void init_once(void * foo, struct
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext3_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext3_kmem_cache_ops = {
+	.get = ext3_get_inodes,
+	.kick = kick_inodes
+};
+
 static int init_inodecache(void)
 {
 	ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
 					     sizeof(struct ext3_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once,
+					     &ext3_kmem_cache_ops);
 	if (ext3_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
Index: slub/fs/ext4/super.c
===================================================================
--- slub.orig/fs/ext4/super.c	2007-06-07 14:09:36.000000000 -0700
+++ slub/fs/ext4/super.c	2007-06-07 14:29:49.000000000 -0700
@@ -543,13 +543,25 @@ static void init_once(void * foo, struct
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext4_inode_info, vfs_inode));
+}
+
+static struct kmem_cache_ops ext4_kmem_cache_ops = {
+	.get = ext4_get_inodes,
+	.kick = kick_inodes
+};
+
 static int init_inodecache(void)
 {
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once,
+					     &ext4_kmem_cache_ops);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;

-- 


* [patch 07/12] xfs: inode defragmentation support
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (5 preceding siblings ...)
  2007-06-07 21:55 ` [patch 06/12] ext2 ext3 ext4: support inode slab defragmentation clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 08/12] procfs: " clameter
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_fs_xfs --]
[-- Type: text/plain, Size: 3279 bytes --]

Add slab defrag support.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/xfs/linux-2.6/kmem.h      |    5 +++--
 fs/xfs/linux-2.6/xfs_buf.c   |    2 +-
 fs/xfs/linux-2.6/xfs_super.c |   13 ++++++++++++-
 fs/xfs/xfs_vfsops.c          |    6 +++---
 4 files changed, 19 insertions(+), 7 deletions(-)

Index: slub/fs/xfs/linux-2.6/kmem.h
===================================================================
--- slub.orig/fs/xfs/linux-2.6/kmem.h	2007-06-06 13:08:09.000000000 -0700
+++ slub/fs/xfs/linux-2.6/kmem.h	2007-06-06 13:32:58.000000000 -0700
@@ -79,9 +79,10 @@ kmem_zone_init(int size, char *zone_name
 
 static inline kmem_zone_t *
 kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
-		     void (*construct)(void *, kmem_zone_t *, unsigned long))
+		     void (*construct)(void *, kmem_zone_t *, unsigned long),
+		     const struct kmem_cache_ops *ops)
 {
-	return kmem_cache_create(zone_name, size, 0, flags, construct, NULL);
+	return kmem_cache_create(zone_name, size, 0, flags, construct, ops);
 }
 
 static inline void
Index: slub/fs/xfs/linux-2.6/xfs_buf.c
===================================================================
--- slub.orig/fs/xfs/linux-2.6/xfs_buf.c	2007-06-06 13:08:09.000000000 -0700
+++ slub/fs/xfs/linux-2.6/xfs_buf.c	2007-06-06 13:32:58.000000000 -0700
@@ -1834,7 +1834,7 @@ xfs_buf_init(void)
 #endif
 
 	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
-						KM_ZONE_HWALIGN, NULL);
+						KM_ZONE_HWALIGN, NULL, NULL);
 	if (!xfs_buf_zone)
 		goto out_free_trace_buf;
 
Index: slub/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- slub.orig/fs/xfs/linux-2.6/xfs_super.c	2007-06-06 13:08:09.000000000 -0700
+++ slub/fs/xfs/linux-2.6/xfs_super.c	2007-06-06 13:32:58.000000000 -0700
@@ -355,13 +355,24 @@ xfs_fs_inode_init_once(
 	inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }
 
+static void *xfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v, offsetof(bhv_vnode_t, v_inode));
+};
+
+static struct kmem_cache_ops xfs_kmem_cache_ops = {
+	.get = xfs_get_inodes,
+	.kick = kick_inodes
+};
+
 STATIC int
 xfs_init_zones(void)
 {
 	xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
 					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
 					KM_ZONE_SPREAD,
-					xfs_fs_inode_init_once);
+					xfs_fs_inode_init_once,
+					&xfs_kmem_cache_ops);
 	if (!xfs_vnode_zone)
 		goto out;
 
Index: slub/fs/xfs/xfs_vfsops.c
===================================================================
--- slub.orig/fs/xfs/xfs_vfsops.c	2007-06-06 15:19:52.000000000 -0700
+++ slub/fs/xfs/xfs_vfsops.c	2007-06-06 15:20:36.000000000 -0700
@@ -109,13 +109,13 @@ xfs_init(void)
 	xfs_inode_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
 					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-					KM_ZONE_SPREAD, NULL);
+					KM_ZONE_SPREAD, NULL, NULL);
 	xfs_ili_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
-					KM_ZONE_SPREAD, NULL);
+					KM_ZONE_SPREAD, NULL, NULL);
 	xfs_chashlist_zone =
 		kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist",
-					KM_ZONE_SPREAD, NULL);
+					KM_ZONE_SPREAD, NULL, NULL);
 
 	/*
 	 * Allocate global trace buffers.

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 08/12] procfs: inode defragmentation support
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (6 preceding siblings ...)
  2007-06-07 21:55 ` [patch 07/12] xfs: inode defragmentation support clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 09/12] reiserfs: " clameter
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_fs_proc --]
[-- Type: text/plain, Size: 1096 bytes --]

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/proc/inode.c |   22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

Index: slub/fs/proc/inode.c
===================================================================
--- slub.orig/fs/proc/inode.c	2007-06-04 20:12:56.000000000 -0700
+++ slub/fs/proc/inode.c	2007-06-04 21:35:00.000000000 -0700
@@ -112,14 +112,25 @@ static void init_once(void * foo, struct
 
 	inode_init_once(&ei->vfs_inode);
 }
- 
+
+static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+			offsetof(struct proc_inode, vfs_inode));
+};
+
+static struct kmem_cache_ops proc_kmem_cache_ops = {
+	.get = proc_get_inodes,
+	.kick = kick_inodes
+};
+
 int __init proc_init_inodecache(void)
 {
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+					     init_once, &proc_kmem_cache_ops);
 	if (proc_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 09/12] reiserfs: inode defragmentation support
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (7 preceding siblings ...)
  2007-06-07 21:55 ` [patch 08/12] procfs: " clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 10/12] sockets: " clameter
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_fs_reiser --]
[-- Type: text/plain, Size: 1168 bytes --]

Add inode defrag support.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/reiserfs/super.c |   14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

Index: slub/fs/reiserfs/super.c
===================================================================
--- slub.orig/fs/reiserfs/super.c	2007-06-07 14:09:36.000000000 -0700
+++ slub/fs/reiserfs/super.c	2007-06-07 14:30:49.000000000 -0700
@@ -520,6 +520,17 @@ static void init_once(void *foo, struct 
 #endif
 }
 
+static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+			offsetof(struct reiserfs_inode_info, vfs_inode));
+}
+
+struct kmem_cache_ops reiserfs_kmem_cache_ops = {
+	.get = reiserfs_get_inodes,
+	.kick = kick_inodes
+};
+
 static int init_inodecache(void)
 {
 	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
@@ -527,7 +538,8 @@ static int init_inodecache(void)
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
 							SLAB_MEM_SPREAD),
-						  init_once, NULL);
+						  init_once,
+						  &reiserfs_kmem_cache_ops);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 10/12] sockets: inode defragmentation support
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (8 preceding siblings ...)
  2007-06-07 21:55 ` [patch 09/12] reiserfs: " clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 11/12] Dentry defragmentation clameter
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_fs_socket --]
[-- Type: text/plain, Size: 1086 bytes --]

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 net/socket.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

Index: slub/net/socket.c
===================================================================
--- slub.orig/net/socket.c	2007-06-06 15:19:29.000000000 -0700
+++ slub/net/socket.c	2007-06-06 15:20:54.000000000 -0700
@@ -264,6 +264,17 @@ static void init_once(void *foo, struct 
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *sock_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct socket_alloc, vfs_inode));
+}
+
+static struct kmem_cache_ops sock_kmem_cache_ops = {
+	.get = sock_get_inodes,
+	.kick = kick_inodes
+};
+
 static int init_inodecache(void)
 {
 	sock_inode_cachep = kmem_cache_create("sock_inode_cache",
@@ -273,7 +284,7 @@ static int init_inodecache(void)
 					       SLAB_RECLAIM_ACCOUNT |
 					       SLAB_MEM_SPREAD),
 					      init_once,
-					      NULL);
+					      &sock_kmem_cache_ops);
 	if (sock_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 11/12] Dentry defragmentation
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (9 preceding siblings ...)
  2007-06-07 21:55 ` [patch 10/12] sockets: " clameter
@ 2007-06-07 21:55 ` clameter
  2007-06-07 21:55 ` [patch 12/12] SLUB: Support memory defrag through kmem_cache_vacate() clameter
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slub_defrag_dentry --]
[-- Type: text/plain, Size: 4793 bytes --]

get() takes the dcache lock and then uses dget_locked() to obtain a
reference to the dentry. An additional complication is that the dentry
may be in the process of being freed, or it may just have been
allocated. We add a new flag to d_flags so that the status of an
object can be determined.

kick() is called after get() has been used and after the slab has dropped
all of its own locks. The dentry pruning for unused entries works in a
straightforward way.
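
As an aside, here is a minimal sketch of the calling convention the two
methods rely on. This is not part of the patch: the real driver lives in
the defrag core (patch 02/12), and drop_slab_locks() below is a made-up
stand-in for the SLUB internals.

/*
 * Sketch only: how the slab core drives the two callbacks for the
 * objects v[0..nr-1] of one slab page. s->ops->get()/->kick() are the
 * methods wired up by this patch; drop_slab_locks() is hypothetical.
 */
static void vacate_one_slab_sketch(struct kmem_cache *s, int nr, void **v)
{
	void *private;

	/* Phase 1: slab locked, frees held off -> pin each object. */
	private = s->ops->get(s, nr, v);

	/* Phase 2: slab locks dropped -> evict the pinned objects. */
	drop_slab_locks();
	s->ops->kick(s, nr, v, private);
}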

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 fs/dcache.c            |  112 +++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/dcache.h |    5 ++
 2 files changed, 109 insertions(+), 8 deletions(-)

Index: slub/fs/dcache.c
===================================================================
--- slub.orig/fs/dcache.c	2007-06-07 14:31:24.000000000 -0700
+++ slub/fs/dcache.c	2007-06-07 14:31:39.000000000 -0700
@@ -135,6 +135,7 @@ static struct dentry *d_kill(struct dent
 
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
+	dentry->d_flags &= ~DCACHE_ENTRY_VALID;
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
@@ -951,6 +952,7 @@ struct dentry *d_alloc(struct dentry * p
 	if (parent)
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	dentry_stat.nr_dentry++;
+	dentry->d_flags |= DCACHE_ENTRY_VALID;
 	spin_unlock(&dcache_lock);
 
 	return dentry;
@@ -2108,18 +2110,112 @@ static void __init dcache_init_early(voi
 		INIT_HLIST_HEAD(&dentry_hashtable[loop]);
 }
 
+/*
+ * The slab is holding off frees. Thus we can safely examine
+ * the object without the danger of it vanishing from under us.
+ */
+static void *get_dentries(struct kmem_cache *s, int nr, void **v)
+{
+	struct dentry *dentry;
+	int i;
+
+	spin_lock(&dcache_lock);
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+		/*
+		 * if DCACHE_ENTRY_VALID is not set then the dentry
+		 * may be already in the process of being freed.
+		 */
+		if (!(dentry->d_flags & DCACHE_ENTRY_VALID))
+			v[i] = NULL;
+		else
+			dget_locked(dentry);
+	}
+	spin_unlock(&dcache_lock);
+	return 0;
+}
+
+/*
+ * Slab has dropped all the locks. Get rid of the
+ * refcount we obtained earlier and also get rid of the
+ * object.
+ */
+static void kick_dentries(struct kmem_cache *s, int nr, void **v, void *private)
+{
+	struct dentry *dentry;
+	int abort = 0;
+	int i;
+
+	/*
+	 * First invalidate the dentries without holding the dcache lock
+	 */
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+
+		if (dentry)
+			d_invalidate(dentry);
+	}
+
+	/*
+	 * If we are the last one holding a reference then the dentries can
+	 * be freed. We  need the dcache_lock.
+	 */
+	spin_lock(&dcache_lock);
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+		if (!dentry)
+			continue;
+
+		if (abort)
+			goto put_dentry;
+
+		spin_lock(&dentry->d_lock);
+		if (atomic_read(&dentry->d_count) > 1) {
+			/*
+			 * Reference count was increased.
+			 * We need to abandon the freeing of
+			 * objects.
+			 */
+			abort = 1;
+			spin_unlock(&dentry->d_lock);
+put_dentry:
+			spin_unlock(&dcache_lock);
+			dput(dentry);
+			spin_lock(&dcache_lock);
+			continue;
+		}
+
+		/* Remove from LRU */
+		if (!list_empty(&dentry->d_lru)) {
+			dentry_stat.nr_unused--;
+			list_del_init(&dentry->d_lru);
+		}
+		/* Drop the entry */
+		prune_one_dentry(dentry, 1);
+	}
+	spin_unlock(&dcache_lock);
+
+	/*
+	 * dentries are freed using RCU so we need to wait until RCU
+	 * operations are complete
+	 */
+	if (!abort)
+		synchronize_rcu();
+}
+
+static struct kmem_cache_ops dentry_kmem_cache_ops = {
+	.get = get_dentries,
+	.kick = kick_dentries,
+};
+
 static void __init dcache_init(unsigned long mempages)
 {
 	int loop;
 
-	/* 
-	 * A constructor could be added for stable state like the lists,
-	 * but it is probably not worth it because of the cache nature
-	 * of the dcache. 
-	 */
-	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-	
+	dentry_cache = KMEM_CACHE_OPS(dentry,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD,
+		&dentry_kmem_cache_ops);
+
 	register_shrinker(&dcache_shrinker);
 
 	/* Hash may have been set up in dcache_init_early */
Index: slub/include/linux/dcache.h
===================================================================
--- slub.orig/include/linux/dcache.h	2007-06-07 14:31:24.000000000 -0700
+++ slub/include/linux/dcache.h	2007-06-07 14:32:35.000000000 -0700
@@ -177,6 +177,11 @@ d_iput:		no		no		no       yes
 
 #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
 
+#define DCACHE_ENTRY_VALID	0x0040	/*
+					 * Entry is valid and not in the
+					 * process of being created or
+					 * destroyed.
+					 */
 extern spinlock_t dcache_lock;
 
 /**

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 12/12] SLUB: Support memory defrag through kmem_cache_vacate()
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (10 preceding siblings ...)
  2007-06-07 21:55 ` [patch 11/12] Dentry defragmentation clameter
@ 2007-06-07 21:55 ` clameter
       [not found] ` <6bffcb0e0706080239w5cfe8594sbf5564dacd48936f@mail.gmail.com>
  2007-06-08 18:02 ` Michal Piotrowski
  13 siblings, 0 replies; 23+ messages in thread
From: clameter @ 2007-06-07 21:55 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

[-- Attachment #1: slab_defrag_kmem_cache_vacate --]
[-- Type: text/plain, Size: 6794 bytes --]

Add a special function kmem_cache_vacate() to push out the objects in a
specified slab. In order to make that work we have to handle slab page
allocations in such a way that we can determine whether a slab is valid
whenever we access it, regardless of where it is in its lifetime.

A valid slab that can be freed has PageSlab(page) and page->inuse > 0 set.
So we need to make sure in allocate_slab() that page->inuse is zero before
PageSlab is set; otherwise kmem_cache_vacate() may operate on a slab that
has not been properly set up yet.

There is currently no in-kernel user. The hope is that Mel's
defragmentation method can at some point use this functionality to make
slabs movable, so that the reclaimable page type may no longer be
necessary.
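
For illustration, a hedged sketch of what such a caller might look like
(purely hypothetical: only PageSlab() and kmem_cache_vacate() are from
this patchset, the function around them is an assumption):

/*
 * Hypothetical caller: an anti-fragmentation pass that wants to empty
 * a particular slab page so the underlying page can be reclaimed or
 * moved.
 */
static int try_to_empty_slab_page(struct page *page)
{
	if (!PageSlab(page))
		return 0;	/* not (or no longer) a slab page */

	/*
	 * kmem_cache_vacate() takes its own page reference, locks the
	 * slab and uses the cache's ->get()/->kick() methods to push
	 * the objects out. It returns 1 on success and 0 if it had to
	 * give up (no defrag methods, slab busy, already frozen, ...).
	 */
	return kmem_cache_vacate(page);
}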

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/slab.h |    1 
 mm/slab.c            |    9 ++++
 mm/slob.c            |    9 ++++
 mm/slub.c            |  109 ++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 119 insertions(+), 9 deletions(-)

Index: slub/include/linux/slab.h
===================================================================
--- slub.orig/include/linux/slab.h	2007-06-07 14:36:09.000000000 -0700
+++ slub/include/linux/slab.h	2007-06-07 14:36:15.000000000 -0700
@@ -86,6 +86,7 @@ unsigned int kmem_cache_size(struct kmem
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 int kmem_cache_defrag(int percentage, int node);
+int kmem_cache_vacate(struct page *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c	2007-06-07 14:36:09.000000000 -0700
+++ slub/mm/slab.c	2007-06-07 14:36:15.000000000 -0700
@@ -2521,6 +2521,15 @@ int kmem_cache_defrag(int percent, int n
 	return 0;
 }
 
+/*
+ * SLAB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
 /**
  * kmem_cache_destroy - delete a cache
  * @cachep: the cache to destroy
Index: slub/mm/slob.c
===================================================================
--- slub.orig/mm/slob.c	2007-06-07 14:36:09.000000000 -0700
+++ slub/mm/slob.c	2007-06-07 14:36:15.000000000 -0700
@@ -596,6 +596,15 @@ int kmem_cache_defrag(int percentage, in
 	return 0;
 }
 
+/*
+ * SLOB does not support slab defragmentation
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	return 0;
+}
+EXPORT_SYMBOL(kmem_cache_vacate);
+
 int kmem_ptr_validate(struct kmem_cache *a, const void *b)
 {
 	return 0;
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-07 14:11:29.000000000 -0700
+++ slub/mm/slub.c	2007-06-07 14:36:15.000000000 -0700
@@ -1032,6 +1032,7 @@ static inline int slab_pad_check(struct 
 static inline int check_object(struct kmem_cache *s, struct page *page,
 			void *object, int active) { return 1; }
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct page *page) {}
 static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
 #define slub_debug 0
 #endif
@@ -1097,12 +1098,11 @@ static struct page *new_slab(struct kmem
 	n = get_node(s, page_to_nid(page));
 	if (n)
 		atomic_long_inc(&n->nr_slabs);
+
+	page->inuse = 0;
+	page->lockless_freelist = NULL;
 	page->offset = s->offset / sizeof(void *);
 	page->slab = s;
-	page->flags |= 1 << PG_slab;
-	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
-			SLAB_STORE_USER | SLAB_TRACE))
-		SetSlabDebug(page);
 
 	start = page_address(page);
 	end = start + s->objects * s->size;
@@ -1120,11 +1120,20 @@ static struct page *new_slab(struct kmem
 	set_freepointer(s, last, NULL);
 
 	page->freelist = start;
-	page->lockless_freelist = NULL;
-	page->inuse = 0;
-out:
-	if (flags & __GFP_WAIT)
-		local_irq_disable();
+
+	/*
+	 * page->inuse must be 0 when PageSlab(page) becomes
+	 * true so that defrag knows that this slab is not in use.
+	 */
+	smp_wmb();
+	__SetPageSlab(page);
+	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
+			SLAB_STORE_USER | SLAB_TRACE))
+		SetSlabDebug(page);
+
+out:
+	if (flags & __GFP_WAIT)
+		local_irq_disable();
 	return page;
 }
 
@@ -2575,6 +2584,88 @@ static unsigned long __kmem_cache_shrink
 }
 
 /*
+ * Get a page off a list and freeze it. Must be holding slab lock.
+ */
+static void freeze_from_list(struct kmem_cache *s, struct page *page)
+{
+	if (page->inuse < s->objects)
+		remove_partial(s, page);
+	else if (s->flags & SLAB_STORE_USER)
+		remove_full(s, page);
+	SetSlabFrozen(page);
+}
+
+/*
+ * Attempt to free objects in a page. Return 1 if successful.
+ */
+int kmem_cache_vacate(struct page *page)
+{
+	unsigned long flags;
+	struct kmem_cache *s;
+	int vacated = 0;
+	void **vector = NULL;
+
+	/*
+	 * Get a reference to the page. Return if its freed or being freed.
+	 * This is necessary to make sure that the page does not vanish
+	 * from under us before we are able to check the result.
+	 */
+	if (!get_page_unless_zero(page))
+		return 0;
+
+	if (!PageSlab(page))
+		goto out;
+
+	s = page->slab;
+	if (!s)
+		goto out;
+
+	vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL);
+	if (!vector)
+		goto out2;
+
+	local_irq_save(flags);
+	/*
+	 * The implicit memory barrier in slab_lock guarantees that page->inuse
+	 * is loaded after PageSlab(page) has been established to be true. This is
+	 * only relevant for a newly created slab.
+	 */
+	slab_lock(page);
+
+	/*
+	 * We may now have locked a page that may be in various stages of
+	 * being freed. If the PageSlab bit is off then we have already
+	 * reached the page allocator. If page->inuse is zero then we are
+	 * in SLUB but freeing or allocating the page.
+	 * page->inuse is never modified without the slab lock held.
+	 *
+	 * Also abort if the page happens to be already frozen. If its
+	 * frozen then a concurrent vacate may be in progress.
+	 */
+	if (!PageSlab(page) || SlabFrozen(page) || !page->inuse)
+		goto out_locked;
+
+	/*
+	 * We are holding a lock on a slab page and all operations on the
+	 * slab are blocking.
+	 */
+	if (!s->ops->get || !s->ops->kick)
+		goto out_locked;
+	freeze_from_list(s, page);
+	vacated = __kmem_cache_vacate(s, page, flags, vector);
+out:
+	kfree(vector);
+out2:
+	put_page(page);
+	return vacated == 0;
+out_locked:
+	slab_unlock(page);
+	local_irq_restore(flags);
+	goto out;
+
+}
+
+/*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
  * most items in use come first. New allocations will then fill those up

-- 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
       [not found] ` <6bffcb0e0706080239w5cfe8594sbf5564dacd48936f@mail.gmail.com>
@ 2007-06-08 15:16   ` Christoph Lameter
  2007-06-08 15:28   ` Christoph Lameter
  1 sibling, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 15:16 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Hi Christoph,
> 
> On 07/06/07, clameter@sgi.com <clameter@sgi.com> wrote:
> > Will show up shortly at
> http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/
> 
> I tried to apply this patchset, but without success. I tried
> 2.6.22-rc4-mm2, 2.6.22-rc4, 2.6.22-rc4-git2, 2.6.22-rc3...

What was the problem?


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
       [not found] ` <6bffcb0e0706080239w5cfe8594sbf5564dacd48936f@mail.gmail.com>
  2007-06-08 15:16   ` [patch 00/12] Slab defragmentation V3 Christoph Lameter
@ 2007-06-08 15:28   ` Christoph Lameter
  1 sibling, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 15:28 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Hi Christoph,
> 
> On 07/06/07, clameter@sgi.com <clameter@sgi.com> wrote:
> > Will show up shortly at
> http://ftp.kernel.org/pub/linux/kernel/people/christoph/slab-defrag/
> 
> I tried to apply this patchset, but without success. I tried
> 2.6.22-rc4-mm2, 2.6.22-rc4, 2.6.22-rc4-git2, 2.6.22-rc3...

Yeah, it's against 2.6.22-rc4-mm1, and 2.6.22-rc4-mm2 changes kernel/sysctl.c
so that the defrag trigger patch fails. Sigh.

I added kernel versions below slab-defrag so that you can find the correct 
version for your kernel.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
                   ` (12 preceding siblings ...)
       [not found] ` <6bffcb0e0706080239w5cfe8594sbf5564dacd48936f@mail.gmail.com>
@ 2007-06-08 18:02 ` Michal Piotrowski
  2007-06-08 18:16   ` Christoph Lameter
  13 siblings, 1 reply; 23+ messages in thread
From: Michal Piotrowski @ 2007-06-08 18:02 UTC (permalink / raw)
  To: clameter; +Cc: akpm, linux-kernel, linux-mm, dgc, Michal Piotrowski, Mel Gorman

bash shared mapping + your script in a loop
while true;  do sudo ./run.sh; done > res3.txt


[ 2866.154597] =======================================================
[ 2866.162384] [ INFO: possible circular locking dependency detected ]
[ 2866.168698] 2.6.22-rc4-mm2 #1
[ 2866.171671] -------------------------------------------------------
[ 2866.177972] bash-shared-map/3245 is trying to acquire lock:
[ 2866.183566]  (slub_lock){----}, at: [<c0482510>] kmem_cache_defrag+0x18/0xb3

l *kmem_cache_defrag+0x18
0xc1082510 is in kmem_cache_defrag (mm/slub.c:2742).
2737            struct kmem_cache *s;
2738            unsigned long pages = 0;
2739            void *scratch;
2740
2741            down_read(&slub_lock);
2742            list_for_each_entry(s, &slab_caches, list) {
2743
2744                    /*
2745                     * The slab cache must have defrag methods.
2746                     */


[ 2866.190800] 
[ 2866.190801] but task is already holding lock:
[ 2866.196746]  (&inode->i_alloc_sem){--..}, at: [<c0498b07>] notify_change+0xdf/0x2ec

l *notify_change+0xdf
0xc1098b07 is in notify_change (fs/attr.c:145).
140                     return 0;
141
142             if (ia_valid & ATTR_SIZE)
143                     down_write(&dentry->d_inode->i_alloc_sem);
144
145             if (inode->i_op && inode->i_op->setattr) {
146                     error = security_inode_setattr(dentry, attr);
147                     if (!error)
148                             error = inode->i_op->setattr(dentry, attr);
149             } else {


[ 2866.204761] 
[ 2866.204762] which lock already depends on the new lock.
[ 2866.204764] 
[ 2866.213058] 
[ 2866.213060] the existing dependency chain (in reverse order) is:
[ 2866.220630] 
[ 2866.220631] -> #2 (&inode->i_alloc_sem){--..}:
[ 2866.226784]        [<c0441df1>] add_lock_to_list+0x67/0x8b
[ 2866.232525]        [<c0444bb9>] __lock_acquire+0xb02/0xd36
[ 2866.238315]        [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.243702]        [<c043c0c5>] down_write+0x3e/0x77
[ 2866.248914]        [<c0498b07>] notify_change+0xdf/0x2ec
[ 2866.254542]        [<c0484161>] do_truncate+0x60/0x79
[ 2866.259927]        [<c048d5fe>] may_open+0x1db/0x240
[ 2866.265165]        [<c048fbbd>] open_namei+0x2d6/0x6bb
[ 2866.270602]        [<c0483a5d>] do_filp_open+0x26/0x3b
[ 2866.275996]        [<c0483acf>] do_sys_open+0x5d/0xed
[ 2866.281382]        [<c0483b97>] sys_open+0x1c/0x1e
[ 2866.286508]        [<c0404182>] sysenter_past_esp+0x5f/0x99
[ 2866.292428]        [<b7f9d410>] 0xb7f9d410
[ 2866.296819]        [<ffffffff>] 0xffffffff
[ 2866.301177] 
[ 2866.301178] -> #1 (&sysfs_inode_imutex_key){--..}:
[ 2866.307632]        [<c0441df1>] add_lock_to_list+0x67/0x8b
[ 2866.313425]        [<c0444bb9>] __lock_acquire+0xb02/0xd36
[ 2866.319164]        [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.324576]        [<c065b745>] __mutex_lock_slowpath+0x107/0x369
[ 2866.331008]        [<c065b9c3>] mutex_lock+0x1c/0x1f
[ 2866.336314]        [<c04c2609>] create_dir+0x1e/0x1c2
[ 2866.341682]        [<c04c280d>] sysfs_create_dir+0x60/0x7b
[ 2866.347396]        [<c050a335>] kobject_shadow_add+0xd7/0x189
[ 2866.353499]        [<c050a3f1>] kobject_add+0xa/0xc
[ 2866.358685]        [<c0480f00>] sysfs_slab_add+0x10c/0x152
[ 2866.364374]        [<c048111b>] kmem_cache_create+0x13a/0x1d4
[ 2866.370442]        [<c083415d>] fasync_init+0x2e/0x37
[ 2866.375818]        [<c0824542>] kernel_init+0x14e/0x2bf
[ 2866.381351]        [<c0404e7b>] kernel_thread_helper+0x7/0x10
[ 2866.387419]        [<ffffffff>] 0xffffffff
[ 2866.391843] 
[ 2866.391845] -> #0 (slub_lock){----}:
[ 2866.397022]        [<c0442b04>] print_circular_bug_tail+0x2e/0x68
[ 2866.403359]        [<c0444aa5>] __lock_acquire+0x9ee/0xd36
[ 2866.409080]        [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.414466]        [<c043bfff>] down_read+0x3d/0x74
[ 2866.419635]        [<c0482510>] kmem_cache_defrag+0x18/0xb3
[ 2866.425540]        [<c046c7ac>] shrink_slab+0x1ca/0x1d5
[ 2866.431002]        [<c046cc1d>] try_to_free_pages+0x178/0x224
[ 2866.437044]        [<c046824f>] __alloc_pages+0x1cd/0x324
[ 2866.442794]        [<c0465282>] find_or_create_page+0x5c/0xa6
[ 2866.448817]        [<c04c9379>] ext3_truncate+0xbb/0x83b
[ 2866.454411]        [<c0472470>] vmtruncate+0x11a/0x140
[ 2866.459762]        [<c049894d>] inode_setattr+0x5c/0x137
[ 2866.465286]        [<c04caafb>] ext3_setattr+0x19c/0x1f8
[ 2866.470835]        [<c0498b61>] notify_change+0x139/0x2ec
[ 2866.476514]        [<c0484161>] do_truncate+0x60/0x79
[ 2866.481822]        [<c04842af>] do_sys_ftruncate+0x135/0x150
[ 2866.487778]        [<c04842e5>] sys_ftruncate64+0x1b/0x1d
[ 2866.493405]        [<c040420c>] syscall_call+0x7/0xb
[ 2866.498599]        [<b7f10410>] 0xb7f10410
[ 2866.502913]        [<ffffffff>] 0xffffffff
[ 2866.507201] 
[ 2866.507203] other info that might help us debug this:
[ 2866.507204] 
[ 2866.515363] 2 locks held by bash-shared-map/3245:
[ 2866.520151]  #0:  (&inode->i_mutex){--..}, at: [<c065b9c3>] mutex_lock+0x1c/0x1f
[ 2866.527826]  #1:  (&inode->i_alloc_sem){--..}, at: [<c0498b07>] notify_change+0xdf/0x2ec
[ 2866.536158] 
[ 2866.536160] stack backtrace:
[ 2866.540597]  [<c04052ad>] dump_trace+0x63/0x1eb
[ 2866.545187]  [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2866.550426]  [<c040608d>] show_trace+0x12/0x14
[ 2866.555005]  [<c04060a5>] dump_stack+0x16/0x18
[ 2866.559552]  [<c0442b35>] print_circular_bug_tail+0x5f/0x68
[ 2866.565216]  [<c0444aa5>] __lock_acquire+0x9ee/0xd36
[ 2866.570264]  [<c0444e8b>] lock_acquire+0x9e/0xb8
[ 2866.574991]  [<c043bfff>] down_read+0x3d/0x74
[ 2866.579487]  [<c0482510>] kmem_cache_defrag+0x18/0xb3
[ 2866.584664]  [<c046c7ac>] shrink_slab+0x1ca/0x1d5
[ 2866.589462]  [<c046cc1d>] try_to_free_pages+0x178/0x224
[ 2866.594796]  [<c046824f>] __alloc_pages+0x1cd/0x324
[ 2866.599800]  [<c0465282>] find_or_create_page+0x5c/0xa6
[ 2866.605099]  [<c04c9379>] ext3_truncate+0xbb/0x83b
[ 2866.609974]  [<c0472470>] vmtruncate+0x11a/0x140
[ 2866.614695]  [<c049894d>] inode_setattr+0x5c/0x137
[ 2866.619578]  [<c04caafb>] ext3_setattr+0x19c/0x1f8
[ 2866.624470]  [<c0498b61>] notify_change+0x139/0x2ec
[ 2866.629441]  [<c0484161>] do_truncate+0x60/0x79
[ 2866.634075]  [<c04842af>] do_sys_ftruncate+0x135/0x150
[ 2866.639339]  [<c04842e5>] sys_ftruncate64+0x1b/0x1d
[ 2866.644310]  [<c040420c>] syscall_call+0x7/0xb
[ 2866.648823]  [<b7f10410>] 0xb7f10410
[ 2866.652482]  =======================

http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-dmesg
http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-config

Regards,
Michal

-- 
"What I missed most was your silence."
-- Andrzej Sapkowski "Coś więcej"

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 18:02 ` Michal Piotrowski
@ 2007-06-08 18:16   ` Christoph Lameter
       [not found]     ` <6bffcb0e0706081156u4ad0cc9dkf6d55ebcbd79def2@mail.gmail.com>
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 18:16 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> bash shared mapping + your script in a loop
> while true;  do sudo ./run.sh; done > res3.txt

Hmmmm... This seems to be triggered from the reclaim-path kmem_cache_defrag
rather than the manually triggered one from the script. Taking the slub_lock
on the reclaim path is an issue, it seems.

Maybe we need to do a trylock in kmem_cache_defrag to defuse the
situation? This is, after all, an optimization, so we can simply bail out.

Does this fix it?

---
 mm/slub.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-08 11:12:40.000000000 -0700
+++ slub/mm/slub.c	2007-06-08 11:14:34.000000000 -0700
@@ -2738,7 +2738,9 @@ int kmem_cache_defrag(int percent, int n
 	unsigned long pages = 0;
 	void *scratch;
 
-	down_read(&slub_lock);
+	if (!down_read_trylock(&slub_lock))
+		return 0;
+
 	list_for_each_entry(s, &slab_caches, list) {
 
 		/*



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
       [not found]     ` <6bffcb0e0706081156u4ad0cc9dkf6d55ebcbd79def2@mail.gmail.com>
@ 2007-06-08 19:08       ` Christoph Lameter
  2007-06-08 19:32         ` Michal Piotrowski
  2007-06-08 19:40         ` Christoph Lameter
  0 siblings, 2 replies; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 19:08 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> Yes, it does. Thanks!

Ahhh... That leads to the discovery of more sysfs problems. I need to make
sure not to be holding locks while calling into sysfs. More cleanup...


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 19:08       ` Christoph Lameter
@ 2007-06-08 19:32         ` Michal Piotrowski
  2007-06-08 19:38           ` Christoph Lameter
  2007-06-08 19:40         ` Christoph Lameter
  1 sibling, 1 reply; 23+ messages in thread
From: Michal Piotrowski @ 2007-06-08 19:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Michal Piotrowski, akpm, linux-kernel, linux-mm, dgc, Mel Gorman

Christoph Lameter writes:
> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
> 
>> Yes, it does. Thanks!
> 
> Ahhh... That leds to the discovery more sysfs problems. I need to make 
> sure not to be holding locks while calling into sysfs. More cleanup...
> 
> 

sysfs... I forgot about my sysfs test case

#! /bin/sh

for i in `find /sys/ -type f`
do
    echo "displaying $i"
    sudo cat $i > /dev/null
#    sleep 1s
done

[ 2816.175573] BUG: sleeping function called from invalid context at mm/page_alloc.c:1547
[ 2816.183578] in_atomic():1, irqs_disabled():1
[ 2816.187946] 1 lock held by cat/12586:
[ 2816.191705]  #0:  (&n->list_lock){++..}, at: [<c0481630>] list_locations+0x3d/0x26b

l *list_locations+0x3d
0xc1081630 is in list_locations (mm/slub.c:3388).
3383                    struct page *page;
3384
3385                    if (!atomic_read(&n->nr_slabs))
3386                            continue;
3387
3388                    spin_lock_irqsave(&n->list_lock, flags);
3389                    list_for_each_entry(page, &n->partial, lru)
3390                            process_slab(&t, s, page, alloc);
3391                    list_for_each_entry(page, &n->full, lru)
3392                            process_slab(&t, s, page, alloc);


[ 2816.199571] irq event stamp: 11526
[ 2816.203054] hardirqs last  enabled at (11525): [<c042adbd>] on_each_cpu+0x3b/0x71
[ 2816.210689] hardirqs last disabled at (11526): [<c065d241>] _spin_lock_irqsave+0x13/0x6e
[ 2816.218910] softirqs last  enabled at (11236): [<c042b5dd>] __do_softirq+0xdf/0xe5
[ 2816.226635] softirqs last disabled at (11229): [<c0406d65>] do_softirq+0x68/0x11f

l *on_each_cpu+0x3b
0xc102adbd is in on_each_cpu (include/asm/irqflags.h:36).
31              asm volatile("cli": : :"memory");
32      }
33
34      static inline void native_irq_enable(void)
35      {
36              asm volatile("sti": : :"memory");
37      }
38
39      static inline void native_safe_halt(void)
40      {

l *_spin_lock_irqsave+0x13
0xc125d241 is in _spin_lock_irqsave (kernel/spinlock.c:84).
79      unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
80      {
81              unsigned long flags;
82
83              local_irq_save(flags);
84              preempt_disable();
85              spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
86              /*
87               * On lockdep we dont want the hand-coded irq-enable of
88               * _raw_spin_lock_flags() code, because lockdep assumes

l *__do_softirq+0xdf
0xc102b5dd is in __do_softirq (kernel/softirq.c:252).
247
248             trace_softirq_exit();
249
250             account_system_vtime(current);
251             _local_bh_enable();
252     }
253
254     #ifndef __ARCH_HAS_DO_SOFTIRQ
255
256     asmlinkage void do_softirq(void)

l *do_softirq+0x68
0xc1006d65 is in do_softirq (arch/i386/kernel/irq.c:222).
217                     irqctx->tinfo.previous_esp = current_stack_pointer;
218
219                     /* build the stack frame on the softirq stack */
220                     isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
221
222                     asm volatile(
223                             "       xchgl   %%ebx,%%esp     \n"
224                             "       call    __do_softirq    \n"
225                             "       movl    %%ebx,%%esp     \n"
226                             : "=b"(isp)


[ 2816.234235]  [<c04052ad>] dump_trace+0x63/0x1eb
[ 2816.238888]  [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2816.244211]  [<c040608d>] show_trace+0x12/0x14
[ 2816.248757]  [<c04060a5>] dump_stack+0x16/0x18
[ 2816.253288]  [<c041eef1>] __might_sleep+0xce/0xd5
[ 2816.258046]  [<c04680b5>] __alloc_pages+0x33/0x324
[ 2816.262968]  [<c04683fb>] __get_free_pages+0x55/0x66
[ 2816.268060]  [<c0481517>] process_slab+0x1bd/0x299
[ 2816.272988]  [<c048164a>] list_locations+0x57/0x26b
[ 2816.277981]  [<c0481880>] free_calls_show+0x22/0x29
[ 2816.282965]  [<c047e702>] slab_attr_show+0x1c/0x20
[ 2816.287891]  [<c04c1bd9>] sysfs_read_file+0x94/0x105
[ 2816.293018]  [<c048580b>] vfs_read+0xcf/0x158
[ 2816.297539]  [<c0485c71>] sys_read+0x3d/0x72
[ 2816.301910]  [<c040420c>] syscall_call+0x7/0xb
[ 2816.306486]  [<b7f30410>] 0xb7f30410
[ 2816.310165]  =======================
[ 2818.826341] BUG: sleeping function called from invalid context at mm/page_alloc.c:1547
[ 2818.834388] in_atomic():1, irqs_disabled():1
[ 2818.838751] 1 lock held by cat/12635:
[ 2818.842506]  #0:  (&n->list_lock){++..}, at: [<c0481630>] list_locations+0x3d/0x26b
[ 2818.850460] irq event stamp: 11494
[ 2818.853908] hardirqs last  enabled at (11493): [<c042adbd>] on_each_cpu+0x3b/0x71
[ 2818.861505] hardirqs last disabled at (11494): [<c065d241>] _spin_lock_irqsave+0x13/0x6e
[ 2818.869831] softirqs last  enabled at (11258): [<c042b5dd>] __do_softirq+0xdf/0xe5
[ 2818.877576] softirqs last disabled at (11215): [<c0406d65>] do_softirq+0x68/0x11f
[ 2818.885217]  [<c04052ad>] dump_trace+0x63/0x1eb
[ 2818.889893]  [<c040544f>] show_trace_log_lvl+0x1a/0x2f
[ 2818.895112]  [<c040608d>] show_trace+0x12/0x14
[ 2818.899667]  [<c04060a5>] dump_stack+0x16/0x18
[ 2818.904232]  [<c041eef1>] __might_sleep+0xce/0xd5
[ 2818.909046]  [<c04680b5>] __alloc_pages+0x33/0x324
[ 2818.913956]  [<c04683fb>] __get_free_pages+0x55/0x66
[ 2818.919022]  [<c0481517>] process_slab+0x1bd/0x299
[ 2818.923923]  [<c048164a>] list_locations+0x57/0x26b
[ 2818.928961]  [<c0481880>] free_calls_show+0x22/0x29
[ 2818.933916]  [<c047e702>] slab_attr_show+0x1c/0x20
[ 2818.938825]  [<c04c1bd9>] sysfs_read_file+0x94/0x105
[ 2818.943900]  [<c048580b>] vfs_read+0xcf/0x158
[ 2818.948335]  [<c0485c71>] sys_read+0x3d/0x72
[ 2818.952683]  [<c040420c>] syscall_call+0x7/0xb
[ 2818.957213]  [<b7f82410>] 0xb7f82410
[ 2818.960896]  =======================

http://www.stardust.webpages.pl/files/tbf/bitis-gabonica/2.6.22-rc4-mm2-sd3/sd-dmesg2

Regards,
Michal

-- 
"What I missed most was your silence."
-- Andrzej Sapkowski "Coś więcej"

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 19:32         ` Michal Piotrowski
@ 2007-06-08 19:38           ` Christoph Lameter
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 19:38 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> 0xc1081630 is in list_locations (mm/slub.c:3388).
> 3383                    struct page *page;
> 3384
> 3385                    if (!atomic_read(&n->nr_slabs))
> 3386                            continue;
> 3387
> 3388                    spin_lock_irqsave(&n->list_lock, flags);
> 3389                    list_for_each_entry(page, &n->partial, lru)
> 3390                            process_slab(&t, s, page, alloc);
> 3391                    list_for_each_entry(page, &n->full, lru)
> 3392                            process_slab(&t, s, page, alloc);


Yes, process_slab() needs some temporary data to generate the lists of
calling functions etc., and that is a GFP_TEMPORARY allocation.

Does this fix it?

---
 mm/slub.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-08 12:35:56.000000000 -0700
+++ slub/mm/slub.c	2007-06-08 12:37:32.000000000 -0700
@@ -2930,7 +2930,7 @@ static int alloc_loc_track(struct loc_tr
 
 	order = get_order(sizeof(struct location) * max);
 
-	l = (void *)__get_free_pages(GFP_TEMPORARY, order);
+	l = (void *)__get_free_pages(GFP_ATOMIC, order);
 
 	if (!l)
 		return 0;

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 19:08       ` Christoph Lameter
  2007-06-08 19:32         ` Michal Piotrowski
@ 2007-06-08 19:40         ` Christoph Lameter
  2007-06-08 19:47           ` Michal Piotrowski
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 19:40 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Christoph Lameter wrote:

> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
> 
> > Yes, it does. Thanks!
> 
> Ahhh... That leads to the discovery of more sysfs problems. I need to make
> sure not to be holding locks while calling into sysfs. More cleanup...

Could you remove the trylock patch and see how this one fares? We may need 
both but this should avoid taking the slub_lock around any possible alloc 
of sysfs.


SLUB: Move sysfs operations outside of slub_lock

Sysfs can do a gazillion things when called. Make sure that we do
not call any sysfs functions while holding the slub_lock. Let sysfs
fend for itself, locking-wise.

Just protect the essentials: The modifications to the slab lists
and the ref counters of the slabs.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/slub.c |   34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-08 12:21:56.000000000 -0700
+++ slub/mm/slub.c	2007-06-08 12:30:23.000000000 -0700
@@ -2179,12 +2179,13 @@ void kmem_cache_destroy(struct kmem_cach
 	s->refcount--;
 	if (!s->refcount) {
 		list_del(&s->list);
+		up_write(&slub_lock);
 		if (kmem_cache_close(s))
 			WARN_ON(1);
 		sysfs_slab_remove(s);
 		kfree(s);
-	}
-	up_write(&slub_lock);
+	} else
+		up_write(&slub_lock);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
@@ -2637,26 +2638,33 @@ struct kmem_cache *kmem_cache_create(con
 		 */
 		s->objsize = max(s->objsize, (int)size);
 		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+		up_write(&slub_lock);
+
 		if (sysfs_slab_alias(s, name))
 			goto err;
-	} else {
-		s = kmalloc(kmem_size, GFP_KERNEL);
-		if (s && kmem_cache_open(s, GFP_KERNEL, name,
+
+		return s;
+	}
+
+	s = kmalloc(kmem_size, GFP_KERNEL);
+	if (s) {
+		if (kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor)) {
-			if (sysfs_slab_add(s)) {
-				kfree(s);
-				goto err;
-			}
 			list_add(&s->list, &slab_caches);
+			up_write(&slub_lock);
 			raise_kswapd_order(s->order);
-		} else
-			kfree(s);
+
+			if (sysfs_slab_add(s))
+				goto err;
+
+			return s;
+
+		}
+		kfree(s);
 	}
 	up_write(&slub_lock);
-	return s;
 
 err:
-	up_write(&slub_lock);
 	if (flags & SLAB_PANIC)
 		panic("Cannot create slabcache %s\n", name);
 	else

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 19:40         ` Christoph Lameter
@ 2007-06-08 19:47           ` Michal Piotrowski
  2007-06-08 20:48             ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Michal Piotrowski @ 2007-06-08 19:47 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Michal Piotrowski, akpm, linux-kernel, linux-mm, dgc, Mel Gorman

Christoph Lameter writes:
> On Fri, 8 Jun 2007, Christoph Lameter wrote:
> 
>> On Fri, 8 Jun 2007, Michal Piotrowski wrote:
>>
>>> Yes, it does. Thanks!
>> Ahhh... That leads to the discovery of more sysfs problems. I need to make
>> sure not to be holding locks while calling into sysfs. More cleanup...
> 
> Could you remove the trylock patch and see how this one fares? We may need 
> both but this should avoid taking the slub_lock around any possible alloc 
> of sysfs.
> 
> 

It's a bit tricky

cat ../sd2.patch | patch -p1
patching file mm/slub.c
Hunk #1 succeeded at 2194 (offset 15 lines).
Hunk #2 FAILED at 2653.
1 out of 2 hunks FAILED -- saving rejects to file mm/slub.c.rej
[michal@bitis-gabonica linux-work3]$ cat mm/slub.c.rej
***************
*** 2652,2677 ****
                 */
                s->objsize = max(s->objsize, (int)size);
                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
                if (sysfs_slab_alias(s, name))
                        goto err;
-       } else {
-               s = kmalloc(kmem_size, GFP_KERNEL);
-               if (s && kmem_cache_open(s, GFP_KERNEL, name,
                                size, align, flags, ctor)) {
-                       if (sysfs_slab_add(s)) {
-                               kfree(s);
-                               goto err;
-                       }
                        list_add(&s->list, &slab_caches);
                        raise_kswapd_order(s->order);
-               } else
-                       kfree(s);
        }
        up_write(&slub_lock);
-       return s;

  err:
-       up_write(&slub_lock);
        if (flags & SLAB_PANIC)
                panic("Cannot create slabcache %s\n", name);
        else
--- 2653,2685 ----
                 */
                s->objsize = max(s->objsize, (int)size);
                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+               up_write(&slub_lock);
+
                if (sysfs_slab_alias(s, name))
                        goto err;
+
+               return s;
+       }
+
+       s = kmalloc(kmem_size, GFP_KERNEL);
+       if (s) {
+               if (kmem_cache_open(s, GFP_KERNEL, name,
                                size, align, flags, ctor)) {
                        list_add(&s->list, &slab_caches);
+                       up_write(&slub_lock);
                        raise_kswapd_order(s->order);
+
+                       if (sysfs_slab_add(s))
+                               goto err;
+
+                       return s;
+
+               }
+               kfree(s);
        }
        up_write(&slub_lock);

  err:
        if (flags & SLAB_PANIC)
                panic("Cannot create slabcache %s\n", name);
        else

Regards,
Michal

-- 
"What I missed most was your silence."
-- Andrzej Sapkowski "Coś więcej"

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 00/12] Slab defragmentation V3
  2007-06-08 19:47           ` Michal Piotrowski
@ 2007-06-08 20:48             ` Christoph Lameter
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2007-06-08 20:48 UTC (permalink / raw)
  To: Michal Piotrowski; +Cc: akpm, linux-kernel, linux-mm, dgc, Mel Gorman

On Fri, 8 Jun 2007, Michal Piotrowski wrote:

> > Could you remove the trylock patch and see how this one fares? We may need
> > both but this should avoid taking the slub_lock around any possible alloc of
> > sysfs.
> It's a bit tricky

Hmmm... Yes, that version was against 4-mm1 instead of applying on top of
the defrag patchset. The difference is only the "ops" parameter...

Rediffed to apply after the defrag patchset.

SLUB: Move sysfs operations outside of slub_lock

Sysfs can do a gazillion things when called. Make sure that we do
not call any sysfs functions while holding the slub_lock. Let sysfs
fend for itself, locking-wise.

Just protect the essentials: The modifications to the slab lists
and the ref counters of the slabs.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 mm/slub.c |   34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-06-08 13:47:32.000000000 -0700
+++ slub/mm/slub.c	2007-06-08 13:48:07.000000000 -0700
@@ -2193,12 +2193,13 @@ void kmem_cache_destroy(struct kmem_cach
 	s->refcount--;
 	if (!s->refcount) {
 		list_del(&s->list);
+		up_write(&slub_lock);
 		if (kmem_cache_close(s))
 			WARN_ON(1);
 		sysfs_slab_remove(s);
 		kfree(s);
-	}
-	up_write(&slub_lock);
+	} else
+		up_write(&slub_lock);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
@@ -2956,26 +2957,33 @@ struct kmem_cache *kmem_cache_create(con
 		 */
 		s->objsize = max(s->objsize, (int)size);
 		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+		up_write(&slub_lock);
+
 		if (sysfs_slab_alias(s, name))
 			goto err;
-	} else {
-		s = kmalloc(kmem_size, GFP_KERNEL);
-		if (s && kmem_cache_open(s, GFP_KERNEL, name,
+
+		return s;
+	}
+
+	s = kmalloc(kmem_size, GFP_KERNEL);
+	if (s) {
+		if (kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor, ops)) {
-			if (sysfs_slab_add(s)) {
-				kfree(s);
-				goto err;
-			}
 			list_add(&s->list, &slab_caches);
+			up_write(&slub_lock);
 			raise_kswapd_order(s->order);
-		} else
-			kfree(s);
+
+			if (sysfs_slab_add(s))
+				goto err;
+
+			return s;
+
+		}
+		kfree(s);
 	}
 	up_write(&slub_lock);
-	return s;
 
 err:
-	up_write(&slub_lock);
 	if (flags & SLAB_PANIC)
 		panic("Cannot create slabcache %s\n", name);
 	else

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2007-06-08 20:48 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-06-07 21:55 [patch 00/12] Slab defragmentation V3 clameter
2007-06-07 21:55 ` [patch 01/12] SLUB: Add support for kmem_cache_ops clameter
2007-06-07 21:55 ` [patch 02/12] SLUB: Slab defragmentation core functionality clameter
2007-06-07 21:55 ` [patch 03/12] SLUB: Extend slabinfo to support -D and -C options clameter
2007-06-07 21:55 ` [patch 04/12] SLUB: Slab defragmentation trigger clameter
2007-06-07 21:55 ` [patch 05/12] Generic inode defragmentation clameter
2007-06-07 21:55 ` [patch 06/12] ext2 ext3 ext4: support inode slab defragmentation clameter
2007-06-07 21:55 ` [patch 07/12] xfs: inode defragmentation support clameter
2007-06-07 21:55 ` [patch 08/12] procfs: " clameter
2007-06-07 21:55 ` [patch 09/12] reiserfs: " clameter
2007-06-07 21:55 ` [patch 10/12] sockets: " clameter
2007-06-07 21:55 ` [patch 11/12] Dentry defragmentation clameter
2007-06-07 21:55 ` [patch 12/12] SLUB: Support memory defrag through kmem_cache_vacate() clameter
     [not found] ` <6bffcb0e0706080239w5cfe8594sbf5564dacd48936f@mail.gmail.com>
2007-06-08 15:16   ` [patch 00/12] Slab defragmentation V3 Christoph Lameter
2007-06-08 15:28   ` Christoph Lameter
2007-06-08 18:02 ` Michal Piotrowski
2007-06-08 18:16   ` Christoph Lameter
     [not found]     ` <6bffcb0e0706081156u4ad0cc9dkf6d55ebcbd79def2@mail.gmail.com>
2007-06-08 19:08       ` Christoph Lameter
2007-06-08 19:32         ` Michal Piotrowski
2007-06-08 19:38           ` Christoph Lameter
2007-06-08 19:40         ` Christoph Lameter
2007-06-08 19:47           ` Michal Piotrowski
2007-06-08 20:48             ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).