From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: andrea@suse.de, Sridhar Samudrala <sri@us.ibm.com>,
	pavel@suse.cz, Andrew Morton <akpm@osdl.org>,
	Linux Memory Management <linux-mm@kvack.org>
Subject: [RFC][PATCH 6/6] Critical Page Pool: Slab Support
Date: Wed, 14 Dec 2005 00:02:53 -0800	[thread overview]
Message-ID: <439FD1AD.8090405@us.ibm.com> (raw)
In-Reply-To: <439FCECA.3060909@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 577 bytes --]

Finally, add support for the Critical Page Pool to the Slab Allocator.  We
need the slab allocator to be at least marginally aware of the existence of
critical pages, or else we leave open the possibility of non-critical slab
allocations stealing objects from 'critical' slabs.  We add a separate,
node-unspecific list to kmem_cache_t called slabs_crit.  We keep all
partial and full critical slabs on this list.  We don't keep empty critical
slabs around, in the interest of giving this memory back to the VM ASAP in
what is typically a high memory pressure situation.
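
For illustration only (this snippet is not part of the patch): a caller that
must make progress during an emergency would tag its allocation with the
__GFP_CRITICAL flag introduced in patch 1/6.  'my_cache' here is just a
made-up example cache:

	/* sketch only: __GFP_CRITICAL comes from patch 1/6 of this series */
	void *obj = kmem_cache_alloc(my_cache, GFP_ATOMIC | __GFP_CRITICAL);
	if (!obj)
		return NULL;	/* even the critical pool can be exhausted */

During an emergency such an allocation can be satisfied from (or create) a
critical slab; non-critical allocations never take objects from that slab.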

-Matt

[-- Attachment #2: slab_support.patch --]
[-- Type: text/x-patch, Size: 7230 bytes --]

Modify the Slab Allocator to support the addition of a Critical Pool to the VM.
We want to ensure that if a cache is allocated a new slab page from the Critical
Pool during an emergency situation, only other __GFP_CRITICAL allocations are
satisfied from that slab.
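
In outline (an illustrative summary of the code below, not additional patch
content), the critical-slab paths added here look like:

	/* allocation, inside cache_grow(), when is_emergency_alloc(flags): */
	objp = get_critical_object(cachep, flags);	/* reuse a partial critical slab */
	if (!objp) {					/* otherwise grow a new slab ...     */
		slabp->critical = 1;			/* ... mark it critical ...          */
		list_add_tail(&slabp->list, &cachep->slabs_crit);
		objp = get_object(cachep, slabp, -1);	/* ... and hand back one object      */
	}

	/* free, inside __cache_free(): */
	if (is_critical_object(objp)) {
		free_critical_object(cachep, objp);	/* destroys the slab once it is empty */
		return;
	}

Because critical slabs live only on cachep->slabs_crit and never on the
per-node lists, the regular allocation paths cannot hand their objects out to
non-critical allocations.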

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>

Index: linux-2.6.15-rc5+critical_pool/mm/slab.c
===================================================================
--- linux-2.6.15-rc5+critical_pool.orig/mm/slab.c	2005-12-13 16:14:25.757617592 -0800
+++ linux-2.6.15-rc5+critical_pool/mm/slab.c	2005-12-13 16:32:08.300086584 -0800
@@ -221,8 +221,9 @@ struct slab {
 	unsigned long		colouroff;
 	void			*s_mem;		/* including colour offset */
 	unsigned int		inuse;		/* num of objs active in slab */
-	kmem_bufctl_t		free;
+	unsigned short		critical;	/* is this a critical slab? */
 	unsigned short          nodeid;
+	kmem_bufctl_t		free;
 };
 
 /*
@@ -395,6 +396,9 @@ struct kmem_cache {
 	unsigned int		slab_size;
 	unsigned int		dflags;		/* dynamic flags */
 
+	/* list of critical slabs for this cache */
+	struct list_head	slabs_crit;
+
 	/* constructor func */
 	void (*ctor)(void *, kmem_cache_t *, unsigned long);
 
@@ -1770,6 +1774,7 @@ next:
 		cachep->gfpflags |= GFP_DMA;
 	spin_lock_init(&cachep->spinlock);
 	cachep->objsize = size;
+	INIT_LIST_HEAD(&cachep->slabs_crit);
 
 	if (flags & CFLGS_OFF_SLAB)
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -2086,6 +2091,7 @@ static struct slab* alloc_slabmgmt(kmem_
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
 	slabp->s_mem = objp+colour_off;
+	slabp->critical = 0;
 
 	return slabp;
 }
@@ -2161,7 +2167,8 @@ static void *get_object(kmem_cache_t *ca
 	next = slab_bufctl(slabp)[slabp->free];
 #if DEBUG
 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-	WARN_ON(slabp->nodeid != nodeid);
+	if (nodeid >= 0)
+		WARN_ON(slabp->nodeid != nodeid);
 #endif
 	slabp->free = next;
 
@@ -2175,7 +2182,8 @@ static void return_object(kmem_cache_t *
 
 #if DEBUG
 	/* Verify that the slab belongs to the intended node */
-	WARN_ON(slabp->nodeid != nodeid);
+	if (nodeid >= 0)
+		WARN_ON(slabp->nodeid != nodeid);
 
 	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 		printk(KERN_ERR "slab: double free detected in cache "
@@ -2324,18 +2332,64 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
+static inline int is_critical_object(void *obj)
+{
+	struct slab *slabp;
+
+	if (!obj)
+		return 0;
+
+	slabp = page_get_slab(virt_to_page(obj));
+	return slabp->critical;
+}
+
+static inline void *get_critical_object(kmem_cache_t *cachep, gfp_t flags)
+{
+	struct slab *slabp;
+	void *objp = NULL;
+
+	spin_lock(&cachep->spinlock);
+	/* search for any partially free critical slabs */
+	if (!list_empty(&cachep->slabs_crit))
+		list_for_each_entry(slabp, &cachep->slabs_crit, list)
+			if (slabp->free != BUFCTL_END) {
+				objp = get_object(cachep, slabp, -1);
+				check_slabp(cachep, slabp);
+				break;
+			}
+	spin_unlock(&cachep->spinlock);
+
+	return objp;
+}
+
+static inline void free_critical_object(kmem_cache_t *cachep, void *objp)
+{
+	struct slab *slabp = page_get_slab(virt_to_page(objp));
+
+	check_slabp(cachep, slabp);
+	return_object(cachep, slabp, objp, -1);
+	check_slabp(cachep, slabp);
+
+	if (!slabp->inuse) {
+		BUG_ON(cachep->flags & SLAB_DESTROY_BY_RCU);
+
+		list_del(&slabp->list);
+		slab_destroy(cachep, slabp);
+	}
+}
+
 /**
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
+static void *cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct slab *slabp;
 	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
-	struct kmem_list3 *l3;
+	int critical = is_emergency_alloc(flags) && !cachep->gfporder;
 
 	/*
 	 * Be lazy and only check for valid flags here,
@@ -2344,7 +2398,14 @@ static int cache_grow(kmem_cache_t *cach
 	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
 		BUG();
 	if (flags & SLAB_NO_GROW)
-		return 0;
+		return NULL;
+
+	/*
+	 * If we are in an emergency situation and this is a 'critical' alloc,
+	 * see if we can get an object from an existing critical slab first.
+	 */
+	if (critical && (objp = get_critical_object(cachep, flags)))
+		return objp;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
 	local_flags = (flags & SLAB_LEVEL_MASK);
@@ -2391,21 +2452,30 @@ static int cache_grow(kmem_cache_t *cach
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
-	l3 = cachep->nodelists[nodeid];
-	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
-	list_add_tail(&slabp->list, &(l3->slabs_free));
 	STATS_INC_GROWN(cachep);
-	l3->free_objects += cachep->num;
-	spin_unlock(&l3->list_lock);
-	return 1;
+	if (!critical) {
+		struct kmem_list3 *l3 = cachep->nodelists[nodeid];
+		spin_lock(&l3->list_lock);
+		list_add_tail(&slabp->list, &l3->slabs_free);
+		l3->free_objects += cachep->num;
+		spin_unlock(&l3->list_lock);
+	} else {
+		slabp->critical = 1;
+		spin_lock(&cachep->spinlock);
+		list_add_tail(&slabp->list, &cachep->slabs_crit);
+		spin_unlock(&cachep->spinlock);
+		objp = get_object(cachep, slabp, -1);
+		check_slabp(cachep, slabp);
+	}
+	return objp;
 failed_freepages:
 	kmem_freepages(cachep, objp);
 failed:
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
-	return 0;
+	return NULL;
 }
 
 static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
@@ -2483,15 +2553,18 @@ alloc_done:
 	spin_unlock(&l3->list_lock);
 
 	if (unlikely(!ac->avail)) {
-		int x;
-		x = cache_grow(cachep, flags, numa_node_id());
+		void *obj = cache_grow(cachep, flags, numa_node_id());
+
+		/* critical objects don't "grow" the slab, just return 'obj' */
+		if (is_critical_object(obj))
+			return obj;
 
-		// cache_grow can reenable interrupts, then ac could change.
+		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = ac_data(cachep);
-		if (!x && ac->avail == 0)	// no objects in sight? abort
+		if (!obj && ac->avail == 0) /* No objects in sight?  Abort.  */
 			return NULL;
 
-		if (!ac->avail)		// objects refilled by interrupt?
+		if (!ac->avail)		/* objects refilled by interrupt?    */
 			goto retry;
 	}
 	ac->touched = 1;
@@ -2597,7 +2670,6 @@ static void *__cache_alloc_node(kmem_cac
  	struct slab *slabp;
  	struct kmem_list3 *l3;
  	void *obj;
- 	int x;
 
  	l3 = cachep->nodelists[nodeid];
  	BUG_ON(!l3);
@@ -2639,11 +2711,15 @@ retry:
 
 must_grow:
  	spin_unlock(&l3->list_lock);
- 	x = cache_grow(cachep, flags, nodeid);
+	obj = cache_grow(cachep, flags, nodeid);
 
- 	if (!x)
+	if (!obj)
  		return NULL;
 
+	/* critical objects don't "grow" the slab, just return 'obj' */
+	if (is_critical_object(obj))
+		goto done;
+
  	goto retry;
 done:
  	return obj;
@@ -2758,6 +2834,11 @@ static inline void __cache_free(kmem_cac
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
+	if (is_critical_object(objp)) {
+		free_critical_object(cachep, objp);
+		return;
+	}
+
 	/* Make sure we are not freeing a object from another
 	 * node to the array cache on this cpu.
 	 */


Thread overview: 26+ messages
2005-12-14  7:50 [RFC][PATCH 0/6] Critical Page Pool Matthew Dobson
2005-12-14  7:52 ` [RFC][PATCH 1/6] Create " Matthew Dobson
2005-12-14 10:48   ` Andrea Arcangeli
2005-12-14 13:30   ` Rik van Riel
2005-12-14 16:26     ` Matthew Dobson
2005-12-15  3:29       ` Matt Mackall
2005-12-14  7:54 ` [RFC][PATCH 2/6] in_emergency Trigger Matthew Dobson
2005-12-14  7:56 ` [RFC][PATCH 3/6] Slab Prep: get/return_object Matthew Dobson
2005-12-14  8:19   ` Pekka Enberg
2005-12-14 16:26     ` Matthew Dobson
2005-12-14  7:58 ` [RFC][PATCH 4/6] Slab Prep: slab_destruct() Matthew Dobson
2005-12-14  8:37   ` Pekka Enberg
2005-12-14 16:30     ` Matthew Dobson
2005-12-14  7:59 ` [RFC][PATCH 5/6] Slab Prep: Move cache_grow() Matthew Dobson
2005-12-14  8:02 ` Matthew Dobson [this message]
2005-12-14 10:08 ` [RFC][PATCH 0/6] Critical Page Pool Pavel Machek
2005-12-14 12:01   ` Andrea Arcangeli
2005-12-14 13:03     ` Alan Cox
2005-12-14 16:37       ` Matthew Dobson
2005-12-14 19:17         ` Alan Cox
2005-12-15 16:27         ` Pavel Machek
2005-12-14 16:03     ` Matthew Dobson
2005-12-14 15:55   ` Matthew Dobson
2005-12-15 16:26     ` Pavel Machek
2005-12-15 21:51       ` Matthew Dobson
2005-12-16  5:02         ` Sridhar Samudrala
