linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC 1/7] slub: Replace ctor field with ops field in /sys/slab/*
       [not found] <20181220192145.023162076@linux.com>
@ 2018-12-20 19:21 ` Christoph Lameter
  2018-12-20 19:21 ` [RFC 2/7] slub: Add defrag_ratio field and sysfs support Christoph Lameter
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: ctor_to_ops --]
[-- Type: text/plain, Size: 1286 bytes --]

Create an ops field in /sys/slab/*/ops to contain all the callback
operations defined for a slab cache. This will be used to display
the additional callbacks that will be defined soon to enable
defragmentation.

Display the existing ctor callback in the ops fields contents.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/slub.c |   16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -4994,13 +4994,18 @@ static ssize_t cpu_partial_store(struct
 }
 SLAB_ATTR(cpu_partial);
 
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
 {
+	int x = 0;
+
 	if (!s->ctor)
 		return 0;
-	return sprintf(buf, "%pS\n", s->ctor);
+
+	if (s->ctor)
+		x += sprintf(buf + x, "ctor : %pS\n", s->ctor);
+	return x;
 }
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
 
 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 {
@@ -5413,7 +5418,7 @@ static struct attribute *slab_attrs[] =
 	&objects_partial_attr.attr,
 	&partial_attr.attr,
 	&cpu_slabs_attr.attr,
-	&ctor_attr.attr,
+	&ops_attr.attr,
 	&aliases_attr.attr,
 	&align_attr.attr,
 	&hwcache_align_attr.attr,


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 2/7] slub: Add defrag_ratio field and sysfs support
       [not found] <20181220192145.023162076@linux.com>
  2018-12-20 19:21 ` [RFC 1/7] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
@ 2018-12-20 19:21 ` Christoph Lameter
  2018-12-20 19:21 ` [RFC 3/7] slub: Add isolate() and migrate() methods Christoph Lameter
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: defrag_ratio --]
[-- Type: text/plain, Size: 3653 bytes --]

"defrag_ratio" is used to set the threshold at which defragmentation
should be attempted on a slab page.

"defrag_ratio" is percentage in the range of 1 - 100. If more than
that percentage of slots in a slab page are unused the the slab page
will become subject to defragmentation.

Add a defrag ratio field and set it to 30% by default. A limit of 30% specifies
that less than 3 out of 10 available slots for objects need to be leftover
before slab defragmentation will be attempted on the remaining objects.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 Documentation/ABI/testing/sysfs-kernel-slab |   13 +++++++++++++
 include/linux/slub_def.h                    |    6 ++++++
 mm/slub.c                                   |   23 +++++++++++++++++++++++
 3 files changed, 42 insertions(+)

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -3628,6 +3628,7 @@ static int kmem_cache_open(struct kmem_c
 
 	set_cpu_partial(s);
 
+	s->defrag_ratio = 30;
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 1000;
 #endif
@@ -5113,6 +5114,27 @@ static ssize_t destroy_by_rcu_show(struc
 }
 SLAB_ATTR_RO(destroy_by_rcu);
 
+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+				const char *buf, size_t length)
+{
+	unsigned long ratio;
+	int err;
+
+	err = kstrtoul(buf, 10, &ratio);
+	if (err)
+		return err;
+
+	if (ratio < 100)
+		s->defrag_ratio = ratio;
+	return length;
+}
+SLAB_ATTR(defrag_ratio);
+
 #ifdef CONFIG_SLUB_DEBUG
 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 {
@@ -5437,6 +5459,7 @@ static struct attribute *slab_attrs[] =
 	&validate_attr.attr,
 	&alloc_calls_attr.attr,
 	&free_calls_attr.attr,
+	&defrag_ratio_attr.attr,
 #endif
 #ifdef CONFIG_ZONE_DMA
 	&cache_dma_attr.attr,
Index: linux/Documentation/ABI/testing/sysfs-kernel-slab
===================================================================
--- linux.orig/Documentation/ABI/testing/sysfs-kernel-slab
+++ linux/Documentation/ABI/testing/sysfs-kernel-slab
@@ -180,6 +180,19 @@ Description:
 		list.  It can be written to clear the current count.
 		Available when CONFIG_SLUB_STATS is enabled.
 
+What:		/sys/kernel/slab/cache/defrag_ratio
+Date:		December 2018
+KernelVersion:	4.18
+Contact:	Christoph Lameter <cl@linux-foundation.org>
+		Pekka Enberg <penberg@cs.helsinki.fi>,
+Description:
+		The defrag_ratio files allows the control of how agressive
+		slab fragmentation reduction works at reclaiming objects from
+		sparsely populated slabs. This is a percentage. If a slab
+		has more than this percentage of available object then reclaim
+		will attempt to reclaim objects so that the whole slab
+		page can be freed. The default is 30%.
+
 What:		/sys/kernel/slab/cache/deactivate_to_tail
 Date:		February 2008
 KernelVersion:	2.6.25
Index: linux/include/linux/slub_def.h
===================================================================
--- linux.orig/include/linux/slub_def.h
+++ linux/include/linux/slub_def.h
@@ -104,6 +104,12 @@ struct kmem_cache {
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;	/* Name (only for display!) */
 	struct list_head list;	/* List of slab caches */
+	int defrag_ratio;	/*
+				 * Ratio used to check the percentage of
+				 * objects allocate in a slab page.
+				 * If less than this ratio is allocated
+				 * then reclaim attempts are made.
+				 */
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;	/* For sysfs */
 	struct work_struct kobj_remove_work;


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 3/7] slub: Add isolate() and migrate() methods
       [not found] <20181220192145.023162076@linux.com>
  2018-12-20 19:21 ` [RFC 1/7] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
  2018-12-20 19:21 ` [RFC 2/7] slub: Add defrag_ratio field and sysfs support Christoph Lameter
@ 2018-12-20 19:21 ` Christoph Lameter
  2018-12-20 19:22 ` [RFC 4/7] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: isolate_and_migrate_methods --]
[-- Type: text/plain, Size: 6503 bytes --]

Add the two methods needed for moving objects and enable the
display of the callbacks via the /sys/kernel/slab interface.

Add documentation explaining the use of these methods and the prototypes
for slab.h. Add functions to setup the callbacks method for a slab cache.

Add empty functions for SLAB/SLOB. The API is generic so it
could be theoretically implemented for these allocators as well.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slab.h     |   50 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/slub_def.h |    3 ++
 mm/slub.c                |   29 ++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 1 deletion(-)

Index: linux/include/linux/slub_def.h
===================================================================
--- linux.orig/include/linux/slub_def.h
+++ linux/include/linux/slub_def.h
@@ -99,6 +99,9 @@ struct kmem_cache {
 	gfp_t allocflags;	/* gfp flags to use on each alloc */
 	int refcount;		/* Refcount for slab cache destroy */
 	void (*ctor)(void *);
+	kmem_isolate_func *isolate;
+	kmem_migrate_func *migrate;
+
 	unsigned int inuse;		/* Offset to metadata */
 	unsigned int align;		/* Alignment */
 	unsigned int red_left_pad;	/* Left redzone padding size */
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -3498,7 +3498,6 @@ static int calculate_sizes(struct kmem_c
 	else
 		s->flags &= ~__OBJECT_POISON;
 
-
 	/*
 	 * If we are Redzoning then check if there is some space between the
 	 * end of the object and the free pointer. If not then add an
@@ -4311,6 +4310,25 @@ int __kmem_cache_create(struct kmem_cach
 	return err;
 }
 
+void kmem_cache_setup_mobility(struct kmem_cache *s,
+	kmem_isolate_func isolate, kmem_migrate_func migrate)
+{
+	/*
+	 * Defragmentable slabs must have a ctor otherwise objects may be
+	 * in an undetermined state after they are allocated.
+	 */
+	BUG_ON(!s->ctor);
+	s->isolate = isolate;
+	s->migrate = migrate;
+	/*
+	 * Sadly serialization requirements currently mean that we have
+	 * to disable fast cmpxchg based processing.
+	 */
+	s->flags &= ~__CMPXCHG_DOUBLE;
+
+}
+EXPORT_SYMBOL(kmem_cache_setup_mobility);
+
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 {
 	struct kmem_cache *s;
@@ -5004,6 +5022,20 @@ static ssize_t ops_show(struct kmem_cach
 
 	if (s->ctor)
 		x += sprintf(buf + x, "ctor : %pS\n", s->ctor);
+
+	if (s->isolate) {
+		x += sprintf(buf + x, "isolate : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->isolate);
+		x += sprintf(buf + x, "\n");
+	}
+
+	if (s->migrate) {
+		x += sprintf(buf + x, "migrate : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->migrate);
+		x += sprintf(buf + x, "\n");
+	}
 	return x;
 }
 SLAB_ATTR_RO(ops);
Index: linux/include/linux/slab.h
===================================================================
--- linux.orig/include/linux/slab.h
+++ linux/include/linux/slab.h
@@ -153,6 +153,68 @@ void memcg_deactivate_kmem_caches(struct
 void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
+ * Function prototypes passed to kmem_cache_setup_mobility() to enable mobile
+ * objects and targeted reclaim in slab caches.
+ */
+
+/*
+ * kmem_cache_isolate_func() is called with locks held so that the slab
+ * objects cannot be freed. We are in an atomic context and no slab
+ * operations may be performed. The purpose of kmem_cache_isolate_func()
+ * is to pin the object so that it cannot be freed until
+ * kmem_cache_migrate_func() has processed them. This may be accomplished
+ * by increasing the refcount or setting a flag.
+ *
+ * Parameters passed are the number of objects to process and an array of
+ * pointers to objects which are intended to be moved.
+ *
+ * Returns a pointer that is passed to the migrate function. If any objects
+ * cannot be touched at this point then the pointer may indicate a
+ * failure and then the migration function can simply remove the references
+ * that were already obtained. The private data could be used to track
+ * the objects that were already pinned.
+ *
+ * The object pointer array passed is also passed to kmem_cache_migrate().
+ * The function may remove objects from the array by setting pointers to
+ * NULL. This is useful if we can determine that an object is being freed
+ * because kmem_cache_isolate_func() was called when the subsystem
+ * was calling kmem_cache_free().
+ * In that case it is not necessary to increase the refcount or
+ * specially mark the object because the release of the slab lock
+ * will lead to the immediate freeing of the object.
+ */
+typedef void *kmem_isolate_func(struct kmem_cache *, void **, int);
+
+/*
+ * kmem_cache_move_migrate_func is called with no locks held and interrupts
+ * enabled. Sleeping is possible. Any operation may be performed in
+ * migrate(). kmem_cache_migrate_func should allocate new objects and
+ * free all the objects.
+ **
+ * Parameters passed are the number of objects in the array, the array of
+ * pointers to the objects, the NUMA node where the object should be
+ * allocated and the pointer returned by kmem_cache_isolate_func().
+ *
+ * Success is checked by examining the number of remaining objects in
+ * the slab. If the number is zero then the objects will be freed.
+ */
+typedef void kmem_migrate_func(struct kmem_cache *, void **, int nr, int node, void *private);
+
+/*
+ * kmem_cache_setup_mobility() is used to setup callbacks for a slab cache.
+ */
+#ifdef CONFIG_SLUB
+void kmem_cache_setup_mobility(struct kmem_cache *, kmem_isolate_func,
+						kmem_migrate_func);
+#else
+static inline void kmem_cache_setup_mobility(struct kmem_cache *s,
+	kmem_isolate_func isolate, kmem_migrate_func migrate) {}
+#endif
+
+/*
+ * Allocator specific definitions. These are mainly used to establish optimized
+ * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
+ * selecting the appropriate general cache at compile time.
  * Please use this macro to create slab caches. Simply specify the
  * name of the structure and maybe some flags that are listed above.
  *
Index: linux/mm/slab_common.c
===================================================================
--- linux.orig/mm/slab_common.c
+++ linux/mm/slab_common.c
@@ -298,7 +298,7 @@ int slab_unmergeable(struct kmem_cache *
 	if (!is_root_cache(s))
 		return 1;
 
-	if (s->ctor)
+	if (s->ctor || s->isolate || s->migrate)
 		return 1;
 
 	if (s->usersize)


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 4/7] slub: Sort slab cache list and establish maximum objects for defrag slabs
       [not found] <20181220192145.023162076@linux.com>
                   ` (2 preceding siblings ...)
  2018-12-20 19:21 ` [RFC 3/7] slub: Add isolate() and migrate() methods Christoph Lameter
@ 2018-12-20 19:22 ` Christoph Lameter
  2018-12-20 19:22 ` [RFC 5/7] slub: Slab defrag core Christoph Lameter
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:22 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: sort_and_max --]
[-- Type: text/plain, Size: 2555 bytes --]

It is advantageous to have all defragmentable slabs together at the
beginning of the list of slabs so that there is no need to scan the
complete list. Put defragmentable caches first when adding a slab cache
and others last.

Determine the maximum number of objects in defragmentable slabs. This allows
the sizing of the array holding refs to objects in a slab later.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/slub.c |   26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -196,6 +196,9 @@ static inline bool kmem_cache_has_cpu_pa
 /* Use cmpxchg_double */
 #define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
 
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects;
+
 /*
  * Tracking user of a slab.
  */
@@ -4310,22 +4313,45 @@ int __kmem_cache_create(struct kmem_cach
 	return err;
 }
 
+/*
+ * Allocate a slab scratch space that is sufficient to keep at least
+ * max_defrag_slab_objects pointers to individual objects and also a bitmap
+ * for max_defrag_slab_objects.
+ */
+static inline void *alloc_scratch(void)
+{
+	return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+		BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+		GFP_KERNEL);
+}
+
 void kmem_cache_setup_mobility(struct kmem_cache *s,
 	kmem_isolate_func isolate, kmem_migrate_func migrate)
 {
+	int max_objects = oo_objects(s->max);
+
 	/*
 	 * Defragmentable slabs must have a ctor otherwise objects may be
 	 * in an undetermined state after they are allocated.
 	 */
 	BUG_ON(!s->ctor);
+
+	mutex_lock(&slab_mutex);
+
 	s->isolate = isolate;
 	s->migrate = migrate;
+
 	/*
 	 * Sadly serialization requirements currently mean that we have
 	 * to disable fast cmpxchg based processing.
 	 */
 	s->flags &= ~__CMPXCHG_DOUBLE;
 
+	list_move(&s->list, &slab_caches);	/* Move to top */
+	if (max_objects > max_defrag_slab_objects)
+		max_defrag_slab_objects = max_objects;
+
+	mutex_unlock(&slab_mutex);
 }
 EXPORT_SYMBOL(kmem_cache_setup_mobility);
 
Index: linux/mm/slab_common.c
===================================================================
--- linux.orig/mm/slab_common.c
+++ linux/mm/slab_common.c
@@ -393,7 +393,7 @@ static struct kmem_cache *create_cache(c
 		goto out_free_cache;
 
 	s->refcount = 1;
-	list_add(&s->list, &slab_caches);
+	list_add_tail(&s->list, &slab_caches);
 	memcg_link_cache(s);
 out:
 	if (err)


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 5/7] slub: Slab defrag core
       [not found] <20181220192145.023162076@linux.com>
                   ` (3 preceding siblings ...)
  2018-12-20 19:22 ` [RFC 4/7] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
@ 2018-12-20 19:22 ` Christoph Lameter
  2018-12-20 19:22 ` [RFC 6/7] slub: Extend slabinfo to support -D and -F options Christoph Lameter
  2018-12-20 19:22 ` [RFC 7/7] xarray: Implement migration function for objects Christoph Lameter
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:22 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: mobility_core --]
[-- Type: text/plain, Size: 13969 bytes --]

Slab defragmentation may occur:

1. Unconditionally when kmem_cache_shrink is called on a slab cache by the
   kernel calling kmem_cache_shrink.

2. Through the use of the slabinfo command.

3. Per node defrag conditionally when kmem_cache_defrag(<node>) is called
   (can be called from reclaim code with a later patch).

   Defragmentation is only performed if the fragmentation of the slab
   is lower than the specified percentage. Fragmentation ratios are measured
   by calculating the percentage of objects in use compared to the total
   number of objects that the slab page can accomodate.

   The scanning of slab caches is optimized because the
   defragmentable slabs come first on the list. Thus we can terminate scans
   on the first slab encountered that does not support defragmentation.

   kmem_cache_defrag() takes a node parameter. This can either be -1 if
   defragmentation should be performed on all nodes, or a node number.

A couple of functions must be setup via a call to kmem_cache_setup_defrag()
in order for a slabcache to support defragmentation. These are

kmem_defrag_isolate_func (void *isolate(struct kmem_cache *s, void **objects, int nr))

	Must stabilize that the objects and ensure that they will not be freed until
	the migration function is complete. SLUB guarantees that
	the objects are still allocated. However, other threads may be blocked
	in slab_free() attempting to free objects in the slab. These may succeed
	as soon as isolate() returns to the slab allocator. The function must
	be able to detect such situations and void the attempts to free such
	objects (by for example voiding the corresponding entry in the objects
	array).

	No slab operations may be performed in isolate(). Interrupts
	are disabled. What can be done is very limited. The slab lock
	for the page that contains the object is taken. Any attempt to perform
	a slab operation may lead to a deadlock.

	kmem_defrag_isolate_func returns a private pointer that is passed to
	kmem_defrag_kick_func(). Should we be unable to obtain all references
	then that pointer may indicate to the kick() function that it should
	not attempt any object removal or move but simply undo the measure
	that were used to stabilize the object.

kmem_defrag_migrate_func (void migrate(struct kmem_cache *, void **objects, int nr,
			int node, void *get_result))

	After SLUB has stabilzed the objects in a
	slab it will then drop all locks and use migrate() to move objects out
	of the slab. The existence of the object is guaranteed by virtue of
	the earlier obtained references via kmem_defrag_get_func(). The
	callback may perform any slab operation since no locks are held at
	the time of call.

	The callback should remove the object from the slab in some way. This
	may be accomplished by reclaiming the object and then running
	kmem_cache_free() or reallocating it and then running
	kmem_cache_free(). Reallocation is advantageous because the partial
	slabs were just sorted to have the partial slabs with the most objects
	first. Reallocation is likely to result in filling up a slab in
	addition to freeing up one slab. A filled up slab can also be removed
	from the partial list. So there could be a double effect.

	kmem_defrag_migrate_func() does not return a result. SLUB will check
	the number of remaining objects in the slab. If all objects were
	removed then the slab is freed and we have reduced the overall
	fragmentation of the slab cache.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slab.h |    3 
 mm/slub.c            |  265 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 215 insertions(+), 53 deletions(-)

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -351,6 +351,12 @@ static __always_inline void slab_lock(st
 	bit_spin_lock(PG_locked, &page->flags);
 }
 
+static __always_inline int slab_trylock(struct page *page)
+{
+	VM_BUG_ON_PAGE(PageTail(page), page);
+	return bit_spin_trylock(PG_locked, &page->flags);
+}
+
 static __always_inline void slab_unlock(struct page *page)
 {
 	VM_BUG_ON_PAGE(PageTail(page), page);
@@ -3946,79 +3952,6 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
-#define SHRINK_PROMOTE_MAX 32
-
-/*
- * kmem_cache_shrink discards empty slabs and promotes the slabs filled
- * up most to the head of the partial lists. New allocations will then
- * fill those up and thus they can be removed from the partial lists.
- *
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
- */
-int __kmem_cache_shrink(struct kmem_cache *s)
-{
-	int node;
-	int i;
-	struct kmem_cache_node *n;
-	struct page *page;
-	struct page *t;
-	struct list_head discard;
-	struct list_head promote[SHRINK_PROMOTE_MAX];
-	unsigned long flags;
-	int ret = 0;
-
-	flush_all(s);
-	for_each_kmem_cache_node(s, node, n) {
-		INIT_LIST_HEAD(&discard);
-		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
-			INIT_LIST_HEAD(promote + i);
-
-		spin_lock_irqsave(&n->list_lock, flags);
-
-		/*
-		 * Build lists of slabs to discard or promote.
-		 *
-		 * Note that concurrent frees may occur while we hold the
-		 * list_lock. page->inuse here is the upper limit.
-		 */
-		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			int free = page->objects - page->inuse;
-
-			/* Do not reread page->inuse */
-			barrier();
-
-			/* We do not keep full slabs on the list */
-			BUG_ON(free <= 0);
-
-			if (free == page->objects) {
-				list_move(&page->lru, &discard);
-				n->nr_partial--;
-			} else if (free <= SHRINK_PROMOTE_MAX)
-				list_move(&page->lru, promote + free - 1);
-		}
-
-		/*
-		 * Promote the slabs filled up most to the head of the
-		 * partial list.
-		 */
-		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
-			list_splice(promote + i, &n->partial);
-
-		spin_unlock_irqrestore(&n->list_lock, flags);
-
-		/* Release empty slabs */
-		list_for_each_entry_safe(page, t, &discard, lru)
-			discard_slab(s, page);
-
-		if (slabs_node(s, node))
-			ret = 1;
-	}
-
-	return ret;
-}
-
 #ifdef CONFIG_MEMCG
 static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
 {
@@ -4325,14 +4258,271 @@ static inline void *alloc_scratch(void)
 		GFP_KERNEL);
 }
 
+/*
+ * Move all objects in the given slab.
+ *
+ * If the target node is the current node then the object is moved else
+ * where on the same node. Which is an effective way of defragmentation
+ * since the current slab page with its object is exempt from allocation.
+ *
+ * The scratch area passed to list function is sufficient to hold
+ * struct listhead times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
+ */
+static void kmem_cache_move(struct page *page, void *scratch, int node)
+{
+	void **vector = scratch;
+	void *p;
+	void *addr = page_address(page);
+	struct kmem_cache *s;
+	unsigned long *map;
+	int count;
+	void *private;
+	unsigned long flags;
+	unsigned long objects;
+
+	local_irq_save(flags);
+	slab_lock(page);
+
+	BUG_ON(!PageSlab(page));	/* Must be s slab page */
+	BUG_ON(!page->frozen);	/* Slab must have been frozen earlier */
+
+	s = page->slab_cache;
+	objects = page->objects;
+	map = scratch + objects * sizeof(void **);
+
+	/* Determine used objects */
+	bitmap_fill(map, objects);
+	for (p = page->freelist; p; p = get_freepointer(s, p))
+		__clear_bit(slab_index(p, s, addr), map);
+
+	/* Build vector of pointers to objects */
+	count = 0;
+	memset(vector, 0, objects * sizeof(void **));
+	for_each_object(p, s, addr, objects)
+		if (test_bit(slab_index(p, s, addr), map))
+			vector[count++] = p;
+
+	if (s->isolate)
+		private = s->isolate(s, vector, count);
+	else
+		/*
+		 * Objects do not need to be isolated.
+		 */
+		private = NULL;
+
+	/*
+	 * Pinned the objects. Now we can drop the slab lock. The slab
+	 * is frozen so it cannot vanish from under us nor will
+	 * allocations be performed on the slab. However, unlocking the
+	 * slab will allow concurrent slab_frees to proceed. So
+	 * the subsystem must have a way to tell from the content
+	 * of the object that it was freed.
+	 *
+	 * If neither RCU nor ctor is being used then the object
+	 * may be modified by the allocator after being freed
+	 * which may disrupt the ability of the migrate function
+	 * to tell if the object is free or not.
+	 */
+	slab_unlock(page);
+	local_irq_restore(flags);
+
+	/*
+	 * Perform callbacks to move the objects.
+	 */
+	s->migrate(s, vector, count, node, private);
+}
+
+/*
+ * Move slab objects on a particular node of the cache.
+ * Release slabs with zero objects and tryg to call the move function for
+ * slabs with less than the configured percentage of objects allocated.
+ *
+ * Returns the number of slabs left on the node after the operation.
+ */
+static unsigned long __move(struct kmem_cache *s, int node,
+		int target_node, int ratio)
+{
+	unsigned long flags;
+	struct page *page, *page2;
+	LIST_HEAD(move_list);
+	struct kmem_cache_node *n = get_node(s, node);
+
+	if (node == target_node && n->nr_partial <= 1)
+		/*
+		 * Trying to reduce fragmentataion on a node but there is
+		 * only a single or no partial slab page. This is already
+		 * the optimal object density that we can reach
+		 */
+		goto out;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry_safe(page, page2, &n->partial, lru) {
+		if (!slab_trylock(page))
+			/* Busy slab. Get out of the way */
+			continue;
+
+		if (page->inuse) {
+			if (page->inuse > ratio * page->objects / 100) {
+				slab_unlock(page);
+				/*
+				 * Skip slab because the object density
+				 * in the slab page is high enough
+				*/
+				continue;
+			}
+
+			list_move(&page->lru, &move_list);
+			if (s->migrate) {
+				/* Remove page from being considered for allocations */
+				n->nr_partial--;
+				page->frozen = 1;
+			}
+			slab_unlock(page);
+		} else {
+			/* Empty slab page */
+			list_del(&page->lru);
+			n->nr_partial--;
+			slab_unlock(page);
+			discard_slab(s, page);
+		}
+	}
+
+	if (!s->migrate)
+		/*
+		 * No defrag method. By simply putting the zaplist at the
+		 * end of the partial list we can let them simmer longer
+		 * and thus increase the chance of all objects being
+		 * reclaimed.
+		 *
+		 * We have effectively sorted the partial list and put
+		 * the slabs with more objects first. As soon as they
+		 * are allocated they are going to be removed from the
+		 * partial list.
+		 */
+		list_splice(&move_list, n->partial.prev);
+
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (s->migrate && !list_empty(&move_list)) {
+		void **scratch = alloc_scratch();
+		struct page *page;
+		struct page *page2;
+
+		if (scratch) {
+			/* Try to remove / move the objects left */
+			list_for_each_entry(page, &move_list, lru) {
+				if (page->inuse)
+					kmem_cache_move(page, scratch, target_node);
+			}
+			kfree(scratch);
+		}
+
+		/* Inspect results and dispose of pages */
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each_entry_safe(page, page2, &move_list, lru) {
+			list_del(&page->lru);
+			slab_lock(page);
+			page->frozen = 0;
+
+			if (page->inuse) {
+				/*
+				 * Objects left in slab page move it to
+				 * the tail of the partial list to
+				 * increase the change that the freeing
+				 * of the remaining objects will
+				 * free the slab page
+				 */
+				n->nr_partial++;
+				list_add_tail(&page->lru, &n->partial);
+				slab_unlock(page);
+
+			} else {
+				slab_unlock(page);
+				discard_slab(s, page);
+			}
+		}
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+out:
+	return atomic_long_read(&n->nr_slabs);
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation in a page.
+ */
+int kmem_cache_defrag(int node)
+{
+	struct kmem_cache *s;
+	unsigned long left = 0;
+
+	/*
+	 * kmem_cache_defrag may be called from the reclaim path which may be
+	 * called for any page allocator alloc. So there is the danger that we
+	 * get called in a situation where slub already acquired the slub_lock
+	 * for other purposes.
+	 */
+	if (!mutex_trylock(&slab_mutex))
+		return 0;
+
+	list_for_each_entry(s, &slab_caches, list) {
+		/*
+		 * Defragmentable caches come first. If the slab cache is not
+		 * defragmentable then we can stop traversing the list.
+		 */
+		if (!s->migrate)
+			break;
+
+		if (node == -1) {
+			int nid;
+
+			for_each_node_state(nid, N_NORMAL_MEMORY)
+				if (s->node[nid]->nr_partial > MAX_PARTIAL)
+					left += __move(s, nid, nid, s->defrag_ratio);
+		} else
+			left  += __move(s, node, node, 100);
+
+	}
+	mutex_unlock(&slab_mutex);
+	return left;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
+
+/*
+ * kmem_cache_shrink reduces the memory footprint of a slab cache
+ * by as much as possible. This works by removing empty slabs from
+ * the partial list, migrating slab objects to denser slab pages
+ * (if the slab cache supports that) or reorganizing the partial
+ * list so that denser slab pages come first and less dense
+ * allocated slab pages are at the end.
+ */
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+	int node;
+	int left = 0;
+
+	flush_all(s);
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		left += __move(s, node, node, 100);
+
+	return 0;
+}
+EXPORT_SYMBOL(__kmem_cache_shrink);
+
 void kmem_cache_setup_mobility(struct kmem_cache *s,
 	kmem_isolate_func isolate, kmem_migrate_func migrate)
 {
 	int max_objects = oo_objects(s->max);
 
 	/*
-	 * Defragmentable slabs must have a ctor otherwise objects may be
-	 * in an undetermined state after they are allocated.
+	 * Mobile objects must have a ctor otherwise the
+	 * object may be in an undefined state on allocation.
+	 *
+	 * Since the object may need to be inspected by the
+	 * migration function at any time after allocation we
+	 * must ensure that the object always has a defined
+	 * state.
 	 */
 	BUG_ON(!s->ctor);
 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 6/7] slub: Extend slabinfo to support -D and -F options
       [not found] <20181220192145.023162076@linux.com>
                   ` (4 preceding siblings ...)
  2018-12-20 19:22 ` [RFC 5/7] slub: Slab defrag core Christoph Lameter
@ 2018-12-20 19:22 ` Christoph Lameter
  2018-12-20 19:22 ` [RFC 7/7] xarray: Implement migration function for objects Christoph Lameter
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:22 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: extend_slabinfo --]
[-- Type: text/plain, Size: 5574 bytes --]

-F lists caches that support moving and defragmentation

-C lists caches that use a ctor.

Change field names for defrag_ratio and remote_node_defrag_ratio.

Add determination of the allocation ratio for a slab. The allocation ratio
is the percentage of available slots for objects in use.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 Documentation/vm/slabinfo.c |   48 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

Index: linux/tools/vm/slabinfo.c
===================================================================
--- linux.orig/tools/vm/slabinfo.c
+++ linux/tools/vm/slabinfo.c
@@ -33,6 +33,8 @@ struct slabinfo {
 	unsigned int hwcache_align, object_size, objs_per_slab;
 	unsigned int sanity_checks, slab_size, store_user, trace;
 	int order, poison, reclaim_account, red_zone;
+	int movable, ctor;
+	int defrag_ratio, remote_node_defrag_ratio;
 	unsigned long partial, objects, slabs, objects_partial, objects_total;
 	unsigned long alloc_fastpath, alloc_slowpath;
 	unsigned long free_fastpath, free_slowpath;
@@ -67,6 +69,8 @@ int show_report;
 int show_alias;
 int show_slab;
 int skip_zero = 1;
+int show_movable;
+int show_ctor;
 int show_numa;
 int show_track;
 int show_first_alias;
@@ -109,14 +113,16 @@ static void fatal(const char *x, ...)
 
 static void usage(void)
 {
-	printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n"
-		"slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+	printf("slabinfo 4/15/2017. (c) 2007 sgi/(c) 2011 Linux Foundation/(c) 2017 Jump Trading LLC.\n\n"
+		"slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n"
 		"-a|--aliases           Show aliases\n"
 		"-A|--activity          Most active slabs first\n"
 		"-d<options>|--debug=<options> Set/Clear Debug options\n"
+		"-C|--ctor              Show slabs with ctors\n"
 		"-D|--display-active    Switch line format to activity\n"
 		"-e|--empty             Show empty slabs\n"
 		"-f|--first-alias       Show first alias\n"
+		"-F|--movable           Show caches that support movable objects\n"
 		"-h|--help              Show usage information\n"
 		"-i|--inverted          Inverted list\n"
 		"-l|--slabs             Show slabs\n"
@@ -369,7 +375,7 @@ static void slab_numa(struct slabinfo *s
 		return;
 
 	if (!line) {
-		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+		printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab");
 		for(node = 0; node <= highest_node; node++)
 			printf(" %4d", node);
 		printf("\n----------------------");
@@ -378,6 +384,7 @@ static void slab_numa(struct slabinfo *s
 		printf("\n");
 	}
 	printf("%-21s ", mode ? "All slabs" : s->name);
+	printf("%3d ", s->remote_node_defrag_ratio);
 	for(node = 0; node <= highest_node; node++) {
 		char b[20];
 
@@ -535,6 +542,8 @@ static void report(struct slabinfo *s)
 		printf("** Slabs are destroyed via RCU\n");
 	if (s->reclaim_account)
 		printf("** Reclaim accounting active\n");
+	if (s->movable)
+		printf("** Defragmentation at %d%%\n", s->defrag_ratio);
 
 	printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
 	printf("------------------------------------------------------------------------\n");
@@ -585,6 +594,12 @@ static void slabcache(struct slabinfo *s
 	if (show_empty && s->slabs)
 		return;
 
+	if (show_movable && !s->movable)
+		return;
+
+	if (show_ctor && !s->ctor)
+		return;
+
 	if (sort_loss == 0)
 		store_size(size_str, slab_size(s));
 	else
@@ -599,6 +614,10 @@ static void slabcache(struct slabinfo *s
 		*p++ = '*';
 	if (s->cache_dma)
 		*p++ = 'd';
+	if (s->movable)
+		*p++ = 'F';
+	if (s->ctor)
+		*p++ = 'C';
 	if (s->hwcache_align)
 		*p++ = 'A';
 	if (s->poison)
@@ -633,7 +652,8 @@ static void slabcache(struct slabinfo *s
 		printf("%-21s %8ld %7d %15s %14s %4d %1d %3ld %3ld %s\n",
 			s->name, s->objects, s->object_size, size_str, dist_str,
 			s->objs_per_slab, s->order,
-			s->slabs ? (s->partial * 100) / s->slabs : 100,
+			s->slabs ? (s->partial * 100) /
+					(s->slabs * s->objs_per_slab) : 100,
 			s->slabs ? (s->objects * s->object_size * 100) /
 				(s->slabs * (page_size << s->order)) : 100,
 			flags);
@@ -1252,7 +1272,17 @@ static void read_slab_dir(void)
 			slab->cpu_partial_free = get_obj("cpu_partial_free");
 			slab->alloc_node_mismatch = get_obj("alloc_node_mismatch");
 			slab->deactivate_bypass = get_obj("deactivate_bypass");
+			slab->defrag_ratio = get_obj("defrag_ratio");
+			slab->remote_node_defrag_ratio =
+					get_obj("remote_node_defrag_ratio");
 			chdir("..");
+			if (read_slab_obj(slab, "ops")) {
+				if (strstr(buffer, "ctor :"))
+					slab->ctor = 1;
+				if (strstr(buffer, "migrate :"))
+					slab->movable = 1;
+			}
+
 			if (slab->name[0] == ':')
 				alias_targets++;
 			slab++;
@@ -1329,6 +1359,8 @@ static void xtotals(void)
 }
 
 struct option opts[] = {
+	{ "ctor", no_argument, NULL, 'C' },
+	{ "movable", no_argument, NULL, 'F' },
 	{ "aliases", no_argument, NULL, 'a' },
 	{ "activity", no_argument, NULL, 'A' },
 	{ "debug", optional_argument, NULL, 'd' },
@@ -1364,7 +1396,7 @@ int main(int argc, char *argv[])
 
 	page_size = getpagesize();
 
-	while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXBU",
+	while ((c = getopt_long(argc, argv, "aACd::DefFhil1noprstvzTSN:LXBU",
 						opts, NULL)) != -1)
 		switch (c) {
 		case '1':
@@ -1420,6 +1452,12 @@ int main(int argc, char *argv[])
 		case 'z':
 			skip_zero = 0;
 			break;
+		case 'C':
+			show_ctor = 1;
+			break;
+		case 'F':
+			show_movable = 1;
+			break;
 		case 'T':
 			show_totals = 1;
 			break;


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [RFC 7/7] xarray: Implement migration function for objects
       [not found] <20181220192145.023162076@linux.com>
                   ` (5 preceding siblings ...)
  2018-12-20 19:22 ` [RFC 6/7] slub: Extend slabinfo to support -D and -F options Christoph Lameter
@ 2018-12-20 19:22 ` Christoph Lameter
  6 siblings, 0 replies; 7+ messages in thread
From: Christoph Lameter @ 2018-12-20 19:22 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: linux-mm, linux-kernel, Pekka Enberg, akpm, Mel Gorman, andi,
	Rik van Riel, Dave Chinner, Christoph Hellwig, Michal Hocko,
	Mike Kravetz

[-- Attachment #1: xarray --]
[-- Type: text/plain, Size: 2370 bytes --]

Implement functions to migrate objects. This is based on
initial code by Matthew Wilcox and was modified to work with
slab object migration.

Signed-off-by: Christoph Lameter <cl@linux.com>

Index: linux/lib/radix-tree.c
===================================================================
--- linux.orig/lib/radix-tree.c
+++ linux/lib/radix-tree.c
@@ -1613,6 +1613,18 @@ static int radix_tree_cpu_dead(unsigned
 	return 0;
 }
 
+
+extern void xa_object_migrate(void *tree_node, int numa_node);
+
+static void radix_tree_migrate(struct kmem_cache *s, void **objects, int nr,
+		int node, void *private)
+{
+	int i;
+
+	for (i=0; i<nr; i++)
+		xa_object_migrate(objects[i], node);
+}
+
 void __init radix_tree_init(void)
 {
 	int ret;
@@ -1627,4 +1639,7 @@ void __init radix_tree_init(void)
 	ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
 					NULL, radix_tree_cpu_dead);
 	WARN_ON(ret < 0);
+	kmem_cache_setup_mobility(radix_tree_node_cachep,
+					NULL,
+					radix_tree_migrate);
 }
Index: linux/lib/xarray.c
===================================================================
--- linux.orig/lib/xarray.c
+++ linux/lib/xarray.c
@@ -1934,6 +1934,51 @@ void xa_destroy(struct xarray *xa)
 }
 EXPORT_SYMBOL(xa_destroy);
 
+void xa_object_migrate(struct xa_node *node, int numa_node)
+{
+	struct xarray *xa = READ_ONCE(node->array);
+	void __rcu **slot;
+	struct xa_node *new_node;
+	int i;
+
+	/* Freed or not yet in tree then skip */
+	if (!xa || xa == XA_FREE_MARK)
+		return;
+
+	new_node = kmem_cache_alloc_node(radix_tree_node_cachep, GFP_KERNEL, numa_node);
+
+	xa_lock_irq(xa);
+
+	/* Check again..... */
+	if (xa != node->array || !list_empty(&node->private_list)) {
+		node = new_node;
+		goto unlock;
+	}
+
+	memcpy(new_node, node, sizeof(struct xa_node));
+
+	/* Move pointers to new node */
+	INIT_LIST_HEAD(&new_node->private_list);
+	for (i = 0; i < XA_CHUNK_SIZE; i++) {
+		void *x = xa_entry_locked(xa, new_node, i);
+
+		if (xa_is_node(x))
+			rcu_assign_pointer(xa_to_node(x)->parent, new_node);
+	}
+	if (!new_node->parent)
+		slot = &xa->xa_head;
+	else
+		slot = &xa_parent_locked(xa, new_node)->slots[new_node->offset];
+	rcu_assign_pointer(*slot, xa_mk_node(new_node));
+
+unlock:
+	xa_unlock_irq(xa);
+	xa_node_free(node);
+	rcu_barrier();
+	return;
+
+}
+
 #ifdef XA_DEBUG
 void xa_dump_node(const struct xa_node *node)
 {


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-12-20 19:22 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20181220192145.023162076@linux.com>
2018-12-20 19:21 ` [RFC 1/7] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
2018-12-20 19:21 ` [RFC 2/7] slub: Add defrag_ratio field and sysfs support Christoph Lameter
2018-12-20 19:21 ` [RFC 3/7] slub: Add isolate() and migrate() methods Christoph Lameter
2018-12-20 19:22 ` [RFC 4/7] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
2018-12-20 19:22 ` [RFC 5/7] slub: Slab defrag core Christoph Lameter
2018-12-20 19:22 ` [RFC 6/7] slub: Extend slabinfo to support -D and -F options Christoph Lameter
2018-12-20 19:22 ` [RFC 7/7] xarray: Implement migration function for objects Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).