[RFC v2 PATCH] mm, sl[au]b: Introduce lockless cache

* [RFC v2 PATCH] mm, sl[au]b: Introduce lockless cache
@ 2021-09-20 15:48 Hyeonggon Yoo
  2021-09-20 22:01 ` Matthew Wilcox
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Hyeonggon Yoo @ 2021-09-20 15:48 UTC (permalink / raw)
  To: linux-mm
  Cc: Hyeonggon Yoo, Christoph Lameter, Pekka Enberg, David Rientjes,
	Joonsoo Kim, Andrew Morton, Vlastimil Babka, linux-kernel,
	Matthew Wilcox, Jens Axboe, John Garry, linux-block, netdev

This is RFC v2 of a lockless cache in slab, for situations like IO polling.
It is untested and is just a simple proof of concept so far.

So there will be things to improve and erroneous code (I'm sure of it).
Any opinions or suggestions will be appreciated a lot!

v1 is here:
        https://lore.kernel.org/linux-mm/20210919164239.49905-1-42.hyeyoo@gmail.com/

Changes since v1:
        - It was implemented as a separate layer from slab,
                but it is now in slab.
        - Changed linked list to array

Things to think about, or things to work on:
        - Applying limit, batchcount like SLAB
        - I'm not sure whether it makes sense to implement it in SLOB/SLAB.
        - Can we improve its mechanism depending on SL[AOU]B?
        - Test needed
        - Finding and fixing erroneous code :(
---
 include/linux/slab.h     | 23 ++++++++++++++
 include/linux/slab_def.h |  2 ++
 include/linux/slub_def.h |  1 +
 mm/slab_common.c         | 66 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 083f3ce550bc..091f514dc8e0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -120,6 +120,9 @@
 /* Slab deactivation flag */
 #define SLAB_DEACTIVATED	((slab_flags_t __force)0x10000000U)
 
+/* use percpu lockless cache */
+#define SLAB_LOCKLESS_CACHE	((slab_flags_t __force)0x20000000U)
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
@@ -327,6 +330,13 @@ enum kmalloc_cache_type {
 	NR_KMALLOC_TYPES
 };
 
+#define KMEM_LOCKLESS_CACHE_QUEUE_SIZE 64
+
+struct kmem_lockless_cache {
+	void *queue[KMEM_LOCKLESS_CACHE_QUEUE_SIZE]; /* cached objects (LIFO) */
+	unsigned int size;	/* number of valid entries in queue[] */
+};
+
 #ifndef CONFIG_SLOB
 extern struct kmem_cache *
 kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];
@@ -429,6 +439,19 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
 void kmem_cache_free(struct kmem_cache *, void *);
 
+#ifndef CONFIG_SLOB
+
+void *kmem_cache_alloc_cached(struct kmem_cache *s, gfp_t gfpflags);
+void kmem_cache_free_cached(struct kmem_cache *s, void *p);
+
+#else
+
+#define kmem_cache_alloc_cached kmem_cache_alloc
+#define kmem_cache_free_cached kmem_cache_free
+
+#endif /* CONFIG_SLOB */
+
+
 /*
  * Bulk allocation and freeing operations. These are accelerated in an
  * allocator specific way to avoid taking locks repeatedly or building
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 3aa5e1e73ab6..9f3161f38a8a 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -85,6 +85,8 @@ struct kmem_cache {
 	unsigned int usersize;		/* Usercopy region size */
 
 	struct kmem_cache_node *node[MAX_NUMNODES];
+
+	struct kmem_lockless_cache __percpu *cache; /* percpu lockless cache */
 };
 
 static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 85499f0586b0..1dc3527efba8 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -96,6 +96,7 @@ struct kmem_cache {
 	unsigned int object_size;/* The size of an object without metadata */
 	struct reciprocal_value reciprocal_size;
 	unsigned int offset;	/* Free pointer offset */
+	struct kmem_lockless_cache __percpu *cache; /* percpu lockless cache */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	/* Number of per cpu partial objects to keep around */
 	unsigned int cpu_partial;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ec2bb0beed75..5b8e4d5a644d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -262,6 +262,13 @@ static struct kmem_cache *create_cache(const char *name,
 	s->useroffset = useroffset;
 	s->usersize = usersize;
 
+	if (flags & SLAB_LOCKLESS_CACHE) {
+		s->cache = alloc_percpu(struct kmem_lockless_cache);
+		if (!s->cache)
+			goto out_free_cache;
+		s->cache->size = 0;
+	}
+
 	err = __kmem_cache_create(s, flags);
 	if (err)
 		goto out_free_cache;
@@ -424,6 +431,57 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align,
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
+/**
+ * kmem_cache_alloc_cached - allocate an object from the percpu lockless cache
+ * @s: slab cache, created with SLAB_LOCKLESS_CACHE (BUG()s otherwise)
+ * @gfpflags: GFP flags passed to kmem_cache_alloc_bulk() on a refill
+ *
+ * Pop an object from this CPU's queue without taking a lock. If the queue is
+ * empty, refill it with kmem_cache_alloc_bulk() and pop from the new batch.
+ *
+ * No locking is done here; the caller must make sure there is no race on
+ * this CPU's queue.
+ *
+ * Return: a pointer to free object on allocation success, NULL on failure.
+ */
+void *kmem_cache_alloc_cached(struct kmem_cache *s, gfp_t gfpflags)
+{
+	struct kmem_lockless_cache *cache = this_cpu_ptr(s->cache);
+
+	BUG_ON(!(s->flags & SLAB_LOCKLESS_CACHE));
+
+	if (cache->size) /* fastpath: pop the top entry, no lock taken */
+		return cache->queue[--cache->size];
+
+	/* slowpath: queue is empty, ask the slab for a full queue of objects */
+	cache->size = kmem_cache_alloc_bulk(s, gfpflags,
+			KMEM_LOCKLESS_CACHE_QUEUE_SIZE, cache->queue);
+	if (cache->size)
+		return cache->queue[--cache->size];
+	else
+		return NULL; /* bulk allocation returned no objects */
+}
+EXPORT_SYMBOL(kmem_cache_alloc_cached);
+
+/**
+ * kmem_cache_free_cached - push an object onto this CPU's lockless cache
+ * @s: slab cache, created with SLAB_LOCKLESS_CACHE (BUG()s otherwise)
+ * @p: object to free; if the queue is full, its top entry goes back to @s
+ */
+void kmem_cache_free_cached(struct kmem_cache *s, void *p)
+{
+	struct kmem_lockless_cache *cache = this_cpu_ptr(s->cache);
+
+	BUG_ON(!(s->flags & SLAB_LOCKLESS_CACHE));
+
+	/* Queue full: free the top entry to make room. Is there a better way? */
+	if (cache->size == KMEM_LOCKLESS_CACHE_QUEUE_SIZE)
+		kmem_cache_free(s, cache->queue[--cache->size]);
+
+	cache->queue[cache->size++] = p;
+}
+EXPORT_SYMBOL(kmem_cache_free_cached);
+
 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 {
 	LIST_HEAD(to_destroy);
@@ -460,6 +518,8 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 
 static int shutdown_cache(struct kmem_cache *s)
 {
+	struct kmem_lockless_cache *cache;
+
 	/* free asan quarantined objects */
 	kasan_cache_shutdown(s);
 
@@ -468,6 +528,12 @@ static int shutdown_cache(struct kmem_cache *s)
 
 	list_del(&s->list);
 
+	if (s->flags & SLAB_LOCKLESS_CACHE) {
+		cache = this_cpu_ptr(s->cache);
+		kmem_cache_free_bulk(s, cache->size, cache->queue);
+		free_percpu(s->cache);
+	}
+
 	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
 #ifdef SLAB_SUPPORTS_SYSFS
 		sysfs_slab_unlink(s);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread