linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: torvalds@linux-foundation.org
Cc: corbet@lwn.net, will@kernel.org, peterz@infradead.org,
	boqun.feng@gmail.com, mark.rutland@arm.com,
	catalin.marinas@arm.com, dennis@kernel.org, tj@kernel.org,
	cl@linux.com, hca@linux.ibm.com, gor@linux.ibm.com,
	agordeev@linux.ibm.com, borntraeger@linux.ibm.com,
	svens@linux.ibm.com, Herbert Xu <herbert@gondor.apana.org.au>,
	davem@davemloft.net, tglx@linutronix.de, mingo@redhat.com,
	bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org,
	hpa@zytor.com, joro@8bytes.org, suravee.suthikulpanit@amd.com,
	robin.murphy@arm.com, dwmw2@infradead.org,
	baolu.lu@linux.intel.com, Arnd Bergmann <arnd@arndb.de>,
	penberg@kernel.org, rientjes@google.com, iamjoonsoo.kim@lge.com,
	Andrew Morton <akpm@linux-foundation.org>,
	vbabka@suse.cz, roman.gushchin@linux.dev, 42.hyeyoo@gmail.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-s390@vger.kernel.org,
	linux-crypto@vger.kernel.org, iommu@lists.linux.dev,
	linux-arch@vger.kernel.org
Subject: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()
Date: Mon, 19 Dec 2022 16:35:36 +0100	[thread overview]
Message-ID: <20221219154119.550996611@infradead.org> (raw)
In-Reply-To: 20221219153525.632521981@infradead.org


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/slub_def.h |   12 ++-
 mm/slab.h                |   41 +++++++++++--
 mm/slub.c                |  146 ++++++++++++++++++++++++++++-------------------
 3 files changed, 135 insertions(+), 64 deletions(-)

--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -39,15 +39,21 @@ enum stat_item {
 	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
 	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
 	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
-	NR_SLUB_STAT_ITEMS };
+	NR_SLUB_STAT_ITEMS
+};
 
 /*
  * When changing the layout, make sure freelist and tid are still compatible
  * with this_cpu_cmpxchg_double() alignment requirements.
  */
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to next available object */
-	unsigned long tid;	/* Globally unique transaction id */
+	union {
+		struct {
+			void **freelist;	/* Pointer to next available object */
+			unsigned long tid;	/* Globally unique transaction id */
+		};
+		freelist_aba_t freelist_tid;
+	};
 	struct slab *slab;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct slab *partial;	/* Partially allocated frozen slabs */
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -5,6 +5,32 @@
  * Internal slab definitions
  */
 
+/*
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
+ */
+typedef union {
+	struct {
+		void *freelist;
+		unsigned long counter;
+	};
+#ifdef CONFIG_64BIT
+	u128 full;
+#else
+	u64 full;
+#endif
+} freelist_aba_t;
+
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# endif
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# endif
+#endif /* CONFIG_64BIT */
+
 /* Reuses the bits in struct page */
 struct slab {
 	unsigned long __page_flags;
@@ -34,14 +60,19 @@ struct slab {
 	};
 	struct kmem_cache *slab_cache;
 	/* Double-word boundary */
-	void *freelist;		/* first free object */
 	union {
-		unsigned long counters;
 		struct {
-			unsigned inuse:16;
-			unsigned objects:15;
-			unsigned frozen:1;
+			void *freelist;		/* first free object */
+			union {
+				unsigned long counters;
+				struct {
+					unsigned inuse:16;
+					unsigned objects:15;
+					unsigned frozen:1;
+				};
+			};
 		};
+		freelist_aba_t freelist_counter;
 	};
 	unsigned int __unused;
 
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -280,7 +280,13 @@ static inline bool kmem_cache_has_cpu_pa
 /* Poison object */
 #define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
 /* Use cmpxchg_double */
+
+#if defined(system_has_freelist_aba) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 #define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0U)
+#endif
 
 /*
  * Tracking user of a slab.
@@ -496,6 +502,47 @@ static __always_inline void slab_unlock(
 	__bit_spin_unlock(PG_locked, &page->flags);
 }
 
+static inline bool
+__update_freelist_fast(struct slab *slab,
+		      void *freelist_old, unsigned long counters_old,
+		      void *freelist_new, unsigned long counters_new)
+{
+
+	bool ret = false;
+
+#ifdef system_has_freelist_aba
+	freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+	freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+#ifdef CONFIG_64BIT
+	ret = try_cmpxchg128(&slab->freelist_counter.full, &old.full, new.full);
+#else
+	ret = try_cmpxchg64(&slab->freelist_counter.full, &old.full, new.full);
+#endif
+#endif /* system_has_freelist_aba */
+
+	return ret;
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+		      void *freelist_old, unsigned long counters_old,
+		      void *freelist_new, unsigned long counters_new)
+{
+	bool ret = false;
+
+	slab_lock(slab);
+	if (slab->freelist == freelist_old &&
+	    slab->counters == counters_old) {
+		slab->freelist = freelist_new;
+		slab->counters = counters_new;
+		ret = true;
+	}
+	slab_unlock(slab);
+
+	return ret;
+}
+
 /*
  * Interrupts must be disabled (for the fallback code to work right), typically
  * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -503,33 +550,25 @@ static __always_inline void slab_unlock(
  * allocation/ free operation in hardirq context. Therefore nothing can
  * interrupt the operation.
  */
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 		void *freelist_old, unsigned long counters_old,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
+	bool ret;
+
 	if (USE_LOCKLESS_FAST_PATH())
 		lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
 	if (s->flags & __CMPXCHG_DOUBLE) {
-		if (cmpxchg_double(&slab->freelist, &slab->counters,
-				   freelist_old, counters_old,
-				   freelist_new, counters_new))
-			return true;
-	} else
-#endif
-	{
-		slab_lock(slab);
-		if (slab->freelist == freelist_old &&
-					slab->counters == counters_old) {
-			slab->freelist = freelist_new;
-			slab->counters = counters_new;
-			slab_unlock(slab);
-			return true;
-		}
-		slab_unlock(slab);
+		ret = __update_freelist_fast(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
+	} else {
+		ret = __update_freelist_slow(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
 	}
+	if (likely(ret))
+		return true;
 
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -541,36 +580,26 @@ static inline bool __cmpxchg_double_slab
 	return false;
 }
 
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
 		void *freelist_old, unsigned long counters_old,
 		void *freelist_new, unsigned long counters_new,
 		const char *n)
 {
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	bool ret;
+
 	if (s->flags & __CMPXCHG_DOUBLE) {
-		if (cmpxchg_double(&slab->freelist, &slab->counters,
-				   freelist_old, counters_old,
-				   freelist_new, counters_new))
-			return true;
-	} else
-#endif
-	{
+		ret = __update_freelist_fast(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
+	} else {
 		unsigned long flags;
 
 		local_irq_save(flags);
-		slab_lock(slab);
-		if (slab->freelist == freelist_old &&
-					slab->counters == counters_old) {
-			slab->freelist = freelist_new;
-			slab->counters = counters_new;
-			slab_unlock(slab);
-			local_irq_restore(flags);
-			return true;
-		}
-		slab_unlock(slab);
+		ret = __update_freelist_slow(slab, freelist_old, counters_old,
+				            freelist_new, counters_new);
 		local_irq_restore(flags);
 	}
+	if (likely(ret))
+		return true;
 
 	cpu_relax();
 	stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -2168,7 +2197,7 @@ static inline void *acquire_slab(struct
 	VM_BUG_ON(new.frozen);
 	new.frozen = 1;
 
-	if (!__cmpxchg_double_slab(s, slab,
+	if (!__slab_update_freelist(s, slab,
 			freelist, counters,
 			new.freelist, new.counters,
 			"acquire_slab"))
@@ -2500,7 +2529,7 @@ static void deactivate_slab(struct kmem_
 	}
 
 
-	if (!cmpxchg_double_slab(s, slab,
+	if (!slab_update_freelist(s, slab,
 				old.freelist, old.counters,
 				new.freelist, new.counters,
 				"unfreezing slab")) {
@@ -2561,7 +2590,7 @@ static void __unfreeze_partials(struct k
 
 			new.frozen = 0;
 
-		} while (!__cmpxchg_double_slab(s, slab,
+		} while (!__slab_update_freelist(s, slab,
 				old.freelist, old.counters,
 				new.freelist, new.counters,
 				"unfreezing slab"));
@@ -3022,7 +3051,7 @@ static inline void *get_freelist(struct
 		new.inuse = slab->objects;
 		new.frozen = freelist != NULL;
 
-	} while (!__cmpxchg_double_slab(s, slab,
+	} while (!__slab_update_freelist(s, slab,
 		freelist, counters,
 		NULL, new.counters,
 		"get_freelist"));
@@ -3295,6 +3324,18 @@ static __always_inline void maybe_wipe_o
 			0, sizeof(void *));
 }
 
+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+			   void *freelist_old, void *freelist_new,
+			   unsigned long tid)
+{
+	freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+	freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+	return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
+				old.full, new.full) == old.full;
+}
+
 /*
  * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
  * have the fastpath folded into their functions. So no function call
@@ -3379,11 +3420,7 @@ static __always_inline void *slab_alloc_
 		 * against code executing on this cpu *not* from access by
 		 * other cpus.
 		 */
-		if (unlikely(!this_cpu_cmpxchg_double(
-				s->cpu_slab->freelist, s->cpu_slab->tid,
-				object, tid,
-				next_object, next_tid(tid)))) {
-
+		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
 			note_cmpxchg_failure("slab_alloc", s, tid);
 			goto redo;
 		}
@@ -3517,7 +3554,7 @@ static void __slab_free(struct kmem_cach
 			}
 		}
 
-	} while (!cmpxchg_double_slab(s, slab,
+	} while (!slab_update_freelist(s, slab,
 		prior, counters,
 		head, new.counters,
 		"__slab_free"));
@@ -3621,11 +3658,7 @@ static __always_inline void do_slab_free
 
 		set_freepointer(s, tail_obj, freelist);
 
-		if (unlikely(!this_cpu_cmpxchg_double(
-				s->cpu_slab->freelist, s->cpu_slab->tid,
-				freelist, tid,
-				head, next_tid(tid)))) {
-
+		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
@@ -4319,11 +4352,12 @@ static int kmem_cache_open(struct kmem_c
 		}
 	}
 
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+#if defined(system_has_freelist_aba) && \
     defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
-	if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
 		/* Enable fast mode */
 		s->flags |= __CMPXCHG_DOUBLE;
+	}
 #endif
 
 	/*



  parent reply	other threads:[~2022-12-19 15:47 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-12-19 15:35 [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 01/12] crypto: Remove u128 usage Peter Zijlstra
2022-12-19 15:56   ` Jason A. Donenfeld
2022-12-19 17:00     ` Peter Zijlstra
2022-12-19 17:03       ` Jason A. Donenfeld
2022-12-20  3:50         ` Herbert Xu
2022-12-20  4:11           ` H. Peter Anvin
2022-12-20  4:15             ` Herbert Xu
2022-12-19 15:35 ` [RFC][PATCH 02/12] crypto/ghash-clmulni: Use (struct) be128 Peter Zijlstra
2022-12-20  5:45   ` Eric Biggers
2022-12-19 15:35 ` [RFC][PATCH 03/12] crypto/b128ops: Remove struct u128 Peter Zijlstra
2022-12-20  5:52   ` Eric Biggers
2022-12-19 15:35 ` [RFC][PATCH 04/12] types: Introduce [us]128 Peter Zijlstra
2022-12-29  8:30   ` Pavel Machek
2022-12-19 15:35 ` [RFC][PATCH 05/12] arch: Introduce arch_{,try_}_cmpxchg128{,_local}() Peter Zijlstra
2022-12-19 20:07   ` Boqun Feng
2022-12-20 11:08     ` Peter Zijlstra
2022-12-20 14:31       ` Linus Torvalds
2022-12-20 15:09         ` Peter Zijlstra
2023-01-03 13:25       ` Mark Rutland
2023-01-03 14:03         ` Mark Rutland
2023-01-03 16:19           ` Mark Rutland
2023-01-03 16:50             ` Arnd Bergmann
2023-01-04 11:36               ` Mark Rutland
2023-01-04 13:55                 ` Mark Rutland
2022-12-22  1:25   ` Boqun Feng
2022-12-22 13:16     ` Peter Zijlstra
2023-01-03 17:12   ` Heiko Carstens
2023-01-09 18:50   ` Mark Rutland
2023-01-12 10:35     ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 06/12] instrumentation: Wire up cmpxchg128() Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 07/12] percpu: Wire up cmpxchg128 Peter Zijlstra
2022-12-29 13:36   ` Arnd Bergmann
2023-01-04 12:09   ` Heiko Carstens
2023-01-09 16:29     ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 08/12] s390: Replace cmpxchg_double() with cmpxchg128() Peter Zijlstra
2023-01-10  7:23   ` Heiko Carstens
2023-01-10  8:32     ` Peter Zijlstra
2023-01-10 11:27       ` Mark Rutland
2023-01-10 11:46       ` Heiko Carstens
2023-01-12 11:12         ` Alexander Gordeev
2022-12-19 15:35 ` [RFC][PATCH 09/12] x86,amd_iommu: Replace cmpxchg_double() Peter Zijlstra
2022-12-19 16:47   ` Niklas Schnelle
2022-12-28  8:40   ` Vasant Hegde
2022-12-19 15:35 ` [RFC][PATCH 10/12] x86,intel_iommu: " Peter Zijlstra
2022-12-19 15:35 ` Peter Zijlstra [this message]
2023-01-03 15:58   ` [RFC][PATCH 11/12] slub: " Vlastimil Babka
2023-01-03 17:16   ` Heiko Carstens
2023-01-03 19:08     ` Linus Torvalds
2023-01-04 12:07       ` Heiko Carstens
2023-01-09 16:28       ` Peter Zijlstra
2023-01-09 22:02         ` Linus Torvalds
2023-01-09 22:22           ` H. Peter Anvin
2023-01-10  2:09             ` H. Peter Anvin
2023-01-10 10:28           ` Peter Zijlstra
2022-12-19 15:35 ` [RFC][PATCH 12/12] arch: Remove cmpxchg_double Peter Zijlstra
2022-12-22  1:21 ` [RFC][PATCH 00/12] Introduce cmpxchg128() -- aka. the demise of cmpxchg_double() Boqun Feng

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221219154119.550996611@infradead.org \
    --to=peterz@infradead.org \
    --cc=42.hyeyoo@gmail.com \
    --cc=agordeev@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=arnd@arndb.de \
    --cc=baolu.lu@linux.intel.com \
    --cc=boqun.feng@gmail.com \
    --cc=borntraeger@linux.ibm.com \
    --cc=bp@alien8.de \
    --cc=catalin.marinas@arm.com \
    --cc=cl@linux.com \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=davem@davemloft.net \
    --cc=dennis@kernel.org \
    --cc=dwmw2@infradead.org \
    --cc=gor@linux.ibm.com \
    --cc=hca@linux.ibm.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=hpa@zytor.com \
    --cc=iamjoonsoo.kim@lge.com \
    --cc=iommu@lists.linux.dev \
    --cc=joro@8bytes.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mingo@redhat.com \
    --cc=penberg@kernel.org \
    --cc=rientjes@google.com \
    --cc=robin.murphy@arm.com \
    --cc=roman.gushchin@linux.dev \
    --cc=suravee.suthikulpanit@amd.com \
    --cc=svens@linux.ibm.com \
    --cc=tglx@linutronix.de \
    --cc=tj@kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vbabka@suse.cz \
    --cc=will@kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).