* [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops
@ 2010-12-02 21:53 Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 1/8] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
                   ` (8 more replies)
  0 siblings, 9 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

This is a patchset on top of the thiscpuops upgrade patchset. It
adds cmpxchg and xchg support and uses it to improve various
kernel subsystems.

For vm statistics we can avoid the interrupt disable/enable pair, which
reduces the latency of the vm counter updates to a fraction of its former value.

The cycle counts of the slub fastpaths and slowpaths are reduced. In
particular it is possible to improve the performance of the free slowpath.
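
A rough illustration of the pattern the series enables (the per cpu counter
here is hypothetical, not code from these patches): instead of bracketing a
read-modify-write with local_irq_save()/local_irq_restore(), an update can
retry a single per cpu cmpxchg:

	DEFINE_PER_CPU(long, my_counter);	/* hypothetical counter */

	static void my_counter_add(long delta)
	{
		long old;

		/*
		 * No interrupt disable/enable. This is only atomic vs. local
		 * interrupts where the arch implements the per cpu cmpxchg as
		 * a single instruction (the CONFIG_CMPXCHG_LOCAL case); the
		 * generic fallback only disables preemption.
		 */
		do {
			old = this_cpu_read(my_counter);
		} while (this_cpu_cmpxchg(my_counter, old, old + delta) != old);
	}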



* [rfc: cpuops adv V1 1/8] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg Christoph Lameter
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_cmpxchg --]
[-- Type: text/plain, Size: 10952 bytes --]

Provide generic support for this_cpu_cmpxchg(), this_cpu_xchg() and
this_cpu_cmpxchg_double(). Arch code can override the size specific
variants with (local atomic) instructions; the generic fallbacks disable
preemption or interrupts as needed.

V2->V3:
	- Clean up some parameters
	- Provide implementation of irqsafe_cpu_cmpxchg

Signed-off-by: Christoph Lameter <cl@linux.com>
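
For reference, a minimal usage sketch of the two primitives added below
(the structure and function here are hypothetical, not part of the patch).
this_cpu_cmpxchg() behaves like cmpxchg() on a per cpu scalar and returns
the value found; this_cpu_cmpxchg_double() takes a pointer to the first of
two adjacent, equally sized per cpu words and returns 1 on success:

	struct pair {			/* hypothetical layout */
		void *ptr;		/* first word */
		unsigned long seq;	/* second, adjacent, same sized word */
	};
	static DEFINE_PER_CPU(struct pair, my_pair);

	static int publish(void *old_ptr, unsigned long old_seq, void *new_ptr)
	{
		/* Replace both words only if neither changed in the meantime */
		return this_cpu_cmpxchg_double(&my_pair.ptr,
					old_ptr, old_seq,
					new_ptr, old_seq + 1);
	}

(With the x86 implementation later in the series the pair must additionally
be aligned to 2 * sizeof(void *).)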

---
 include/linux/percpu.h |  258 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 257 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-11-30 14:06:56.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-11-30 14:21:43.000000000 -0600
@@ -259,6 +259,22 @@ extern void __bad_size_call_parameter(vo
 	ret__;								\
 })
 
+/* Special handling for cmpxchg_double */
+#define __pcpu_size_call_return_int(stem, pcp, ...)			\
+({									\
+	int ret__;							\
+	__verify_pcpu_ptr(pcp);						\
+	switch(sizeof(*pcp)) {						\
+	case 1: ret__ = stem##1(pcp, __VA_ARGS__);break;		\
+	case 2: ret__ = stem##2(pcp, __VA_ARGS__);break;		\
+	case 4: ret__ = stem##4(pcp, __VA_ARGS__);break;		\
+	case 8: ret__ = stem##8(pcp, __VA_ARGS__);break;		\
+	default:							\
+		__bad_size_call_parameter();break;			\
+	}								\
+	ret__;								\
+})
+
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -322,6 +338,185 @@ do {									\
 # define this_cpu_read(pcp)	__pcpu_size_call_return(this_cpu_read_, (pcp))
 #endif
 
+#define __this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	ret__;								\
+})
+
+#ifndef __this_cpu_xchg
+# ifndef __this_cpu_xchg_1
+#  define __this_cpu_xchg_1(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_2
+#  define __this_cpu_xchg_2(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_4
+#  define __this_cpu_xchg_4(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_8
+#  define __this_cpu_xchg_8(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# define __this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_xchg
+# ifndef this_cpu_xchg_1
+#  define this_cpu_xchg_1(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_2
+#  define this_cpu_xchg_2(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_4
+#  define this_cpu_xchg_4(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_8
+#  define this_cpu_xchg_8(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg
+# ifndef this_cpu_cmpxchg_1
+#  define this_cpu_cmpxchg_1(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_2
+#  define this_cpu_cmpxchg_2(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_4
+#  define this_cpu_cmpxchg_4(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_8
+#  define this_cpu_cmpxchg_8(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+#define __this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	ret__;								\
+})
+
+#ifndef __this_cpu_cmpxchg
+# ifndef __this_cpu_cmpxchg_1
+#  define __this_cpu_cmpxchg_1(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_2
+#  define __this_cpu_cmpxchg_2(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_4
+#  define __this_cpu_cmpxchg_4(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_8
+#  define __this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define __this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(\
+				__this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+/*
+ * cmpxchg_double replaces two adjacent scalars at once. The first parameter
+ * passed is a percpu pointer, not a scalar like the other this_cpu
+ * operations. This is so because the function operates on two scalars
+ * (must be of same size). A truth value is returned to indicate success or
+ * failure (since a double register result is difficult to handle).
+ * There is very limited hardware support for these operations. So only certain
+ * sizes may work.
+ */
+#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	typeof(oval2) * __percpu pcp2 = (typeof(oval2) *)((pcp) + 1);	\
+	int __ret = 0;							\
+	if (__this_cpu_read(*pcp) == (oval1) &&				\
+			 __this_cpu_read(*pcp2)  == (oval2)) {		\
+		__this_cpu_write(*pcp, (nval1));			\
+		__this_cpu_write(*pcp2, (nval2));			\
+		__ret = 1;						\
+	}								\
+	(__ret);							\
+})
+
+#ifndef __this_cpu_cmpxchg_double
+# ifndef __this_cpu_cmpxchg_double_1
+#  define __this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_2
+#  define __this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_4
+#  define __this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_8
+#  define __this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define __this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(__this_cpu_cmpxchg_double_, (pcp),	\
+					 oval1, oval2, nval1, nval2)
+#endif
+
+#define _this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	preempt_disable();						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg_double
+# ifndef this_cpu_cmpxchg_double_1
+#  define this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_2
+#  define this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_4
+#  define this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_8
+#  define this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(this_cpu_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
+
+
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -610,7 +805,7 @@ do {									\
  * IRQ safe versions of the per cpu RMW operations. Note that these operations
  * are *not* safe against modification of the same variable from another
  * processors (which one gets when using regular atomic operations)
- . They are guaranteed to be atomic vs. local interrupts and
+ * They are guaranteed to be atomic vs. local interrupts and
  * preemption only.
  */
 #define irqsafe_cpu_generic_to_op(pcp, val, op)				\
@@ -697,4 +892,65 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif
 
+#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg
+# ifndef irqsafe_cpu_cmpxchg_1
+#  define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_2
+#  define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_4
+#  define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_8
+#  define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define irqsafe_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+#define irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg_double
+# ifndef irqsafe_cpu_cmpxchg_double_1
+#  define irqsafe_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_2
+#  define irqsafe_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_4
+#  define irqsafe_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_8
+#  define irqsafe_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define irqsafe_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
 #endif /* __LINUX_PERCPU_H */



* [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 1/8] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 22:06   ` [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_atomic_fallback --]
[-- Type: text/plain, Size: 1922 bytes --]

Fall back to the atomic xchg() and cmpxchg() functions for the generic
versions of this_cpu_xchg, this_cpu_cmpxchg and irqsafe_cpu_cmpxchg
instead of open coding them with preemption or interrupt disabling.

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-12-02 12:17:14.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-12-02 12:17:30.000000000 -0600
@@ -361,14 +361,7 @@ do {									\
 # define __this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval)
 #endif
 
-#define _this_cpu_generic_xchg(pcp, nval)				\
-({	typeof(pcp) ret__;						\
-	preempt_disable();						\
-	ret__ = __this_cpu_read(pcp);					\
-	__this_cpu_write(pcp, nval);					\
-	preempt_enable();						\
-	ret__;								\
-})
+#define _this_cpu_generic_xchg(pcp, nval)	xchg(__this_cpu_ptr(&(pcp)), nval)
 
 #ifndef this_cpu_xchg
 # ifndef this_cpu_xchg_1
@@ -386,15 +379,7 @@ do {									\
 # define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, (pcp), nval)
 #endif
 
-#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({	typeof(pcp) ret__;						\
-	preempt_disable();						\
-	ret__ = __this_cpu_read(pcp);					\
-	if (ret__ == (oval))						\
-		__this_cpu_write(pcp, nval);				\
-	preempt_enable();						\
-	ret__;								\
-})
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)	cmpxchg(__this_cpu_ptr(&(pcp)), oval, nval)
 
 #ifndef this_cpu_cmpxchg
 # ifndef this_cpu_cmpxchg_1
@@ -892,17 +877,7 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif
 
-#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({									\
-	typeof(pcp) ret__;						\
-	unsigned long flags;						\
-	local_irq_save(flags);						\
-	ret__ = __this_cpu_read(pcp);					\
-	if (ret__ == (oval))						\
-		__this_cpu_write(pcp, nval);				\
-	local_irq_restore(flags);					\
-	ret__;								\
-})
+#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)	cmpxchg(__this_cpu_ptr(&(pcp)), oval, nval)
 
 #ifndef irqsafe_cpu_cmpxchg
 # ifndef irqsafe_cpu_cmpxchg_1



* [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 1/8] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-06 17:14   ` Avi Kivity
  2010-12-02 21:53 ` [rfc: cpuops adv V1 4/8] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_cmpxchg_x86 --]
[-- Type: text/plain, Size: 11154 bytes --]

Provide support as far as the hardware capabilities of the x86 cpus
allow.

V1->V2:
	- Mark %rdx clobbering during cmpxchg16b
	- Provide emulation of cmpxchg16b for early AMD processors

Signed-off-by: Christoph Lameter <cl@linux.com>
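
A practical note on the 16 byte variant: cmpxchg16b requires its memory
operand to be aligned to 2 * sizeof(void *), so dynamically allocated per
cpu data used with cmpxchg_double must request that alignment explicitly
instead of relying on the default alloc_percpu() alignment. A sketch of
what a caller would do (hypothetical structure and helper; the slub patch
later in this series does the same):

	struct my_percpu {		/* hypothetical per cpu structure */
		void *ptr;
		unsigned long tid;
	};

	static struct my_percpu __percpu *my_alloc(void)
	{
		/* Align to a double word boundary for cmpxchg16b */
		return __alloc_percpu(sizeof(struct my_percpu),
					2 * sizeof(void *));
	}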

---
 arch/x86/Kconfig.cpu          |    3 
 arch/x86/include/asm/percpu.h |  183 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/lib/Makefile         |    1 
 arch/x86/lib/cmpxchg16b_emu.S |   55 ++++++++++++
 4 files changed, 238 insertions(+), 4 deletions(-)

Index: linux-2.6/arch/x86/lib/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile	2010-11-30 15:14:05.000000000 -0600
+++ linux-2.6/arch/x86/lib/Makefile	2010-12-01 09:50:55.000000000 -0600
@@ -42,4 +42,5 @@ else
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 	lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+	lib-y += cmpxchg16b_emu.o
 endif
Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2010-12-01 09:50:55.000000000 -0600
@@ -0,0 +1,55 @@
+/*
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; version 2
+ *	of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(cmpxchg16b_local_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
+# al not via the ZF. Caller will access al to get result.
+#
+
+	pushf
+	cli
+
+	cmpq  %gs:(%rsi), %rax
+	jne not_same
+	cmpq %gs:8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx,  %gs:(%rsi)
+	movq %rcx, %gs:8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+ not_same:
+	popf
+	xor  %al,%al
+	ret
+
+CFI_ENDPROC
+ENDPROC(cmpxchg16b_local_emu)
Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-01 09:47:44.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-01 09:50:55.000000000 -0600
@@ -212,6 +212,83 @@ do {									\
 	ret__;								\
 })
 
+/*
+ * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * full lock semantics even though they are not needed.
+ */
+#define percpu_xchg_op(var, nval)					\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
+/*
+ * cmpxchg has no such implied lock semantics. As a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __old = (oval);					\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -335,14 +412,76 @@ do {									\
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
 
+#define __this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_4(pcp, val)	percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_4(pcp, val)		percpu_add_return_op(pcp, val)
-#endif
+#define this_cpu_add_return_1(pcp, val)		percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_2(pcp, val)		percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_4(pcp, val)		percpu_add_return_op((pcp), val)
+
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#endif /* !CONFIG_M386 */
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy = n2;						\
+	asm("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
+		    : "=a"(__ret), "=m" (*pcp), "=d"(__dummy)		\
+		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
+
+#ifndef CONFIG_X86_64
+#ifdef CONFIG_X86_CMPXCHG64
+/* We can support an 8 byte cmpxchg with a special instruction on 32 bit */
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)				\
+({									\
+	typeof(pcp) __ret;						\
+	typeof(pcp) __old = (oval);					\
+	typeof(pcp) __new = (nval);					\
+	asm("cmpxchg8b "__percpu_arg(1)					\
+	    : "=A" (__ret), "+m" (pcp)					\
+	    : "b" ((u32)__new), "c" ((u32)(__new >> 32)), "0" (__old)	\
+	    : "memory");						\
+	__ret;								\
+})
+
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+
+#endif /* CONFIG_X86_CMPXCHG64 */
+#endif /* !CONFIG_X86_64 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -370,6 +509,42 @@ do {									\
 #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+
+/*
+ * Pretty complex macro to generate the cmpxchg16b instruction. The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software. The address used in the cmpxchg16b instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy;						\
+	VM_BUG_ON(((unsigned long)pcp) % 16);				\
+	alternative_io("call cmpxchg16b_local_emu\n\t" P6_NOP4,		\
+			"cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t",	\
+			X86_FEATURE_CX16,				\
+		    	ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),	\
+		        "S" (pcp), "b"(__n1), "c"(__n2),		\
+			 "a"(__o1), "d"(__o2));				\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/Kconfig.cpu
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig.cpu	2010-12-01 10:04:37.000000000 -0600
+++ linux-2.6/arch/x86/Kconfig.cpu	2010-12-01 10:05:18.000000000 -0600
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC



* [rfc: cpuops adv V1 4/8] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (2 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 5/8] vmstat: Use per cpu atomics to avoid interrupt disable / enable Christoph Lameter
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, Peter Zijlstra, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: irq --]
[-- Type: text/plain, Size: 2112 bytes --]

The irq work queue is a per cpu object and it is sufficient for
synchronization if per cpu atomics are used. Doing so simplifies
the code and reduces its overhead.

Before:

christoph@linux-2.6$ size kernel/irq_work.o
   text	   data	    bss	    dec	    hex	filename
    451	      8	      1	    460	    1cc	kernel/irq_work.o

After:

christoph@linux-2.6$ size kernel/irq_work.o 
   text	   data	    bss	    dec	    hex	filename
    438	      8	      1	    447	    1bf	kernel/irq_work.o

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Christoph Lameter <cl@linux.com>

---
 kernel/irq_work.c |   18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

Index: linux-2.6/kernel/irq_work.c
===================================================================
--- linux-2.6.orig/kernel/irq_work.c	2010-12-02 12:16:54.000000000 -0600
+++ linux-2.6/kernel/irq_work.c	2010-12-02 12:18:26.000000000 -0600
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
  */
 static void __irq_work_queue(struct irq_work *entry)
 {
-	struct irq_work **head, *next;
+	struct irq_work *next;
 
-	head = &get_cpu_var(irq_work_list);
+	preempt_disable();
 
 	do {
-		next = *head;
+		next = __this_cpu_read(irq_work_list);
 		/* Can assign non-atomic because we keep the flags set. */
 		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(head, next, entry) != next);
+	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
 
 	/* The list was empty, raise self-interrupt to start processing. */
 	if (!irq_work_next(entry))
 		arch_irq_work_raise();
 
-	put_cpu_var(irq_work_list);
+	preempt_enable();
 }
 
 /*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list, **head;
+	struct irq_work *list;
 
-	head = &__get_cpu_var(irq_work_list);
-	if (*head == NULL)
+	if (this_cpu_read(irq_work_list) == NULL)
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = xchg(head, NULL);
+	list = this_cpu_xchg(irq_work_list, NULL);
+
 	while (list != NULL) {
 		struct irq_work *entry = list;
 



* [rfc: cpuops adv V1 5/8] vmstat: Use per cpu atomics to avoid interrupt disable / enable
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (3 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 4/8] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 6/8] Lockless (and preemptless) fastpaths for slub Christoph Lameter
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: vmstat_group --]
[-- Type: text/plain, Size: 4661 bytes --]

Currently the operations to increment vm counters must disable interrupts
in order to not mess up their housekeeping of counters.

So use this_cpu_cmpxchg() to avoid the overhead. Since we can no longer
count on preemption being disabled we still have some minor issues:
the fetching of the counter thresholds is racy. A threshold from another
cpu may be applied if we happen to be rescheduled on another cpu.
However, the following vmstat operation will then bring the counter
back under the threshold limit.

The operations for __xxx_zone_state are not changed since the caller
has taken care of the synchronization needs (and therefore the cycle
count is even less than the optimized version for the irq disable case
provided here).

The optimization using this_cpu_cmpxchg will only be used if the arch
supports efficient this_cpu_ops (must have CONFIG_CMPXCHG_LOCAL set!)

The use of this_cpu_cmpxchg reduces the cycle count for the counter
operations by about 80% (inc_zone_page_state goes from 170 cycles to 32).

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/vmstat.c |  101 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 14 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-12-01 10:04:19.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-12-01 10:09:06.000000000 -0600
@@ -185,20 +185,6 @@ void __mod_zone_page_state(struct zone *
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-					int delta)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_zone_page_state(zone, item, delta);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
@@ -265,6 +251,92 @@ void __dec_zone_page_state(struct page *
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstepping should be handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1);
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -295,6 +367,7 @@ void dec_zone_page_state(struct page *pa
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif
 
 /*
  * Update the zone counters for one cpu.



* [rfc: cpuops adv V1 6/8] Lockless (and preemptless) fastpaths for slub
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (4 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 5/8] vmstat: Use per cpu atomics to avoid interrupt disable / enable Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 7/8] slub: Add PageSlubPartial Christoph Lameter
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: slub_generation --]
[-- Type: text/plain, Size: 14008 bytes --]

Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.

A transaction id is a "long" integer that is comprised of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. The cmpxchg instruction can therefore verify that nothing
interfered with an update, that we are updating the per cpu structure of the
processor where we picked up the information, and that we are still on that
processor when we perform the update.
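
As a worked example (assuming CONFIG_PREEMPT and CONFIG_NR_CPUS = 256, so
TID_STEP = 256): cpu 3 starts with tid 3 and advances 3 -> 259 -> 515 ...,
while cpu 5 starts with tid 5. tid % TID_STEP recovers the cpu and
tid / TID_STEP the event count, so a cmpxchg against a saved tid fails both
when another operation ran on the same cpu in the meantime and when the
thread was migrated to a different cpu.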

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels since this
is lockless and does not require the use of the same per cpu area over the
whole critical section. It is only important that the per cpu area is
current at the beginning of the critical section and at the end.

So there is no need even to disable preemption, which will make the
allocations scale well in an RT environment.

Test results show that the fastpath cycle count is reduced by up to ~ 40%
(alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree
adds a few cycles.

Sadly this does nothing for the slowpath, which is where the main performance
issues in slub are, but the best case performance rises significantly.
(The slowpath is addressed in the following patches.)

Kmalloc: alloc/free test

Before:

10000 times kmalloc(8)/kfree -> 142 cycles
10000 times kmalloc(16)/kfree -> 142 cycles
10000 times kmalloc(32)/kfree -> 142 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 142 cycles
10000 times kmalloc(256)/kfree -> 140 cycles
10000 times kmalloc(512)/kfree -> 140 cycles
10000 times kmalloc(1024)/kfree -> 144 cycles
10000 times kmalloc(2048)/kfree -> 144 cycles
10000 times kmalloc(4096)/kfree -> 144 cycles
10000 times kmalloc(8192)/kfree -> 144 cycles
10000 times kmalloc(16384)/kfree -> 913 cycles

After:

10000 times kmalloc(8)/kfree -> 81 cycles
10000 times kmalloc(16)/kfree -> 81 cycles
10000 times kmalloc(32)/kfree -> 81 cycles
10000 times kmalloc(64)/kfree -> 81 cycles
10000 times kmalloc(128)/kfree -> 81 cycles
10000 times kmalloc(256)/kfree -> 87 cycles
10000 times kmalloc(512)/kfree -> 87 cycles
10000 times kmalloc(1024)/kfree -> 87 cycles
10000 times kmalloc(2048)/kfree -> 84 cycles
10000 times kmalloc(4096)/kfree -> 81 cycles
10000 times kmalloc(8192)/kfree -> 81 cycles
10000 times kmalloc(16384)/kfree -> 927 cycles


Kmalloc: Repeatedly allocate then free test

Before:

10000 times kmalloc(8) -> 102 cycles kfree -> 111 cycles
10000 times kmalloc(16) -> 101 cycles kfree -> 111 cycles
10000 times kmalloc(32) -> 120 cycles kfree -> 114 cycles
10000 times kmalloc(64) -> 161 cycles kfree -> 130 cycles
10000 times kmalloc(128) -> 284 cycles kfree -> 129 cycles
10000 times kmalloc(256) -> 410 cycles kfree -> 134 cycles
10000 times kmalloc(512) -> 312 cycles kfree -> 197 cycles
10000 times kmalloc(1024) -> 377 cycles kfree -> 494 cycles
10000 times kmalloc(2048) -> 571 cycles kfree -> 522 cycles
10000 times kmalloc(4096) -> 674 cycles kfree -> 565 cycles
10000 times kmalloc(8192) -> 836 cycles kfree -> 648 cycles
10000 times kmalloc(16384) -> 1201 cycles kfree -> 775 cycles

After:

10000 times kmalloc(8) -> 69 cycles kfree -> 115 cycles
10000 times kmalloc(16) -> 73 cycles kfree -> 115 cycles
10000 times kmalloc(32) -> 86 cycles kfree -> 119 cycles
10000 times kmalloc(64) -> 122 cycles kfree -> 125 cycles
10000 times kmalloc(128) -> 247 cycles kfree -> 132 cycles
10000 times kmalloc(256) -> 375 cycles kfree -> 137 cycles
10000 times kmalloc(512) -> 283 cycles kfree -> 183 cycles
10000 times kmalloc(1024) -> 316 cycles kfree -> 504 cycles
10000 times kmalloc(2048) -> 516 cycles kfree -> 531 cycles
10000 times kmalloc(4096) -> 610 cycles kfree -> 570 cycles
10000 times kmalloc(8192) -> 759 cycles kfree -> 651 cycles
10000 times kmalloc(16384) -> 1169 cycles kfree -> 778 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slub_def.h |    5 -
 mm/slub.c                |  228 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 219 insertions(+), 14 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2010-12-02 14:53:23.000000000 -0600
+++ linux-2.6/include/linux/slub_def.h	2010-12-02 14:58:48.000000000 -0600
@@ -36,7 +36,10 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;		/* Pointer to next available object */
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;	/* Globally unique transaction id */
+#endif
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-02 14:53:23.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-02 15:36:55.000000000 -0600
@@ -805,14 +805,24 @@ static inline void slab_post_alloc_hook(
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
 {
 	kmemleak_free_recursive(x, s->flags);
-}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
-	kmemcheck_slab_free(s, object, s->objsize);
-	debug_check_no_locks_freed(object, s->objsize);
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(object, s->objsize);
+	/*
+	 * Trouble is that we no longer disable interrupts in the fast path.
+	 * So in order to satisfy the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->objsize);
+		debug_check_no_locks_freed(x, s->objsize);
+		if (!(s->flags & SLAB_DEBUG_OBJECTS))
+			debug_check_no_obj_freed(x, s->objsize);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -1099,9 +1109,6 @@ static inline void slab_post_alloc_hook(
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s,
-		void *object) {}
-
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
@@ -1485,6 +1492,77 @@ static void unfreeze_slab(struct kmem_ca
 	}
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction id for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef CONFIG_DEBUG_VM
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		printk("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		printk("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1516,6 +1594,9 @@ static void deactivate_slab(struct kmem_
 		page->inuse--;
 	}
 	c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+#endif
 	unfreeze_slab(s, page, tail);
 }
 
@@ -1650,6 +1731,19 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1676,6 +1770,10 @@ load_freelist:
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+	local_irq_restore(flags);
+#endif
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1737,23 +1835,73 @@ static __always_inline void *slab_alloc(
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+redo:
+#endif
+
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus it can be guaranteed that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+#endif
+
 	object = c->freelist;
 	if (unlikely(!object || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+		/*
+		 * The cmpxchg will only match if there was no additional
+		 * operation and if we are on the right processor.
+		 *
+		 * The cmpxchg does the following atomically (without lock semantics!)
+		 * 1. Relocate first pointer to the current per cpu area.
+		 * 2. Verify that tid and freelist have not been changed
+		 * 3. If they were not changed replace tid and freelist
+		 *
+		 * Since this is without lock semantics the protection is only against
+		 * code executing on this cpu, *not* against access by other cpus.
+		 */
+		if (unlikely(!irqsafe_cpu_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_alloc", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = get_freepointer(s, object);
+#endif
 		stat(s, ALLOC_FASTPATH);
 	}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1817,9 +1965,13 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
 
-	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
+	stat(s, FREE_SLOWPATH);
 
 	if (kmem_cache_debug(s))
 		goto debug;
@@ -1849,6 +2001,9 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1860,6 +2015,9 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1886,23 +2044,53 @@ static __always_inline void slab_free(st
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+	/*
+	 * Determine the current cpu's per cpu slab.
+	 * The cpu may change afterward. However that does not matter since
+	 * data is retrieved via this pointer. If we are on the same cpu
+	 * during the cmpxchg then the free will succeed.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
 
-	slab_free_hook_irq(s, x);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	tid = c->tid;
+	barrier();
+#endif
 
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+		if (unlikely(!irqsafe_cpu_cmpxchg_double(&s->cpu_slab->freelist,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_free", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = object;
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2105,9 +2293,23 @@ static inline int alloc_kmem_cache_cpus(
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * Must align to double word boundary for the long cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+	/* Regular alignment is sufficient */
 	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+	if (!s->cpu_slab)
+		return 0;
+
+	init_kmem_cache_cpus(s);
 
-	return s->cpu_slab != NULL;
+	return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;



* [rfc: cpuops adv V1 7/8] slub: Add PageSlubPartial
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (5 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 6/8] Lockless (and preemptless) fastpaths for slub Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-02 21:53 ` [rfc: cpuops adv V1 8/8] slub: [RFC] Partially lockless freepath slowpath Christoph Lameter
  2010-12-04 19:29 ` [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Pekka Enberg
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: slub_flag_partial --]
[-- Type: text/plain, Size: 2630 bytes --]

The condition for a page being on the partial list is established by a
combination of values in the page struct (inuse > 0 && freelist != NULL).

With the lockless updates that combination may become temporarily incoherent.
Use an explicit flag to signal that a page is on a partial list and allow
multiple add and remove attempts on the partial list.

Signed-off-by: Christoph Lameter <cl@linux.com>


---
 include/linux/page-flags.h |    2 ++
 mm/slub.c                  |   25 +++++++++++++++----------
 2 files changed, 17 insertions(+), 10 deletions(-)

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h	2010-12-02 14:53:16.000000000 -0600
+++ linux-2.6/include/linux/page-flags.h	2010-12-02 15:01:35.000000000 -0600
@@ -128,6 +128,7 @@ enum pageflags {
 
 	/* SLUB */
 	PG_slub_frozen = PG_active,
+	PG_slub_partial = PG_error,
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -214,6 +215,7 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEAR
 __PAGEFLAG(SlobFree, slob_free)
 
 __PAGEFLAG(SlubFrozen, slub_frozen)
+__PAGEFLAG(SlubPartial, slub_partial) TESTSCFLAG(SlubPartial, slub_partial)
 
 /*
  * Private page markings that may be used by the filesystem that owns the page
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-02 14:58:48.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-02 15:29:50.000000000 -0600
@@ -1308,13 +1308,15 @@ static __always_inline int slab_trylock(
 static void add_partial(struct kmem_cache_node *n,
 				struct page *page, int tail)
 {
-	spin_lock(&n->list_lock);
-	n->nr_partial++;
-	if (tail)
-		list_add_tail(&page->lru, &n->partial);
-	else
-		list_add(&page->lru, &n->partial);
-	spin_unlock(&n->list_lock);
+	if (!TestSetPageSlubPartial(page)) {
+		spin_lock(&n->list_lock);
+		n->nr_partial++;
+		if (tail)
+			list_add_tail(&page->lru, &n->partial);
+		else
+			list_add(&page->lru, &n->partial);
+		spin_unlock(&n->list_lock);
+	}
 }
 
 static inline void __remove_partial(struct kmem_cache_node *n,
@@ -1322,15 +1324,18 @@ static inline void __remove_partial(stru
 {
 	list_del(&page->lru);
 	n->nr_partial--;
+	__ClearPageSlubPartial(page);
 }
 
 static void remove_partial(struct kmem_cache *s, struct page *page)
 {
 	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
 
-	spin_lock(&n->list_lock);
-	__remove_partial(n, page);
-	spin_unlock(&n->list_lock);
+	if (TestClearPageSlubPartial(page)) {
+		spin_lock(&n->list_lock);
+		__remove_partial(n, page);
+		spin_unlock(&n->list_lock);
+	}
 }
 
 /*



* [rfc: cpuops adv V1 8/8] slub: [RFC] Partially lockless freepath slowpath
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (6 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 7/8] slub: Add PageSlubPartial Christoph Lameter
@ 2010-12-02 21:53 ` Christoph Lameter
  2010-12-04 19:29 ` [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Pekka Enberg
  8 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 21:53 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: slub_lockless_slowpath --]
[-- Type: text/plain, Size: 10168 bytes --]

The slub slow freepath is frequently invoked since fast frees are only
possible for objects from the current slab page. Optimization of the
slowpath is therefore necessary to increase freeing performance.

This patch supplies a partially lockless slowpath. It addresses the
performance issues related to cycle count in the slow path but not issues
that may arise because of cache hotness (which is tracked differently in SLAB).

In the fastpaths we use a cmpxchg_local with segment prefix to perform
freelist insertion. We can provide a similar approach for the slowpath but
there we must use a regular cmpxchg with lock prefix since frees to a page
may occur from multiple processors.

The cmpxchg only updates the freelist in the page struct. We also maintain
an object counter (inuse) in the page structure. That counter is decremented
in a racy way. This means that we may miss a decrement and the counter may
be higher than the actual number of used objects in the slab. The counter
is not used to determine whether the page is full, though. Thus the
page will cycle via the partial list back to slab_alloc. The counter is then
fixed up during allocation processing, because allocation takes over the
whole freelist as the per cpu allocation list.
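
The fixup on the allocation side is just a reset of the counter to the
maximum before the whole freelist is taken over (this is the __slab_alloc
hunk in the diff below):

	/* Take the entire freelist; earlier racy inuse decrements no longer matter */
	c->page->inuse = c->page->objects;
	object = xchg(&c->page->freelist, NULL);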

Serialization via slab_lock() is still performed for any situation in which
the freelist needs to be shrunk. Thus holding the slab_lock prevents the fastpath
from zapping the freelist. This can be used to guarantee that no new objects are
allocated from a slab during free.

Results show that the slowpath performance is improved by around 40% to 100%.

10000 Allocations then 10000 frees test

Before (no lockless patches):

10000 times kmalloc(8) -> 207 cycles kfree -> 156 cycles
10000 times kmalloc(16) -> 208 cycles kfree -> 158 cycles
10000 times kmalloc(32) -> 257 cycles kfree -> 159 cycles
10000 times kmalloc(64) -> 383 cycles kfree -> 169 cycles
10000 times kmalloc(128) -> 375 cycles kfree -> 170 cycles
10000 times kmalloc(256) -> 869 cycles kfree -> 187 cycles
10000 times kmalloc(512) -> 1129 cycles kfree -> 307 cycles
10000 times kmalloc(1024) -> 2087 cycles kfree -> 554 cycles
10000 times kmalloc(2048) -> 3912 cycles kfree -> 588 cycles
10000 times kmalloc(4096) -> 7584 cycles kfree -> 664 cycles
10000 times kmalloc(8192) -> 7927 cycles kfree -> 903 cycles
10000 times kmalloc(16384) -> 8625 cycles kfree -> 1308 cycles


After (ll fastpath and slowpath):

10000 times kmalloc(8) -> 125 cycles kfree -> 95 cycles
10000 times kmalloc(16) -> 81 cycles kfree -> 109 cycles
10000 times kmalloc(32) -> 114 cycles kfree -> 101 cycles
10000 times kmalloc(64) -> 193 cycles kfree -> 110 cycles
10000 times kmalloc(128) -> 323 cycles kfree -> 124 cycles
10000 times kmalloc(256) -> 808 cycles kfree -> 141 cycles
10000 times kmalloc(512) -> 1051 cycles kfree -> 264 cycles
10000 times kmalloc(1024) -> 2026 cycles kfree -> 523 cycles
10000 times kmalloc(2048) -> 3970 cycles kfree -> 581 cycles
10000 times kmalloc(4096) -> 7677 cycles kfree -> 683 cycles
10000 times kmalloc(8192) -> 8022 cycles kfree -> 946 cycles
10000 times kmalloc(16384) -> 8641 cycles kfree -> 1286 cycles

10000 (alloc + free) test

Before:

10000 times kmalloc(8)/kfree -> 180 cycles
10000 times kmalloc(16)/kfree -> 180 cycles
10000 times kmalloc(32)/kfree -> 187 cycles
10000 times kmalloc(64)/kfree -> 186 cycles
10000 times kmalloc(128)/kfree -> 190 cycles
10000 times kmalloc(256)/kfree -> 188 cycles
10000 times kmalloc(512)/kfree -> 197 cycles
10000 times kmalloc(1024)/kfree -> 189 cycles
10000 times kmalloc(2048)/kfree -> 190 cycles
10000 times kmalloc(4096)/kfree -> 190 cycles
10000 times kmalloc(8192)/kfree -> 192 cycles
10000 times kmalloc(16384)/kfree -> 758 cycles

After:

10000 times kmalloc(8)/kfree -> 72 cycles
10000 times kmalloc(16)/kfree -> 83 cycles
10000 times kmalloc(32)/kfree -> 72 cycles
10000 times kmalloc(64)/kfree -> 72 cycles
10000 times kmalloc(128)/kfree -> 83 cycles
10000 times kmalloc(256)/kfree -> 93 cycles
10000 times kmalloc(512)/kfree -> 77 cycles
10000 times kmalloc(1024)/kfree -> 76 cycles
10000 times kmalloc(2048)/kfree -> 87 cycles
10000 times kmalloc(4096)/kfree -> 75 cycles
10000 times kmalloc(8192)/kfree -> 77 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

Concurrent alloc/free on all cpus:

Before:

Kmalloc N*(alloc free)(8): 0=176 1=177 2=176 3=176 4=184 5=176 6=176 7=176 Average=177
Kmalloc N*(alloc free)(16): 0=176 1=176 2=176 3=176 4=176 5=182 6=176 7=182 Average=177
Kmalloc N*(alloc free)(32): 0=178 1=178 2=177 3=178 4=177 5=182 6=178 7=184 Average=179
Kmalloc N*(alloc free)(64): 0=176 1=176 2=176 3=176 4=176 5=182 6=176 7=182 Average=177
Kmalloc N*(alloc free)(128): 0=176 1=178 2=176 3=176 4=176 5=176 6=176 7=182 Average=177
Kmalloc N*(alloc free)(256): 0=176 1=178 2=178 3=178 4=176 5=184 6=178 7=178 Average=178
Kmalloc N*(alloc free)(512): 0=178 1=178 2=178 3=178 4=178 5=182 6=178 7=184 Average=179
Kmalloc N*(alloc free)(1024): 0=178 1=178 2=178 3=188 4=178 5=178 6=178 7=184 Average=180
Kmalloc N*(alloc free)(2048): 0=400 1=177 2=178 3=176 4=282 5=185 6=233 7=237 Average=233
Kmalloc N*(alloc free)(4096): 0=178 1=178 2=178 3=178 4=178 5=184 6=178 7=183 Average=179

After:

Kmalloc N*(alloc free)(8): 0=73 1=73 2=73 3=71 4=71 5=71 6=71 7=75 Average=72
Kmalloc N*(alloc free)(16): 0=74 1=71 2=71 3=72 4=71 5=73 6=71 7=73 Average=72
Kmalloc N*(alloc free)(32): 0=73 1=71 2=71 3=71 4=71 5=71 6=72 7=71 Average=71
Kmalloc N*(alloc free)(64): 0=71 1=74 2=71 3=71 4=73 5=73 6=71 7=71 Average=72
Kmalloc N*(alloc free)(128): 0=71 1=71 2=81 3=73 4=71 5=71 6=75 7=75 Average=73
Kmalloc N*(alloc free)(256): 0=72 1=76 2=76 3=72 4=76 5=76 6=76 7=76 Average=75
Kmalloc N*(alloc free)(512): 0=76 1=76 2=76 3=76 4=72 5=72 6=76 7=76 Average=75
Kmalloc N*(alloc free)(1024): 0=76 1=76 2=76 3=76 4=77 5=76 6=168 7=77 Average=88
Kmalloc N*(alloc free)(2048): 0=81 1=81 2=81 3=81 4=77 5=77 6=72 7=76 Average=78
Kmalloc N*(alloc free)(4096): 0=99 1=76 2=76 3=76 4=77 5=94 6=72 7=76 Average=81


WARNING: The patch is not mature yet. There are unresolved issues around
freelist traversal and fallback for arches not supporting cmpxchg etc.

The resulting kernel has so far survived initial testing in kvm with the
in-kernel memory allocator benchmarks and hackbench from user space.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/slub.c |   65 +++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-02 15:36:59.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-02 15:37:24.000000000 -0600
@@ -1579,6 +1579,10 @@ static void deactivate_slab(struct kmem_
 
 	if (page->freelist)
 		stat(s, DEACTIVATE_REMOTE_FREES);
+	else
+		/* Fix up results of any racy updates */
+		page->inuse = page->objects;
+
 	/*
 	 * Merge cpu freelist into slab freelist. Typically we get here
 	 * because both freelists are empty. So this is unlikely
@@ -1586,6 +1590,7 @@ static void deactivate_slab(struct kmem_
 	 */
 	while (unlikely(c->freelist)) {
 		void **object;
+		void *prior;
 
 		tail = 0;	/* Hot objects. Put the slab first */
 
@@ -1594,8 +1599,11 @@ static void deactivate_slab(struct kmem_
 		c->freelist = get_freepointer(s, c->freelist);
 
 		/* And put onto the regular freelist */
-		set_freepointer(s, object, page->freelist);
-		page->freelist = object;
+redo:
+		prior = page->freelist;
+		set_freepointer(s, object, prior);
+		if (cmpxchg(&page->freelist, prior, object) != prior)
+			goto redo;
 		page->inuse--;
 	}
 	c->page = NULL;
@@ -1763,15 +1771,14 @@ static void *__slab_alloc(struct kmem_ca
 	stat(s, ALLOC_REFILL);
 
 load_freelist:
-	object = c->page->freelist;
+	c->page->inuse = c->page->objects;
+	object = xchg(&c->page->freelist, NULL);
 	if (unlikely(!object))
 		goto another_slab;
 	if (kmem_cache_debug(s))
 		goto debug;
 
 	c->freelist = get_freepointer(s, object);
-	c->page->inuse = c->page->objects;
-	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
@@ -1970,40 +1977,48 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
-#ifdef CONFIG_CMPXCHG_LOCAL
 	unsigned long flags;
 
-	local_irq_save(flags);
-#endif
-	slab_lock(page);
 	stat(s, FREE_SLOWPATH);
-
 	if (kmem_cache_debug(s))
 		goto debug;
 
 checks_ok:
 	prior = page->freelist;
 	set_freepointer(s, object, prior);
-	page->freelist = object;
-	page->inuse--;
+	if (cmpxchg(&page->freelist, prior, object) != prior)
+		goto checks_ok;
 
-	if (unlikely(PageSlubFrozen(page))) {
+	/* Racy update */
+	if (unlikely(PageSlubFrozen(page) || (--page->inuse && prior))) {
 		stat(s, FREE_FROZEN);
-		goto out_unlock;
+		return;
 	}
 
-	if (unlikely(!page->inuse))
-		goto slab_empty;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_save(flags);
+#endif
+	slab_lock(page);	/* Locking prevents reduction of free list */
+
+	if (PageSlubFrozen(page))	/* If page has been exempted by now yield */
+		goto out_unlock;
+
+	/*
+	 * Still objects in use but those may be gone at any point now since
+	 * we are not locking out the freepath.
+	 */
 
 	/*
 	 * Objects left in the slab. If it was not on the partial list before
 	 * then add it.
 	 */
-	if (unlikely(!prior)) {
-		add_partial(get_node(s, page_to_nid(page)), page, 1);
-		stat(s, FREE_ADD_PARTIAL);
-	}
+	add_partial(get_node(s, page_to_nid(page)), page, 1);
 
+	if (!page->inuse)
+		/* They are indeed gone and we need to remove the page from the partial list again */
+		goto slab_empty;
+
+	/* Objects left and slab on the partial list */
 out_unlock:
 	slab_unlock(page);
 #ifdef CONFIG_CMPXCHG_LOCAL
@@ -2012,13 +2027,7 @@ out_unlock:
 	return;
 
 slab_empty:
-	if (prior) {
-		/*
-		 * Slab still on the partial list.
-		 */
-		remove_partial(s, page);
-		stat(s, FREE_REMOVE_PARTIAL);
-	}
+	remove_partial(s, page);
 	slab_unlock(page);
 #ifdef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
@@ -2029,7 +2038,7 @@ slab_empty:
 
 debug:
 	if (!free_debug_processing(s, page, x, addr))
-		goto out_unlock;
+		return;
 	goto checks_ok;
 }
 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg
  2010-12-02 21:53 ` [rfc: cpuops adv V1 2/8] --- include/linux/percpu.h | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) Christoph Lameter
@ 2010-12-02 22:06   ` Christoph Lameter
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-02 22:06 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

Sorry this should have been:

Subject: this_cpu_xchg/cmpxchg: Fall back to atomic xchg, cmpxchg

It may be better if we fall back to the full atomic operation instead of
simulating it through preempt/irq disabling etc.
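
For callers nothing changes; a typical cmpxchg based update loop (sketch
using a made-up per cpu counter) behaves the same whether the arch
provides the operation or the atomic fallback is used:

	/* demo_counter is hypothetical, for illustration only */
	DEFINE_PER_CPU(int, demo_counter);

	int old, new;

	do {
		old = __this_cpu_read(demo_counter);
		new = old + 1;
	} while (this_cpu_cmpxchg(demo_counter, old, new) != old);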

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |   31 +++----------------------------
 1 file changed, 3 insertions(+), 28 deletions(-)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-12-02 12:17:14.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-12-02 12:17:30.000000000 -0600
@@ -361,14 +361,7 @@ do {									\
 # define __this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval)
 #endif

-#define _this_cpu_generic_xchg(pcp, nval)				\
-({	typeof(pcp) ret__;						\
-	preempt_disable();						\
-	ret__ = __this_cpu_read(pcp);					\
-	__this_cpu_write(pcp, nval);					\
-	preempt_enable();						\
-	ret__;								\
-})
+#define _this_cpu_generic_xchg(pcp, nval)	xchg(__this_cpu_ptr(&(pcp)), nval)

 #ifndef this_cpu_xchg
 # ifndef this_cpu_xchg_1
@@ -386,15 +379,7 @@ do {									\
 # define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, (pcp), nval)
 #endif

-#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({	typeof(pcp) ret__;						\
-	preempt_disable();						\
-	ret__ = __this_cpu_read(pcp);					\
-	if (ret__ == (oval))						\
-		__this_cpu_write(pcp, nval);				\
-	preempt_enable();						\
-	ret__;								\
-})
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)	cmpxchg(__this_cpu_ptr(&(pcp)), oval, nval)

 #ifndef this_cpu_cmpxchg
 # ifndef this_cpu_cmpxchg_1
@@ -892,17 +877,7 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif

-#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
-({									\
-	typeof(pcp) ret__;						\
-	unsigned long flags;						\
-	local_irq_save(flags);						\
-	ret__ = __this_cpu_read(pcp);					\
-	if (ret__ == (oval))						\
-		__this_cpu_write(pcp, nval);				\
-	local_irq_restore(flags);					\
-	ret__;								\
-})
+#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)	cmpxchg(__this_cpu_ptr(&(pcp)), oval, nval)

 #ifndef irqsafe_cpu_cmpxchg
 # ifndef irqsafe_cpu_cmpxchg_1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops
  2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
                   ` (7 preceding siblings ...)
  2010-12-02 21:53 ` [rfc: cpuops adv V1 8/8] slub: [RFC] Partially lockless freepath slowpath Christoph Lameter
@ 2010-12-04 19:29 ` Pekka Enberg
  2010-12-04 19:31   ` Tejun Heo
  2010-12-06 15:51   ` Christoph Lameter
  8 siblings, 2 replies; 17+ messages in thread
From: Pekka Enberg @ 2010-12-04 19:29 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

Hi,

[ Sorry for the delay everyone. ]

Christoph, do you mind sending me a series that you think is stable 
enough for linux-next? The numbers are pretty impressive and I'd love to 
give them some testing locally and in linux-next.

How should we coordinate the per-CPU ops patches, btw? We had pretty 
good experience with the per-cpu allocator patches where Tejun merged 
the patches in his tree and I pulled it in slab.git. Who handles the 
per-cpu ops?

             Pekka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops
  2010-12-04 19:29 ` [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Pekka Enberg
@ 2010-12-04 19:31   ` Tejun Heo
  2010-12-06 15:52     ` Christoph Lameter
  2010-12-06 15:51   ` Christoph Lameter
  1 sibling, 1 reply; 17+ messages in thread
From: Tejun Heo @ 2010-12-04 19:31 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, akpm, linux-kernel, Eric Dumazet, Mathieu Desnoyers

Hello,

On 12/04/2010 08:29 PM, Pekka Enberg wrote:
> [ Sorry for the delay everyone. ]
> 
> Christoph, do you mind sending me a series that you think is stable
> enough for linux-next? The numbers are pretty impressive and I'd
> love to give them some testing locally and in linux-next.
>
> How should we coordinate the per-CPU ops patches, btw? We had
> pretty good experience with the per-cpu allocator patches where
> Tejun merged the patches in his tree and I pulled it in
> slab.git. Who handles the per-cpu ops?

I'll be happy to apply the per-cpu ops part.  Christoph, would it be
okay to apply patches from this posting?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops
  2010-12-04 19:29 ` [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Pekka Enberg
  2010-12-04 19:31   ` Tejun Heo
@ 2010-12-06 15:51   ` Christoph Lameter
  1 sibling, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-06 15:51 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Sat, 4 Dec 2010, Pekka Enberg wrote:

> Christoph, do you mind sending me a series that you think is stable enough for
> linux-next? The numbers are pretty impressive and I'd love to give them some
> testing locally and in linux-next.

I'd like to see the cpu ops *updates* patchset get into -next and
then hopefully be merged for the next release. The cpuops advanced
patchset (this one) can go into -next *after* the next merge window.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops
  2010-12-04 19:31   ` Tejun Heo
@ 2010-12-06 15:52     ` Christoph Lameter
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-12-06 15:52 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pekka Enberg, akpm, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On Sat, 4 Dec 2010, Tejun Heo wrote:

> I'll be happy to apply the per-cpu ops part.  Christoph, would it be
> okay to apply patches from this posting?

No. Please focus merging efforts on the cpu ops updates patchset. There is
a reason that there is an "rfc" in the title of this patchset (cpu ops
advanced).



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-12-02 21:53 ` [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
@ 2010-12-06 17:14   ` Avi Kivity
  2010-12-06 17:35     ` Christoph Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2010-12-06 17:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> Provide support as far as the hardware capabilities of the x86 cpus
> allow.
>
>
>
> +/*
> + * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> + * full lock semantics even though they are not needed.
> + */

Perhaps we can use cmpxchg instead of xchg to avoid this? It costs one
more instruction but may be worth it.
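
At the C level that would be something like (just a sketch, the name is
made up; the x86 implementation would be the equivalent segment-prefixed
asm):

#define this_cpu_xchg_via_cmpxchg(pcp, nval)				\
({	typeof(pcp) old__;						\
	do {								\
		old__ = this_cpu_read(pcp);				\
	} while (this_cpu_cmpxchg(pcp, old__, nval) != old__);		\
	old__;								\
})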

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-12-06 17:14   ` Avi Kivity
@ 2010-12-06 17:35     ` Christoph Lameter
  2010-12-07  9:31       ` Avi Kivity
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-12-06 17:35 UTC (permalink / raw)
  To: Avi Kivity
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Mon, 6 Dec 2010, Avi Kivity wrote:

> On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> > Provide support as far as the hardware capabilities of the x86 cpus
> > allow.
> >
> >
> >
> > +/*
> > + * Beware: xchg on x86 has an implied lock prefix. There will be the cost
> > of
> > + * full lock semantics even though they are not needed.
> > + */
>
> Perhaps we can use cmpxchg instead of xchg to avoid this? costs one more
> instruction but may be worth it.

Hmmm... Maybe good since I also need an xchg_double. And xchg_double can
only be realized with cmpxchg16b. Using cmpxchg would make it consistent.



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-12-06 17:35     ` Christoph Lameter
@ 2010-12-07  9:31       ` Avi Kivity
  0 siblings, 0 replies; 17+ messages in thread
From: Avi Kivity @ 2010-12-07  9:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On 12/06/2010 07:35 PM, Christoph Lameter wrote:
> On Mon, 6 Dec 2010, Avi Kivity wrote:
>
> >  On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> >  >  Provide support as far as the hardware capabilities of the x86 cpus
> >  >  allow.
> >  >
> >  >
> >  >
> >  >  +/*
> >  >  + * Beware: xchg on x86 has an implied lock prefix. There will be the cost
> >  >  of
> >  >  + * full lock semantics even though they are not needed.
> >  >  + */
> >
> >  Perhaps we can use cmpxchg instead of xchg to avoid this? costs one more
> >  instruction but may be worth it.
>
> Hmmm... Maybe good since I also need a xchg_double. And xchg_double can
> only be realized with cmpxchg16b. Using cmpxchg would make it consistent.

I don't think we need to worry about consistency; this is an
implementation, not an interface.

We have three choices:

   xchg %1, %0

atomic, one instruction

   1: cmpxchg %2, %0
   jnz 1b

two non-atomic instructions, potential mispredicted jump, extra clobber 
(%1 == "=a")

   mov %0, %1
   1: cmpxchg %2, %0
   jnz 1b

three non-atomic instructions, no mispredict, extra clobber

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2010-12-07  9:31 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-02 21:53 [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 1/8] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 2/8] --- include/linux/percpu.h | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) Christoph Lameter
2010-12-02 22:06   ` [rfc: cpuops adv V1 2/8] Fallback to atomic xchg, cmpxchg Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 3/8] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
2010-12-06 17:14   ` Avi Kivity
2010-12-06 17:35     ` Christoph Lameter
2010-12-07  9:31       ` Avi Kivity
2010-12-02 21:53 ` [rfc: cpuops adv V1 4/8] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 5/8] vmstat: User per cpu atomics to avoid interrupt disable / enable Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 6/8] Lockless (and preemptless) fastpaths for slub Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 7/8] slub: Add PageSlubPartial Christoph Lameter
2010-12-02 21:53 ` [rfc: cpuops adv V1 8/8] slub: [RFC] Partially lockless freepath slowpath Christoph Lameter
2010-12-04 19:29 ` [rfc: cpuops adv V1 0/8] Cmpxchg and xchg support for cpu ops Pekka Enberg
2010-12-04 19:31   ` Tejun Heo
2010-12-06 15:52     ` Christoph Lameter
2010-12-06 15:51   ` Christoph Lameter
