linux-kernel.vger.kernel.org archive mirror
* [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support
@ 2010-12-14 17:48 Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
                   ` (3 more replies)
  0 siblings, 4 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

This patch series introduces this_cpu_cmpxchg_double().

x86 cpus support cmpxchg16b and cmpxchg8b, which can be used to perform a
cmpxchg on two adjacent words instead of only one. That allows more state to
be covered by a single atomic instruction.

this_cpu_cmpxchg_double() is then used in the slub allocator to avoid
interrupt disable/enable in both the alloc and free fastpaths.
this_cpu_cmpxchg_double() works nicely with the per cpu data of the
allocator. Doing so significantly speeds up the fastpaths.
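
As a rough illustration of the pattern (the struct, pointer and value names
below are only illustrative, the real slub changes are in patch 4/4): pairing
a pointer with a generation counter in two adjacent, double word aligned
words lets a single local cmpxchg detect whether anything happened in between:

	struct cache_cpu {
		void **freelist;	/* first word */
		unsigned long tid;	/* second word, bumped on every update */
	};
	struct cache_cpu __percpu *c;	/* allocated double word aligned */

	/* replace both words, but only if neither changed under us */
	if (this_cpu_cmpxchg_double(&c->freelist, old_list, old_tid,
				    new_list, old_tid + 1)) {
		/* fastpath done, no interrupt disable/enable needed */
	}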


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-14 17:48 [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support Christoph Lameter
@ 2010-12-14 17:48 ` Christoph Lameter
  2010-12-18 14:47   ` Tejun Heo
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_generic --]
[-- Type: text/plain, Size: 6708 bytes --]

Introduce this_cpu_cmpxchg_double(). It compares two consecutive words
against the given old values and replaces them with the new values if both
match.

	bool this_cpu_cmpxchg_double(xx __percpu *p, old_word1, old_word2, new_word1, new_word2)

this_cpu_cmpxchg_double() does not return the old value (difficult since there
are two words) but a boolean indicating whether the operation was successful.

Also, this_cpu_cmpxchg_double() takes a per cpu pointer rather than a variable
reference like the other this_cpu ops. This is because two words are involved.

The pointer passed must be double word aligned!
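
A minimal usage sketch (the struct, pointer and value names are just examples,
not part of the patch; old_a/old_b/new_a/new_b stand for whatever values the
caller read earlier), following how the slub patch later in this series
allocates its per cpu data:

	struct two_words {
		unsigned long a;
		unsigned long b;
	};

	/* allocation must be double word aligned */
	struct two_words __percpu *p =
		__alloc_percpu(sizeof(struct two_words), 2 * sizeof(unsigned long));

	/* replace both words, but only if both still hold the old values */
	if (this_cpu_cmpxchg_double(&p->a, old_a, old_b, new_a, new_b)) {
		/* a and b now hold new_a and new_b on this cpu */
	}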

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |  130 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-12-14 10:34:20.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-12-14 10:36:13.000000000 -0600
@@ -259,6 +259,27 @@ extern void __bad_size_call_parameter(vo
 	pscr2_ret__;							\
 })
 
+/*
+ * Special handling for cmpxchg_double. cmpxchg_double is passed a
+ * __percpu pointer and that pointer has to be aligned to a double
+ * word boundary
+ */
+#define __pcpu_double_call_return_int(stem, pcp, ...)			\
+({									\
+	int ret__;							\
+	__verify_pcpu_ptr(pcp);						\
+	VM_BUG_ON((unsigned long)(pcp) % (2 * sizeof(unsigned long)));	\
+	switch(sizeof(*pcp)) {						\
+	case 1: ret__ = stem##1(pcp, __VA_ARGS__);break;		\
+	case 2: ret__ = stem##2(pcp, __VA_ARGS__);break;		\
+	case 4: ret__ = stem##4(pcp, __VA_ARGS__);break;		\
+	case 8: ret__ = stem##8(pcp, __VA_ARGS__);break;		\
+	default:							\
+		__bad_size_call_parameter();break;			\
+	}								\
+	ret__;								\
+})
+
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -422,6 +443,82 @@ do {									\
 				__this_cpu_cmpxchg_, pcp, oval, nval)
 #endif
 
+/*
+ * cmpxchg_double replaces two adjacent scalars at once. The first parameter
+ * passed is a percpu pointer, not a scalar like the other this_cpu
+ * operations. This is so because the function operates on two scalars
+ * (must be of same size). A truth value is returned to indicate success or
+ * failure (since a double register result is difficult to handle).
+ * There is very limited hardware support for these operations. So only certain
+ * sizes may work.
+ */
+#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	typeof(oval2) * __percpu pcp2 = (typeof(oval2) *)((pcp) + 1);	\
+	int __ret = 0;							\
+	if (__this_cpu_read(*pcp) == (oval1) &&				\
+			 __this_cpu_read(*pcp2)  == (oval2)) {		\
+		__this_cpu_write(*pcp, (nval1));			\
+		__this_cpu_write(*pcp2, (nval2));			\
+		__ret = 1;						\
+	}								\
+	(__ret);							\
+})
+
+#ifndef __this_cpu_cmpxchg_double
+# ifndef __this_cpu_cmpxchg_double_1
+#  define __this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_2
+#  define __this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_4
+#  define __this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_8
+#  define __this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define __this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(__this_cpu_cmpxchg_double_, (pcp),	\
+					 oval1, oval2, nval1, nval2)
+#endif
+
+#define _this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	preempt_disable();						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg_double
+# ifndef this_cpu_cmpxchg_double_1
+#  define this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_2
+#  define this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_4
+#  define this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_8
+#  define this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(this_cpu_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -825,4 +922,37 @@ do {									\
 # define irqsafe_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
 #endif
 
+#define irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg_double
+# ifndef irqsafe_cpu_cmpxchg_double_1
+#  define irqsafe_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_2
+#  define irqsafe_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_4
+#  define irqsafe_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_8
+#  define irqsafe_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define irqsafe_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
 #endif /* __LINUX_PERCPU_H */


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-14 17:48 [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
@ 2010-12-14 17:48 ` Christoph Lameter
  2010-12-15  0:46   ` H. Peter Anvin
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  3 siblings, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_x86 --]
[-- Type: text/plain, Size: 4925 bytes --]

Support this_cpu_cmpxchg_double using the cmpxchg16b and cmpxchg8b instructions.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   47 ++++++++++++++++++++++++++++++++++
 arch/x86/lib/Makefile         |    1 
 arch/x86/lib/cmpxchg16b_emu.S |   58 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-14 10:31:33.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-14 10:39:04.000000000 -0600
@@ -451,6 +451,26 @@ do {									\
 #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #endif /* !CONFIG_M386 */
 
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy = n2;						\
+	asm("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
+		    : "=a"(__ret), "=m" (*pcp), "=d"(__dummy)		\
+		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -486,6 +506,33 @@ do {									\
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
+/*
+ * Pretty complex macro to generate cmpxchg16 instruction. The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software. The address used in the cmpxchg16 instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy;						\
+	alternative_io("call cmpxchg16b_cpuops_emu\n\t" P6_NOP4,		\
+			"cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t",	\
+			X86_FEATURE_CX16,				\
+		    	ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),	\
+		        "S" (pcp), "b"(__n1), "c"(__n2),		\
+			 "a"(__o1), "d"(__o2));				\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/lib/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile	2010-12-10 12:16:27.000000000 -0600
+++ linux-2.6/arch/x86/lib/Makefile	2010-12-14 10:36:50.000000000 -0600
@@ -42,4 +42,5 @@ else
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 	lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+	lib-y += cmpxchg16b_emu.o
 endif
Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2010-12-14 10:36:50.000000000 -0600
@@ -0,0 +1,58 @@
+/*
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; version 2
+ *	of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(cmpxchg16b_cpuops_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
+# al not via the ZF. Caller will access al to get result.
+#
+cmpxchg16b_cpuops_emu:
+	pushf
+	cli
+
+	cmpq  %gs:(%rsi), %rax
+	jne not_same
+	cmpq %gs:8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx,  %gs:(%rsi)
+	movq %rcx, %gs:8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+ not_same:
+	popf
+	xor  %al,%al
+	ret
+
+CFI_ENDPROC
+
+ENDPROC(cmpxchg16b_cpuops)
+
+


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [cpuops cmpxchg double V1 3/4] slub: Get rid of slab_free_hook_irq()
  2010-12-14 17:48 [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
@ 2010-12-14 17:48 ` Christoph Lameter
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  3 siblings, 0 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: slub_remove_irq_freehook --]
[-- Type: text/plain, Size: 2269 bytes --]

The following patch will make the fastpaths lockless and will no longer
require interrupts to be disabled. Calling the free hook with irq disabled
will no longer be possible.

Move the slab_free_hook_irq() logic into slab_free_hook(). Only disable
interrupts if features are selected that require the callbacks to run with
interrupts off, and re-enable them after the calls have been made.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/slub.c |   29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-14 10:45:11.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-14 10:46:40.000000000 -0600
@@ -805,14 +805,24 @@ static inline void slab_post_alloc_hook(
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
 {
 	kmemleak_free_recursive(x, s->flags);
-}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
-	kmemcheck_slab_free(s, object, s->objsize);
-	debug_check_no_locks_freed(object, s->objsize);
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(object, s->objsize);
+	/*
+	 * Trouble is that we may no longer disable interrupts in the fast path.
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->objsize);
+		debug_check_no_locks_freed(x, s->objsize);
+		if (!(s->flags & SLAB_DEBUG_OBJECTS))
+			debug_check_no_obj_freed(x, s->objsize);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -1099,9 +1109,6 @@ static inline void slab_post_alloc_hook(
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s,
-		void *object) {}
-
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
@@ -1893,8 +1900,6 @@ static __always_inline void slab_free(st
 	local_irq_save(flags);
 	c = __this_cpu_ptr(s->cpu_slab);
 
-	slab_free_hook_irq(s, x);
-
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
 		c->freelist = object;


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub
  2010-12-14 17:48 [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support Christoph Lameter
                   ` (2 preceding siblings ...)
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
@ 2010-12-14 17:48 ` Christoph Lameter
  2010-12-15 16:51   ` Tejun Heo
  3 siblings, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_slub_fastpath --]
[-- Type: text/plain, Size: 12710 bytes --]

Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.

A transaction id is a "long" integer that is composed of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. This means that the cmpxchg instruction can verify for an
update that nothing interfered, that we are updating the percpu structure
of the processor where we picked up the information, and that we are still
on that processor when we update the information.

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels since this
is lockless and does not require the use of the current per cpu area
over the critical section. It is only important that the per cpu area is
current at the beginning of the critical section and at the end.

So there is no need even to disable preemption.
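
To make the transaction id scheme concrete (the numbers are only an example):
with CONFIG_NR_CPUS=64 the tid step is 64, so cpu 3 starts with tid 3. After
five operations on that cpu the tid is 5 * 64 + 3 = 323; tid % 64 recovers
the cpu (3) and tid / 64 the event count (5). A cmpxchg against a stale tid
therefore fails both if another operation ran on this cpu in the meantime and
if we were migrated to a different cpu.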

Test results show that the fastpath cycle count is reduced by up to ~ 40%
(alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree
adds a few cycles.

Sadly this does nothing for the slowpath, which is where the main performance
issues in slub are, but the best case performance rises significantly.
(For that see the more complex slub patches that require cmpxchg_double.)

Kmalloc: alloc/free test

Before:

10000 times kmalloc(8)/kfree -> 142 cycles
10000 times kmalloc(16)/kfree -> 142 cycles
10000 times kmalloc(32)/kfree -> 142 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 142 cycles
10000 times kmalloc(256)/kfree -> 140 cycles
10000 times kmalloc(512)/kfree -> 140 cycles
10000 times kmalloc(1024)/kfree -> 144 cycles
10000 times kmalloc(2048)/kfree -> 144 cycles
10000 times kmalloc(4096)/kfree -> 144 cycles
10000 times kmalloc(8192)/kfree -> 144 cycles
10000 times kmalloc(16384)/kfree -> 913 cycles

After:

10000 times kmalloc(8)/kfree -> 81 cycles
10000 times kmalloc(16)/kfree -> 81 cycles
10000 times kmalloc(32)/kfree -> 81 cycles
10000 times kmalloc(64)/kfree -> 81 cycles
10000 times kmalloc(128)/kfree -> 81 cycles
10000 times kmalloc(256)/kfree -> 87 cycles
10000 times kmalloc(512)/kfree -> 87 cycles
10000 times kmalloc(1024)/kfree -> 87 cycles
10000 times kmalloc(2048)/kfree -> 84 cycles
10000 times kmalloc(4096)/kfree -> 81 cycles
10000 times kmalloc(8192)/kfree -> 81 cycles
10000 times kmalloc(16384)/kfree -> 927 cycles


Kmalloc: Repeatedly allocate then free test

Before:

10000 times kmalloc(8) -> 102 cycles kfree -> 111 cycles
10000 times kmalloc(16) -> 101 cycles kfree -> 111 cycles
10000 times kmalloc(32) -> 120 cycles kfree -> 114 cycles
10000 times kmalloc(64) -> 161 cycles kfree -> 130 cycles
10000 times kmalloc(128) -> 284 cycles kfree -> 129 cycles
10000 times kmalloc(256) -> 410 cycles kfree -> 134 cycles
10000 times kmalloc(512) -> 312 cycles kfree -> 197 cycles
10000 times kmalloc(1024) -> 377 cycles kfree -> 494 cycles
10000 times kmalloc(2048) -> 571 cycles kfree -> 522 cycles
10000 times kmalloc(4096) -> 674 cycles kfree -> 565 cycles
10000 times kmalloc(8192) -> 836 cycles kfree -> 648 cycles
10000 times kmalloc(16384) -> 1201 cycles kfree -> 775 cycles

After:

10000 times kmalloc(8) -> 69 cycles kfree -> 115 cycles
10000 times kmalloc(16) -> 73 cycles kfree -> 115 cycles
10000 times kmalloc(32) -> 86 cycles kfree -> 119 cycles
10000 times kmalloc(64) -> 122 cycles kfree -> 125 cycles
10000 times kmalloc(128) -> 247 cycles kfree -> 132 cycles
10000 times kmalloc(256) -> 375 cycles kfree -> 137 cycles
10000 times kmalloc(512) -> 283 cycles kfree -> 183 cycles
10000 times kmalloc(1024) -> 316 cycles kfree -> 504 cycles
10000 times kmalloc(2048) -> 516 cycles kfree -> 531 cycles
10000 times kmalloc(4096) -> 610 cycles kfree -> 570 cycles
10000 times kmalloc(8192) -> 759 cycles kfree -> 651 cycles
10000 times kmalloc(16384) -> 1169 cycles kfree -> 778 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slub_def.h |    5 -
 mm/slub.c                |  201 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 203 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2010-12-14 10:51:17.000000000 -0600
+++ linux-2.6/include/linux/slub_def.h	2010-12-14 10:51:27.000000000 -0600
@@ -36,7 +36,10 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;	/* Pointer to next available object */
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;	/* Globally unique transaction id */
+#endif
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-14 10:51:24.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-14 10:51:29.000000000 -0600
@@ -1492,6 +1492,77 @@ static void unfreeze_slab(struct kmem_ca
 	}
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction id for disambiguation
+ * during cmpxchg. The transaction ids start with the cpu number and are then
+ * incremented by TID_STEP (CONFIG_NR_CPUS rounded up to a power of two).
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef CONFIG_DEBUG_VM
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		printk("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		printk("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1523,6 +1594,9 @@ static void deactivate_slab(struct kmem_
 		page->inuse--;
 	}
 	c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+#endif
 	unfreeze_slab(s, page, tail);
 }
 
@@ -1657,6 +1731,19 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1683,6 +1770,10 @@ load_freelist:
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+	local_irq_restore(flags);
+#endif
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1744,23 +1835,73 @@ static __always_inline void *slab_alloc(
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+#endif
+
 	object = c->freelist;
 	if (unlikely(!object || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+		/*
+		 * The cmpxchg will only match if there was no additional
+		 * operation and if we are on the right processor.
+		 *
+		 * The cmpxchg does the following atomically (without lock semantics!)
+		 * 1. Relocate first pointer to the current per cpu area.
+		 * 2. Verify that tid and freelist have not been changed
+		 * 3. If they were not changed replace tid and freelist
+		 *
+		 * Since this is without lock semantics the protection is only against
+		 * code executing on this cpu *not* from access by other cpus.
+		 */
+		if (unlikely(!this_cpu_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_alloc", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = get_freepointer(s, object);
+#endif
 		stat(s, ALLOC_FASTPATH);
 	}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1824,9 +1965,13 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
 
-	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
+	stat(s, FREE_SLOWPATH);
 
 	if (kmem_cache_debug(s))
 		goto debug;
@@ -1856,6 +2001,9 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1867,6 +2015,9 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1893,21 +2044,53 @@ static __always_inline void slab_free(st
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+	/*
+	 * Determine the current cpu's per cpu slab.
+	 * The cpu may change afterward. However that does not matter since
+	 * data is retrieved via this pointer. If we are on the same cpu
+	 * during the cmpxchg then the free will succeed.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	tid = c->tid;
+	barrier();
+#endif
+
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+		if (unlikely(!this_cpu_cmpxchg_double(&s->cpu_slab->freelist,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_free", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = object;
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2110,9 +2293,23 @@ static inline int alloc_kmem_cache_cpus(
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * Must align to double word boundary for the double cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+	/* Regular alignment is sufficient */
 	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+	if (!s->cpu_slab)
+		return 0;
 
-	return s->cpu_slab != NULL;
+	init_kmem_cache_cpus(s);
+
+	return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
@ 2010-12-15  0:46   ` H. Peter Anvin
  2010-12-15  0:56     ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-15  0:46 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/14/2010 09:48 AM, Christoph Lameter wrote:
> +
> +/*
> + * Inputs:
> + * %rsi : memory location to compare
> + * %rax : low 64 bits of old value
> + * %rdx : high 64 bits of old value
> + * %rbx : low 64 bits of new value
> + * %rcx : high 64 bits of new value
> + * %al  : Operation successful
> + */
> +ENTRY(cmpxchg16b_cpuops_emu)
> +CFI_STARTPROC
> +
> +#
> +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
> +# al not via the ZF. Caller will access al to get result.
> +#
> +cmpxchg16b_cpuops_emu:
> +	pushf
> +	cli
> +
> +	cmpq  %gs:(%rsi), %rax
> +	jne not_same
> +	cmpq %gs:8(%rsi), %rdx
> +	jne not_same
> +
> +	movq %rbx,  %gs:(%rsi)
> +	movq %rcx, %gs:8(%rsi)
> +
> +	popf
> +	mov $1, %al
> +	ret
> +
> + not_same:
> +	popf
> +	xor  %al,%al
> +	ret
> +
> +CFI_ENDPROC
> +
> +ENDPROC(cmpxchg16b_cpuops)
> +

NAK on this.  This is acceptable for cmpxchg8b only because we don't
support SMP on 486s anymore.  x86-64 is another matter...
	
	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15  0:46   ` H. Peter Anvin
@ 2010-12-15  0:56     ` H. Peter Anvin
  2010-12-15 16:12       ` Christoph Lameter
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-15  0:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/14/2010 04:46 PM, H. Peter Anvin wrote:
> On 12/14/2010 09:48 AM, Christoph Lameter wrote:
>> +
>> +/*
>> + * Inputs:
>> + * %rsi : memory location to compare
>> + * %rax : low 64 bits of old value
>> + * %rdx : high 64 bits of old value
>> + * %rbx : low 64 bits of new value
>> + * %rcx : high 64 bits of new value
>> + * %al  : Operation successful
>> + */
>> +ENTRY(cmpxchg16b_cpuops_emu)
>> +CFI_STARTPROC
>> +
>> +#
>> +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
>> +# al not via the ZF. Caller will access al to get result.
>> +#
> 
> NAK on this.  This is acceptable for cmpxchg8b only because we don't
> support SMP on 486s anymore.  x86-64 is another matter...
> 	

Hm, this is meant to be a CPU local operation, isn't it... it isn't very
clear from the comments or naming *in this file*.

Could you make it a little clearer in the local code, please?

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15  0:56     ` H. Peter Anvin
@ 2010-12-15 16:12       ` Christoph Lameter
  2010-12-15 16:20         ` Christoph Lameter
  2010-12-15 16:32         ` H. Peter Anvin
  0 siblings, 2 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-15 16:12 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Tue, 14 Dec 2010, H. Peter Anvin wrote:

> > NAK on this.  This is acceptable for cmpxchg8b only because we don't
> > support SMP on 486s anymore.  x86-64 is another matter...
> >
>
> Hm, this is meant to be a CPU local operation, isn't it... it isn't very
> clear from the comments or naming *in this file*.

Ok. But the code is correct as far as I can tell. It's not a global cmpxchg
but local. Interrupt disable is ok.

> Could you make it a little clearer in the local code, please?

ok.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15 16:12       ` Christoph Lameter
@ 2010-12-15 16:20         ` Christoph Lameter
  2010-12-15 17:36           ` H. Peter Anvin
  2010-12-15 16:32         ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-15 16:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

Better comments:

Subject: Fixup comments for cmpxchg16b_cpuops_emu

Indicate that the semantics are not fully atomic but just per cpu atomic.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/lib/cmpxchg16b_emu.S |    4 ++++
 1 file changed, 4 insertions(+)

Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- linux-2.6.orig/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 10:18:18.000000000 -0600
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 10:19:11.000000000 -0600
@@ -30,6 +30,10 @@ CFI_STARTPROC
 # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
 # al not via the ZF. Caller will access al to get result.
 #
+# Note that this is only useful for a cpuops operation. Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of macros)
+#
 cmpxchg16b_cpuops_emu:
 	pushf
 	cli


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15 16:12       ` Christoph Lameter
  2010-12-15 16:20         ` Christoph Lameter
@ 2010-12-15 16:32         ` H. Peter Anvin
  2010-12-15 16:41           ` Christoph Lameter
  1 sibling, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-15 16:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/15/2010 08:12 AM, Christoph Lameter wrote:
> On Tue, 14 Dec 2010, H. Peter Anvin wrote:
> 
>>> NAK on this.  This is acceptable for cmpxchg8b only because we don't
>>> support SMP on 486s anymore.  x86-64 is another matter...
>>>
>>
>> Hm, this is meant to be a CPU local operation, isn't it... it isn't very
>> clear from the comments or naming *in this file*.
> 
> Ok. But the code is correct as far as I can tell. It's not a global cmpxchg
> but local. Interrupt disable is ok.
> 
>> Could you make it a little clearer in the local code, please?
> 
> ok.
> 

Yes, that's what I meant... the code is correct, it's just not obvious
to someone who looks at the code that it is correct, because the
intended function of the code isn't obvious from that file by itself (in
fact, it looks almost exactly like the 486 cmpxchg ersatz code.)

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15 16:32         ` H. Peter Anvin
@ 2010-12-15 16:41           ` Christoph Lameter
  0 siblings, 0 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-15 16:41 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Wed, 15 Dec 2010, H. Peter Anvin wrote:

> Yes, that's what I meant... the code is correct, it's just not obvious
> to someone who looks at the code that it is correct, because the
> intended function of the code isn't obvious from that file by itself (in
> fact, it looks almost exactly like the 486 cmpxchg ersatz code.)

Confirmed. It is a copy of that 486 ersatz code ... But it's ok for the use
case here.



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
@ 2010-12-15 16:51   ` Tejun Heo
  2010-12-15 16:55     ` Pekka Enberg
  0 siblings, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2010-12-15 16:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On 12/14/2010 06:48 PM, Christoph Lameter wrote:
> Use the this_cpu_cmpxchg_double functionality to implement a lockless
> allocation algorithm on arches that support fast this_cpu_ops.
> 
> Each of the per cpu pointers is paired with a transaction id that ensures
> that updates of the per cpu information can only occur in sequence on
> a certain cpu.
> 
> A transaction id is a "long" integer that is composed of an event number
> and the cpu number. The event number is incremented for every change to the
> per cpu state. This means that the cmpxchg instruction can verify for an
> update that nothing interfered, that we are updating the percpu structure
> of the processor where we picked up the information, and that we are still
> on that processor when we update the information.
> 
> This results in a significant decrease of the overhead in the fastpaths. It
> also makes it easy to adopt the fast path for realtime kernels since this
> is lockless and does not require the use of the current per cpu area
> over the critical section. It is only important that the per cpu area is
> current at the beginning of the critical section and at the end.
> 
> So there is no need even to disable preemption.
> 
> Test results show that the fastpath cycle count is reduced by up to ~ 40%
> (alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree
> adds a few cycles.
> 
> Sadly this does nothing for the slowpath, which is where the main performance
> issues in slub are, but the best case performance rises significantly.
> (For that see the more complex slub patches that require cmpxchg_double.)

The first two look good to me but I frankly don't have much idea about
the latter two.  Pekka, can you please ack those?  Alternatively, you
can later pull the percpu tree in and apply the allocator bits in your
tree, which I actually prefer.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub
  2010-12-15 16:51   ` Tejun Heo
@ 2010-12-15 16:55     ` Pekka Enberg
  2010-12-15 16:57       ` Tejun Heo
  0 siblings, 1 reply; 29+ messages in thread
From: Pekka Enberg @ 2010-12-15 16:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, H. Peter Anvin, Mathieu Desnoyers

Hi Tejun,

On Wed, Dec 15, 2010 at 6:51 PM, Tejun Heo <tj@kernel.org> wrote:
> The first two look good to me but I frankly don't have much idea about
> the latter two.  Pekka, can you please ack those?  Alternatively, you
> can later pull the percpu tree in and apply the allocator bits in your
> tree, which I actually prefer.

I'd prefer to pull the first two from your tree and put the allocator
changes to slab.git.

                        Pekka

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub
  2010-12-15 16:55     ` Pekka Enberg
@ 2010-12-15 16:57       ` Tejun Heo
  0 siblings, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2010-12-15 16:57 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, H. Peter Anvin, Mathieu Desnoyers

Hey,

On 12/15/2010 05:55 PM, Pekka Enberg wrote:
> On Wed, Dec 15, 2010 at 6:51 PM, Tejun Heo <tj@kernel.org> wrote:
>> The first two look good to me but I frankly don't have much idea about
>> the latter two.  Pekka, can you please ack those?  Alternatively, you
>> can later pull the percpu tree in and apply the allocator bits in your
>> tree, which I actually prefer.
> 
> I'd prefer to pull the first two from your tree and put the allocator
> changes to slab.git.

Great, then I'll let you know after the percpu tree is frozen with the
new changes.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15 16:20         ` Christoph Lameter
@ 2010-12-15 17:36           ` H. Peter Anvin
  2010-12-15 17:53             ` Christoph Lameter
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-15 17:36 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/15/2010 08:20 AM, Christoph Lameter wrote:
> Better comments:
> 
> Subject: Fixup comments for cmpxchg16b_cpuops_emu
> 
> Indicate that the semantics are not fully atomic but just per cpu atomic.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  arch/x86/lib/cmpxchg16b_emu.S |    4 ++++
>  1 file changed, 4 insertions(+)
> 
> Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
> ===================================================================
> --- linux-2.6.orig/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 10:18:18.000000000 -0600
> +++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 10:19:11.000000000 -0600
> @@ -30,6 +30,10 @@ CFI_STARTPROC
>  # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
>  # al not via the ZF. Caller will access al to get result.
>  #
> +# Note that this is only useful for a cpuops operation. Meaning that we
> +# do *not* have a fully atomic operation but just an operation that is
> +# *atomic* on a single cpu (as provided by the this_cpu_xx class of macros)
> +#
>  cmpxchg16b_cpuops_emu:
>  	pushf
>  	cli
> 

Could we rename it this_cpu_cmpxchg16b_emu or something like that?

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support
  2010-12-15 17:36           ` H. Peter Anvin
@ 2010-12-15 17:53             ` Christoph Lameter
  0 siblings, 0 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-15 17:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Wed, 15 Dec 2010, H. Peter Anvin wrote:

> >  cmpxchg16b_cpuops_emu:
> >  	pushf
> >  	cli
> >
>
> Could we rename it this_cpu_cmpxchg16b_emu or something like that?

Ok. New fixup patch:



Subject: x86, percpu: Rename cmpxchg16b_cpuops_emu to this_cpu_cmpxchg16b_emu

Rename the function to make clear what it is used for. Also add some comments
to explain the nature of the atomicness.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |    2 +-
 arch/x86/lib/cmpxchg16b_emu.S |   10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- linux-2.6.orig/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 10:57:21.000000000 -0600
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2010-12-15 11:47:50.000000000 -0600
@@ -23,14 +23,18 @@
  * %rcx : high 64 bits of new value
  * %al  : Operation successful
  */
-ENTRY(cmpxchg16b_cpuops_emu)
+ENTRY(this_cpu_cmpxchg16b_emu)
 CFI_STARTPROC

 #
 # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
 # al not via the ZF. Caller will access al to get result.
 #
-cmpxchg16b_cpuops_emu:
+# Note that this is only useful for a cpuops operation. Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of macros)
+#
+this_cpu_cmpxchg16b_emu:
 	pushf
 	cli

@@ -53,6 +57,6 @@ cmpxchg16b_cpuops_emu:

 CFI_ENDPROC

-ENDPROC(cmpxchg16b_cpuops)
+ENDPROC(this_cpu_cmpxchg16b_emu)


Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-15 11:48:09.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-15 11:48:26.000000000 -0600
@@ -520,7 +520,7 @@ do {									\
 	typeof(o2) __o2 = o2;						\
 	typeof(o2) __n2 = n2;						\
 	typeof(o2) __dummy;						\
-	alternative_io("call cmpxchg16b_cpuops_emu\n\t" P6_NOP4,		\
+	alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4,		\
 			"cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t",	\
 			X86_FEATURE_CX16,				\
 		    	ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),	\

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-14 17:48 ` [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
@ 2010-12-18 14:47   ` Tejun Heo
  2010-12-18 14:51     ` Tejun Heo
  2010-12-21 22:36     ` Christoph Lameter
  0 siblings, 2 replies; 29+ messages in thread
From: Tejun Heo @ 2010-12-18 14:47 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

Hello, Christoph.

On 12/14/2010 06:48 PM, Christoph Lameter wrote:
> +/*
> + * cmpxchg_double replaces two adjacent scalars at once. The first parameter
> + * passed is a percpu pointer, not a scalar like the other this_cpu
> + * operations. This is so because the function operates on two scalars
> + * (must be of same size). A truth value is returned to indicate success or
> + * failure (since a double register result is difficult to handle).
> + * There is very limited hardware support for these operations. So only certain
> + * sizes may work.
> + */
> +#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	

This is ugly. :-( I think we should have made this_cpu_*() ops take
pointers from the beginning.  Anyways, that's too late, so is it
completely impossible to make cmpxchg_double's take a scalar value?
It can take the pointer all the same, no?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-18 14:47   ` Tejun Heo
@ 2010-12-18 14:51     ` Tejun Heo
  2010-12-21 22:36     ` Christoph Lameter
  1 sibling, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2010-12-18 14:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On 12/18/2010 03:47 PM, Tejun Heo wrote:
> This is ugly. :-( I think we should have made this_cpu_*() ops take
> pointers from the beginning.  Anyways, that's too late, so is it
> completely impossible to make cmpxchg_double's take a scalar value?
> It can take the pointer all the same, no?

Also, the rationale behind taking a scalar value instead of a pointer was
that the operations change what they do depending on the type of the
parameter (the size of the parameter), which is true for the double
ops too, so I think it would be better if we stick with lvalues.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-18 14:47   ` Tejun Heo
  2010-12-18 14:51     ` Tejun Heo
@ 2010-12-21 22:36     ` Christoph Lameter
  2010-12-21 23:24       ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-21 22:36 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On Sat, 18 Dec 2010, Tejun Heo wrote:

> pointers from the beginning.  Anyways, that's too late, so is it
> completely impossible to make cmpxchg_double's take a scalar value?
> It can take the pointer all the same, no?

It could take a scalar value like the others but we are then not operating
on the scalar alone but also on the following field.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-21 22:36     ` Christoph Lameter
@ 2010-12-21 23:24       ` H. Peter Anvin
  2010-12-22  9:14         ` Tejun Heo
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-21 23:24 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/21/2010 02:36 PM, Christoph Lameter wrote:
> On Sat, 18 Dec 2010, Tejun Heo wrote:
> 
>> pointers from the beginning.  Anyways, that's too late, so is it
>> completely impossible to make cmpxchg_double's take a scalar value?
>> It can take the pointer all the same, no?
> 
> It could take a scalar value like the others but we are then not operating
> on the scalar alone but also on the following field.
> 

I'm a bit confused on this one.  The standard cmpxchg() takes a scalar
and a pointer, and returns a scalar.  The equivalent for the "double"
variety would be to return a compound object, basically:

struct double_ulong {
	unsigned long v[2];
};

... which can be returned in registers on both i386 and x86-64.

It's a bit clumsy from a type perspective, but I'm not sure that that is
a bad thing.  Doing too much type genericity has caused us problems in
the past.
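
Roughly, the signature would then look something like this (the naming is
purely illustrative):

	struct double_ulong this_cpu_cmpxchg_double(void __percpu *pcp,
						    struct double_ulong old,
						    struct double_ulong new);

and the caller would compare the returned pair against 'old' to detect
failure, just like cmpxchg() callers compare the returned scalar.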

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-21 23:24       ` H. Peter Anvin
@ 2010-12-22  9:14         ` Tejun Heo
  2010-12-24  0:16           ` Christoph Lameter
  0 siblings, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2010-12-22  9:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers

Hello,

On Tue, Dec 21, 2010 at 03:24:45PM -0800, H. Peter Anvin wrote:
> On 12/21/2010 02:36 PM, Christoph Lameter wrote:
> > On Sat, 18 Dec 2010, Tejun Heo wrote:
> > 
> >> pointers from the beginning.  Anyways, that's too late, so is it
> >> completely impossible to make cmpxchg_double's take a scalar value?
> >> It can take the pointer all the same, no?
> > 
> > It could take a scalar value like the others but we are then not operating
> > on the scalar alone but also on the following field.

Yes, it's weird but the operation itself is weird enough and named
accordingly, so to me it seems like a much lesser problem than
breaking interface consistency with other this_cpu_ ops.

> I'm a bit confused on this one.  The standard cmpxchg() takes a scalar
> and a pointer, and returns a scalar.  The equivalent for the "double"
> variety would be to return a compound object, basically:
> 
> struct double_ulong {
> 	unsigned long v[2];
> };
> 
> ... which can be returned in registers on both i386 and x86-64.
> 
> It's a bit clumsy from a type perspective, but I'm not sure that that is
> a bad thing.  Doing too much type genericity has caused us problems in
> the past.

Yeah, the above might be better too.  Is there any reason to use
cmpxchg_double on anything smaller?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-22  9:14         ` Tejun Heo
@ 2010-12-24  0:16           ` Christoph Lameter
  2010-12-24  0:22             ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-24  0:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Wed, 22 Dec 2010, Tejun Heo wrote:

> > I'm a bit confused on this one.  The standard cmpxchg() takes a scalar
> > and a pointer, and returns a scalar.  The equivalent for the "double"
> > variety would be to return a compound object, basically:
> >
> > struct double_ulong {
> > 	unsigned long v[2];
> > };
> >
> > ... which can be returned in registers on both i386 and x86-64.

Really? How would that work? I tried with uint128 but could not get the
compiler to do the right thing.

> > It's a bit clumsy from a type perspective, but I'm not sure that that is
> > a bad thing.  Doing too much type genericity has caused us problems in
> > the past.
>
> Yeah, the above might be better too.  Is there any reason to use
> cmpxchg_double on anything smaller?

Yes. You may want to use cmpxchg_double on 32 bit entities for backwards
compatibilities sake or any other smaller unit size. But those could also
be realized using this_cpu_cmpxchg_<double the size> by just aggregating
the amount.

If we can indeed pass 128 bit entities (as claimed by hpa) via registers
then the logical choice would be to do

	this_cpu_cmpxchg_16(pcp, old, new)

instead of cmpxchg_double. All parameters would have to be 128 bit.
Then we can avoid the strange cmpxchg_double semantics and can completely
avoid introducing those.





^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-24  0:16           ` Christoph Lameter
@ 2010-12-24  0:22             ` H. Peter Anvin
  2010-12-25  4:53               ` Christoph Lameter
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-24  0:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/23/2010 04:16 PM, Christoph Lameter wrote:
> On Wed, 22 Dec 2010, Tejun Heo wrote:
> 
>>> I'm a bit confused on this one.  The standard cmpxchg() takes a scalar
>>> and a pointer, and returns a scalar.  The equivalent for the "double"
>>> variety would be to return a compound object, basically:
>>>
>>> struct double_ulong {
>>> 	unsigned long v[2];
>>> };
>>>
>>> ... which can be returned in registers on both i386 and x86-64.
> 
> Really? How would that work? I tried with uint128 but could not get the
> compiler to do the right thing.
> 

There are two return registers, so a two-machine-word value can be
returned in registers.  [u]int128 is poorly implemented in a lot of gcc
versions, since it really hasn't been exercised.  However, two-word
structures should work.  I do not believe a two-word *array* works, though.
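
As a standalone illustration of that point (hypothetical helper, not from
the patches): a 16-byte struct of this kind comes back in rax:rdx on
x86-64, so returning the old pair by value needs no hidden return buffer.
The compare and store below is deliberately non-atomic; a real
implementation would use cmpxchg16b/cmpxchg8b or a lock.

	struct double_ulong {
		unsigned long v[2];
	};

	/*
	 * Non-atomic reference sketch of the interface shape being
	 * discussed: return the previous pair by value.
	 */
	static inline struct double_ulong
	cmpxchg_double_sketch(struct double_ulong *p,
			      struct double_ulong old, struct double_ulong new)
	{
		struct double_ulong prev = *p;

		if (prev.v[0] == old.v[0] && prev.v[1] == old.v[1])
			*p = new;
		return prev;
	}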

>>> It's a bit clumsy from a type perspective, but I'm not sure that that is
>>> a bad thing.  Doing too much type genericity has caused us problems in
>>> the past.
>>
>> Yeah, the above might be better too.  Is there any reason to use
>> cmpxchg_double on anything smaller?
> 
> Yes. You may want to use cmpxchg_double on 32 bit entities for backwards
> compatibility's sake, or on any other smaller unit size. But those cases
> could also be realized using this_cpu_cmpxchg_<double the size> by simply
> treating the pair as one double-width operand.
> 
> If we can indeed pass 128 bit entities (as claimed by hpa) via registers
> then the logical choice would be to do
> 
> 	this_cpu_cmpxchg_16(pcp, old, new)
> 
> instead of cmpxchg_double. All parameters would have to be 128 bit.
> Then we can avoid the strange cmpxchg_double semantics and avoid
> introducing them entirely.

I'm not sure it works with passing in a structure.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-24  0:22             ` H. Peter Anvin
@ 2010-12-25  4:53               ` Christoph Lameter
  2010-12-25  6:11                 ` H. Peter Anvin
  2010-12-25 16:52                 ` Tejun Heo
  0 siblings, 2 replies; 29+ messages in thread
From: Christoph Lameter @ 2010-12-25  4:53 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Thu, 23 Dec 2010, H. Peter Anvin wrote:

> There are two return registers, so a two-machine-word value can be
> returned in registers.  [u]int128 is poorly implemented in a lot of gcc
> versions, since it really hasn't been exercised.  However, two-word
> structures should work.  I do not believe a two-word *array* works, though.

Oh gosh. So we would be relying on a tight corner case of gcc that may
only work with certain gcc versions? Note that the current version only
returns a boolean. There is no need to return double words; I'd be happy
if we could *pass* double words.

> > If we can indeed pass 128 bit entities (as claimed by hpa) via registers
> > then the logical choice would be to do
> >
> > 	this_cpu_cmpxchg_16(pcp, old, new)
> >
> > instead of cmpxchg_double. All parameters would have to be 128 bit.
> > Then we can avoid the strange cmpxchg_double semantics and avoid
> > introducing them entirely.
>
> I'm not sure it works with passing in a structure.

I think then we'd better leave it as is. The use case so far is minimal, so
we could easily change it if we arrive at a better solution later.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-25  4:53               ` Christoph Lameter
@ 2010-12-25  6:11                 ` H. Peter Anvin
  2010-12-25 16:52                 ` Tejun Heo
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2010-12-25  6:11 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 12/24/2010 08:53 PM, Christoph Lameter wrote:
> On Thu, 23 Dec 2010, H. Peter Anvin wrote:
> 
>> There are two return registers, so a two-machine-word value can be
>> returned in registers.  [u]int128 is poorly implemented in a lot of gcc
>> versions, since it really hasn't been exercised.  However, two-word
>> structures should work.  I do not believe a two-word *array* works, though.
> 
> Oh gosh. So we would be relying on a tight corner case of gcc that may
> only work with certain gcc versions? Note that the current version only
> returns a boolean. There is no need to return double words; I'd be happy
> if we could *pass* double words.
> 


A structure is not a corner case; a uint128 would be.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-25  4:53               ` Christoph Lameter
  2010-12-25  6:11                 ` H. Peter Anvin
@ 2010-12-25 16:52                 ` Tejun Heo
  2010-12-25 23:55                   ` Christoph Lameter
  1 sibling, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2010-12-25 16:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Sat, Dec 25, 2010 at 5:53 AM, Christoph Lameter <cl@linux.com> wrote:
> Oh gosh. So we would be relying on a tight corner case of gcc that may
> only work with certain gcc versions? Note that the current version only
> returns a boolean. There is no need to return double words; I'd be happy
> if we could *pass* double words.

I wouldn't call it a corner case.  It's pretty clearly defined in the
ABI.  But that said, it might still be problematic when we try to apply
it to other architectures.  Is everyone against just taking a scalar for
the first variable instead of taking a pointer?  I'd be happier with that
than the current one.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-25 16:52                 ` Tejun Heo
@ 2010-12-25 23:55                   ` Christoph Lameter
  2010-12-27 10:52                     ` Tejun Heo
  0 siblings, 1 reply; 29+ messages in thread
From: Christoph Lameter @ 2010-12-25 23:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Sat, 25 Dec 2010, Tejun Heo wrote:

> ABI.  But that said, it might still be problematic when we try to apply
> it to other architectures.  Is everyone against just taking a scalar for
> the first variable instead of taking a pointer?  I'd be happier with that
> than the current one.

How about replacing that with two scalars? The macro will check that the
scalars are properly aligned and that the second follows the first. Then
there is also better symmetry in the parameters.

bool this_cpu_cmpxchg_double(
          percpu_1, percpu_2,
          old_1, old_2,
          new_1, new_2
)
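
To make the shape concrete, a sketch of how a fastpath caller might use
such an interface (the struct, field names and update rule are
hypothetical, loosely modelled on the slub conversion in patch 4/4):

	#include <linux/percpu.h>

	/*
	 * Hypothetical per-cpu state: the two members are adjacent and the
	 * pair starts on a double word boundary (cacheline alignment is
	 * more than enough).
	 */
	struct cache_cpu {
		void *freelist;		/* percpu_1 */
		unsigned long tid;	/* percpu_2, directly follows freelist */
	};

	static DEFINE_PER_CPU_ALIGNED(struct cache_cpu, cache_cpu);

	static bool try_replace(void *expected, void *replacement, unsigned long tid)
	{
		/*
		 * Both old values must still match for the new pair to be
		 * installed; a false return means another CPU or an
		 * interrupt slipped in and the caller should retry.
		 */
		return this_cpu_cmpxchg_double(cache_cpu.freelist, cache_cpu.tid,
					       expected, tid,
					       replacement, tid + 1);
	}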


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-25 23:55                   ` Christoph Lameter
@ 2010-12-27 10:52                     ` Tejun Heo
  2011-01-03 22:43                       ` Christoph Lameter
  0 siblings, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2010-12-27 10:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Sat, Dec 25, 2010 at 05:55:22PM -0600, Christoph Lameter wrote:
> On Sat, 25 Dec 2010, Tejun Heo wrote:
> 
> > ABI.  But that said, it might still be problematic when we try to apply
> > it to other architectures.  Is everyone against just taking a scalar for
> > the first variable instead of taking a pointer?  I'd be happier with that
> > than the current one.
> 
> How about replacing that with two scalars? The macro will check that the
> scalars are properly aligned and that the second follows the first. Then
> there is also better symmetry in the parameters.
> 
> bool this_cpu_cmpxchg_double(
>           percpu_1, percpu_2,
>           old_1, old_2,
>           new_1, new_2
> )

Yeah, maybe.  The interface is a bit redundant but probably the most
foolproof.  I'm okay with either taking a single scalar or both.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double
  2010-12-27 10:52                     ` Tejun Heo
@ 2011-01-03 22:43                       ` Christoph Lameter
  0 siblings, 0 replies; 29+ messages in thread
From: Christoph Lameter @ 2011-01-03 22:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Mon, 27 Dec 2010, Tejun Heo wrote:

> On Sat, Dec 25, 2010 at 05:55:22PM -0600, Christoph Lameter wrote:
> > On Sat, 25 Dec 2010, Tejun Heo wrote:
> >
> > > ABI.  But that said, it might still be problematic when we try to apply
> > > it to other architectures.  Is everyone against just taking a scalar for
> > > the first variable instead of taking a pointer?  I'd be happier with that
> > > than the current one.
> >
> > How about replacing that with two scalars? The macro will check that the
> > scalars are properly aligned and that the second follows the first. Then
> > there is also better symmetry in the parameters.
> >
> > bool this_cpu_cmpxchg_double(
> >           percpu_1, percpu_2,
> >           old_1, old_2,
> >           new_1, new_2
> > )
>
> Yeah, maybe.  The interface is a bit redundant but probably the most
> foolproof.  I'm okay with either taking a single scalar or both.

The single large 128 bit scalar does not work. Having to define an
additional structure is also a bit clumsy. I think it's best to get another
patchset out that also duplicates the first parameter and makes the percpu
variable specs conform to the other this_cpu ops.

Still on vacation, so it may take until the end of the week for me to get
this done.


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread

Thread overview: 29+ messages
2010-12-14 17:48 [cpuops cmpxchg double V1 0/4] this_cpu_cmpxchg_double support Christoph Lameter
2010-12-14 17:48 ` [cpuops cmpxchg double V1 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
2010-12-18 14:47   ` Tejun Heo
2010-12-18 14:51     ` Tejun Heo
2010-12-21 22:36     ` Christoph Lameter
2010-12-21 23:24       ` H. Peter Anvin
2010-12-22  9:14         ` Tejun Heo
2010-12-24  0:16           ` Christoph Lameter
2010-12-24  0:22             ` H. Peter Anvin
2010-12-25  4:53               ` Christoph Lameter
2010-12-25  6:11                 ` H. Peter Anvin
2010-12-25 16:52                 ` Tejun Heo
2010-12-25 23:55                   ` Christoph Lameter
2010-12-27 10:52                     ` Tejun Heo
2011-01-03 22:43                       ` Christoph Lameter
2010-12-14 17:48 ` [cpuops cmpxchg double V1 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
2010-12-15  0:46   ` H. Peter Anvin
2010-12-15  0:56     ` H. Peter Anvin
2010-12-15 16:12       ` Christoph Lameter
2010-12-15 16:20         ` Christoph Lameter
2010-12-15 17:36           ` H. Peter Anvin
2010-12-15 17:53             ` Christoph Lameter
2010-12-15 16:32         ` H. Peter Anvin
2010-12-15 16:41           ` Christoph Lameter
2010-12-14 17:48 ` [cpuops cmpxchg double V1 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
2010-12-14 17:48 ` [cpuops cmpxchg double V1 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
2010-12-15 16:51   ` Tejun Heo
2010-12-15 16:55     ` Pekka Enberg
2010-12-15 16:57       ` Tejun Heo
