All of lore.kernel.org
 help / color / mirror / Atom feed
* [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support
@ 2011-01-06 20:45 Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
                   ` (3 more replies)
  0 siblings, 4 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 20:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

This patch series introduces this_cpu_cmpxchg_double().

x86 cpus support the cmpxchg16b and cmpxchg8b instructions which are capable of
switching two words instead of one during a cmpxchg.
Two words allow to swap more state in an atomic instruction.

this_cpu_cmpxchg_double() is used in the slub allocator to avoid
interrupt disable/enable in both alloc and free fastpath.
this_cpu_cmpxchg_double() works nicely with the per cpu data of the
allocator. Using the new operation significantly speeds up the fastpaths.

V1->V2
	- Change parameter convention for this_cpu_cmpxchg_double. Specify both
	  percpu variables in same way as the two old and new values.
	- Do not require a per cpu pointer but a variable to conform to the
	  convention used in other this_cpu_ops.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-06 20:45 [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support Christoph Lameter
@ 2011-01-06 20:45 ` Christoph Lameter
  2011-01-06 21:08   ` Mathieu Desnoyers
  2011-01-06 22:05   ` H. Peter Anvin
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 20:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_generic --]
[-- Type: text/plain, Size: 6856 bytes --]

Introduce this_cpu_cmpxchg_double. this_cpu_cmpxchg_double() allows the
comparison between two consecutive words and replaces them if there is
a match.

	bool this_cpu_cmpxchg_double(pcp1, pcp2,
		old_word1, old_word2, new_word1, new_word2)

this_cpu_cmpxchg_double does not return the old value (difficult since
there are two words) but a boolean indicating if the operation was
successful.

The first percpu variable must be double word aligned!

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |  130 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2011-01-05 15:00:46.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2011-01-06 13:19:43.000000000 -0600
@@ -259,6 +259,29 @@ extern void __bad_size_call_parameter(vo
 	pscr2_ret__;							\
 })
 
+/*
+ * Special handling for cmpxchg_double. cmpxchg_double is passed two
+ * percpu variables. The first has to be aligned to a double word
+ * boundary and the second has to follow directly thereafter.
+ */
+#define __pcpu_double_call_return_int(stem, pcp1, pcp2, ...)		\
+({									\
+	int ret__;							\
+	__verify_pcpu_ptr(&pcp1);					\
+	VM_BUG_ON((unsigned long)(&pcp1) % (2 * sizeof(pcp1)));		\
+	VM_BUG_ON((unsigned long)(&pcp2) != (unsigned long)(&pcp1) + sizeof(pcp1));\
+	VM_BUG_ON(sizeof(pcp1) != sizeof(pcp2));			\
+	switch(sizeof(pcp1)) {						\
+	case 1: ret__ = stem##1(pcp1, pcp2, __VA_ARGS__);break;		\
+	case 2: ret__ = stem##2(pcp1, pcp2, __VA_ARGS__);break;		\
+	case 4: ret__ = stem##4(pcp1, pcp2, __VA_ARGS__);break;		\
+	case 8: ret__ = stem##8(pcp1, pcp2, __VA_ARGS__);break;		\
+	default:							\
+		__bad_size_call_parameter();break;			\
+	}								\
+	ret__;								\
+})
+
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -422,6 +445,80 @@ do {									\
 				__this_cpu_cmpxchg_, pcp, oval, nval)
 #endif
 
+/*
+ * cmpxchg_double replaces two adjacent scalars at once. The first two
+ * parameters are per cpu variables which have to be of the same size.
+ * A truth value is returned to indicate success or
+ * failure (since a double register result is difficult to handle).
+ * There is very limited hardware support for these operations. So only certain
+ * sizes may work.
+ */
+#define __this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+({									\
+	int __ret = 0;							\
+	if (__this_cpu_read(pcp1) == (oval1) &&				\
+			 __this_cpu_read(pcp2)  == (oval2)) {		\
+		__this_cpu_write(pcp1, (nval1));			\
+		__this_cpu_write(pcp2, (nval2));			\
+		__ret = 1;						\
+	}								\
+	(__ret);							\
+})
+
+#ifndef __this_cpu_cmpxchg_double
+# ifndef __this_cpu_cmpxchg_double_1
+#  define __this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_2
+#  define __this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_4
+#  define __this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_8
+#  define __this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# define __this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(__this_cpu_cmpxchg_double_, (pcp1), (pcp2),	\
+					 oval1, oval2, nval1, nval2)
+#endif
+
+#define _this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	preempt_disable();						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2,		\
+			oval1, oval2, nval1, nval2);			\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg_double
+# ifndef this_cpu_cmpxchg_double_1
+#  define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_2
+#  define this_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_4
+#  define this_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_8
+#  define this_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(this_cpu_cmpxchg_double_, (pcp1), (pcp2),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -825,4 +922,37 @@ do {									\
 # define irqsafe_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
 #endif
 
+#define irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+({									\
+	int ret__;							\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2,		\
+			oval1, oval2, nval1, nval2);			\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg_double
+# ifndef irqsafe_cpu_cmpxchg_double_1
+#  define irqsafe_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_2
+#  define irqsafe_cpu_cmpxchg_double_2(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_4
+#  define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_double_8
+#  define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)
+# endif
+# define irqsafe_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2)	\
+	__pcpu_double_call_return_int(irqsafe_cpu_cmpxchg_double_, (pcp1), (pcp2),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
 #endif /* __LINUX_PERCPU_H */


^ permalink raw reply	[flat|nested] 44+ messages in thread

* [cpuops cmpxchg double V2 2/4] x86: this_cpu_cmpxchg_double() support
  2011-01-06 20:45 [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
@ 2011-01-06 20:45 ` Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  3 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 20:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_x86 --]
[-- Type: text/plain, Size: 5207 bytes --]

Support this_cpu_cmpxchg_double using the cmpxchg16b and cmpxchg8b instructions.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   47 +++++++++++++++++++++++++++++++
 arch/x86/lib/Makefile         |    1 
 arch/x86/lib/cmpxchg16b_emu.S |   62 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2011-01-05 15:01:14.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2011-01-06 13:25:18.000000000 -0600
@@ -442,6 +442,26 @@ do {									\
 #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #endif /* !CONFIG_M386 */
 
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy = n2;					\
+	asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
+		    : "=a"(__ret), "=m" (pcp1), "=d"(__dummy)		\
+		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -477,6 +497,33 @@ do {									\
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
+/*
+ * Pretty complex macro to generate cmpxchg16 instruction. The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software. The address used in the cmpxchg16 instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#define percpu_cmpxchg16b(pcp1, o1, o2, n1, n2)				\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy;						\
+	alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4,	\
+			"cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t",	\
+			X86_FEATURE_CX16,				\
+		    	ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),	\
+		        "S" (&pcp1), "b"(__n1), "c"(__n2),		\
+			 "a"(__o1), "d"(__o2));				\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b(pcp1, o1, o2, n1, n2)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/lib/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile	2011-01-05 15:01:14.000000000 -0600
+++ linux-2.6/arch/x86/lib/Makefile	2011-01-05 15:01:17.000000000 -0600
@@ -42,4 +42,5 @@ else
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
 	lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+	lib-y += cmpxchg16b_emu.o
 endif
Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S	2011-01-05 15:01:17.000000000 -0600
@@ -0,0 +1,62 @@
+/*
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; version 2
+ *	of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
+# al not via the ZF. Caller will access al to get result.
+#
+# Note that this is only useful for a cpuops operation. Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of macros)
+#
+this_cpu_cmpxchg16b_emu:
+	pushf
+	cli
+
+	cmpq  %gs:(%rsi), %rax
+	jne not_same
+	cmpq %gs:8(%rsi), %rdx
+	jne not_same
+
+	movq %rbx,  %gs:(%rsi)
+	movq %rcx, %gs:8(%rsi)
+
+	popf
+	mov $1, %al
+	ret
+
+ not_same:
+	popf
+	xor  %al,%al
+	ret
+
+CFI_ENDPROC
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
+
+


^ permalink raw reply	[flat|nested] 44+ messages in thread

* [cpuops cmpxchg double V2 3/4] slub: Get rid of slab_free_hook_irq()
  2011-01-06 20:45 [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
@ 2011-01-06 20:45 ` Christoph Lameter
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  3 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 20:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: slub_remove_irq_freehook --]
[-- Type: text/plain, Size: 2269 bytes --]

The following patch will make the fastpaths lockless and will no longer
require interrupts to be disabled. Calling the free hook with irq disabled
will no longer be possible.

Move the slab_free_hook_irq() logic into slab_free_hook. Only disable
interrupts if the features are selected that require callbacks with
interrupts off and reenable after calls have been made.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/slub.c |   29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-12-14 10:45:11.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-12-14 10:46:40.000000000 -0600
@@ -805,14 +805,24 @@ static inline void slab_post_alloc_hook(
 static inline void slab_free_hook(struct kmem_cache *s, void *x)
 {
 	kmemleak_free_recursive(x, s->flags);
-}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
-	kmemcheck_slab_free(s, object, s->objsize);
-	debug_check_no_locks_freed(object, s->objsize);
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(object, s->objsize);
+	/*
+	 * Trouble is that we may no longer disable interrupts in the fast path
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->objsize);
+		debug_check_no_locks_freed(x, s->objsize);
+		if (!(s->flags & SLAB_DEBUG_OBJECTS))
+			debug_check_no_obj_freed(x, s->objsize);
+		local_irq_restore(flags);
+	}
+#endif
 }
 
 /*
@@ -1099,9 +1109,6 @@ static inline void slab_post_alloc_hook(
 
 static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
 
-static inline void slab_free_hook_irq(struct kmem_cache *s,
-		void *object) {}
-
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
@@ -1893,8 +1900,6 @@ static __always_inline void slab_free(st
 	local_irq_save(flags);
 	c = __this_cpu_ptr(s->cpu_slab);
 
-	slab_free_hook_irq(s, x);
-
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
 		c->freelist = object;


^ permalink raw reply	[flat|nested] 44+ messages in thread

* [cpuops cmpxchg double V2 4/4] Lockless (and preemptless) fastpaths for slub
  2011-01-06 20:45 [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support Christoph Lameter
                   ` (2 preceding siblings ...)
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
@ 2011-01-06 20:45 ` Christoph Lameter
  3 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 20:45 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_double_slub_fastpath --]
[-- Type: text/plain, Size: 12414 bytes --]

Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.

A transaction id is a "long" integer that is comprised of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. The cmpxchg instruction can therefore verify for an
update that nothing else has interfered and that we are updating the per cpu
structure for the processor where we initially picked up the information
and that we are also currently on that processor.

So there is no need even to disable preemption.

Test results show that the fastpath cycle count is reduced by up to ~ 40%
(alloc/free test goes from ~140 cycles down to ~80). The slowpath for kfree
adds a few cycles.

Sadly this does nothing for the slowpath which is where the main issues with
performance in slub are but the best case performance rises significantly.
(For that see the more complex slub patches that require cmpxchg_double)

Kmalloc: alloc/free test

Before:

10000 times kmalloc(8)/kfree -> 134 cycles
10000 times kmalloc(16)/kfree -> 152 cycles
10000 times kmalloc(32)/kfree -> 144 cycles
10000 times kmalloc(64)/kfree -> 142 cycles
10000 times kmalloc(128)/kfree -> 142 cycles
10000 times kmalloc(256)/kfree -> 132 cycles
10000 times kmalloc(512)/kfree -> 132 cycles
10000 times kmalloc(1024)/kfree -> 135 cycles
10000 times kmalloc(2048)/kfree -> 135 cycles
10000 times kmalloc(4096)/kfree -> 135 cycles
10000 times kmalloc(8192)/kfree -> 144 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

After:

10000 times kmalloc(8)/kfree -> 78 cycles
10000 times kmalloc(16)/kfree -> 78 cycles
10000 times kmalloc(32)/kfree -> 82 cycles
10000 times kmalloc(64)/kfree -> 88 cycles
10000 times kmalloc(128)/kfree -> 79 cycles
10000 times kmalloc(256)/kfree -> 79 cycles
10000 times kmalloc(512)/kfree -> 85 cycles
10000 times kmalloc(1024)/kfree -> 82 cycles
10000 times kmalloc(2048)/kfree -> 82 cycles
10000 times kmalloc(4096)/kfree -> 85 cycles
10000 times kmalloc(8192)/kfree -> 82 cycles
10000 times kmalloc(16384)/kfree -> 706 cycles


Kmalloc: Repeatedly allocate then free test

Before:

10000 times kmalloc(8) -> 211 cycles kfree -> 113 cycles
10000 times kmalloc(16) -> 174 cycles kfree -> 115 cycles
10000 times kmalloc(32) -> 235 cycles kfree -> 129 cycles
10000 times kmalloc(64) -> 222 cycles kfree -> 120 cycles
10000 times kmalloc(128) -> 343 cycles kfree -> 139 cycles
10000 times kmalloc(256) -> 827 cycles kfree -> 147 cycles
10000 times kmalloc(512) -> 1048 cycles kfree -> 272 cycles
10000 times kmalloc(1024) -> 2043 cycles kfree -> 528 cycles
10000 times kmalloc(2048) -> 4002 cycles kfree -> 571 cycles
10000 times kmalloc(4096) -> 7740 cycles kfree -> 628 cycles
10000 times kmalloc(8192) -> 8062 cycles kfree -> 850 cycles
10000 times kmalloc(16384) -> 8895 cycles kfree -> 1249 cycles

After:

10000 times kmalloc(8) -> 190 cycles kfree -> 129 cycles
10000 times kmalloc(16) -> 76 cycles kfree -> 123 cycles
10000 times kmalloc(32) -> 126 cycles kfree -> 124 cycles
10000 times kmalloc(64) -> 181 cycles kfree -> 128 cycles
10000 times kmalloc(128) -> 310 cycles kfree -> 140 cycles
10000 times kmalloc(256) -> 809 cycles kfree -> 165 cycles
10000 times kmalloc(512) -> 1005 cycles kfree -> 269 cycles
10000 times kmalloc(1024) -> 1999 cycles kfree -> 527 cycles
10000 times kmalloc(2048) -> 3967 cycles kfree -> 570 cycles
10000 times kmalloc(4096) -> 7658 cycles kfree -> 637 cycles
10000 times kmalloc(8192) -> 8111 cycles kfree -> 859 cycles
10000 times kmalloc(16384) -> 8791 cycles kfree -> 1173 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slub_def.h |    5 -
 mm/slub.c                |  205 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 207 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2011-01-05 11:41:50.000000000 -0600
+++ linux-2.6/include/linux/slub_def.h	2011-01-05 11:47:18.000000000 -0600
@@ -36,7 +36,10 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;	/* Pointer to next available object */
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;	/* Globally unique transaction id */
+#endif
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2011-01-05 11:47:16.000000000 -0600
+++ linux-2.6/mm/slub.c	2011-01-05 13:37:50.000000000 -0600
@@ -1492,6 +1492,77 @@ static void unfreeze_slab(struct kmem_ca
 	}
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef CONFIG_DEBUG_VM
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		printk("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		printk("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1523,6 +1594,9 @@ static void deactivate_slab(struct kmem_
 		page->inuse--;
 	}
 	c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+#endif
 	unfreeze_slab(s, page, tail);
 }
 
@@ -1657,6 +1731,19 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1683,6 +1770,10 @@ load_freelist:
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	c->tid = next_tid(c->tid);
+	local_irq_restore(flags);
+#endif
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1744,23 +1835,76 @@ static __always_inline void *slab_alloc(
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#else
+redo:
+#endif
+
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus we can guarantee that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+#endif
+
 	object = c->freelist;
 	if (unlikely(!object || !node_match(c, node)))
 
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+		/*
+		 * The cmpxchg will only match if there was no additional
+		 * operation and if we are on the right processor.
+		 *
+		 * The cmpxchg does the following atomically (without lock semantics!)
+		 * 1. Relocate first pointer to the current per cpu area.
+		 * 2. Verify that tid and freelist have not been changed
+		 * 3. If they were not changed replace tid and freelist
+		 *
+		 * Since this is without lock semantics the protection is only against
+		 * code executing on this cpu *not* from access by other cpus.
+		 */
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_alloc", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = get_freepointer(s, object);
+#endif
 		stat(s, ALLOC_FASTPATH);
 	}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1824,9 +1968,13 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long flags;
 
-	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
+#endif
 	slab_lock(page);
+	stat(s, FREE_SLOWPATH);
 
 	if (kmem_cache_debug(s))
 		goto debug;
@@ -1856,6 +2004,9 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1867,6 +2018,9 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+	local_irq_restore(flags);
+#endif
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1893,21 +2047,54 @@ static __always_inline void slab_free(st
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+	unsigned long tid;
+#else
 	unsigned long flags;
+#endif
 
 	slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_save(flags);
+#endif
+
+redo:
+	/*
+	 * Determine the current cpu's per cpu slab.
+	 * The cpu may change afterward. However that does not matter since
+	 * data is retrieved via this pointer. If we are on the same cpu
+	 * during the cmpxchg then the free will succeed.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	tid = c->tid;
+	barrier();
+#endif
+
 	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
 		set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+		if (unlikely(!this_cpu_cmpxchg_double(
+				s->cpu_slab->freelist, s->cpu_slab->tid,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+
+			note_cmpxchg_failure("slab_free", s, tid);
+			goto redo;
+		}
+#else
 		c->freelist = object;
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
 	local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2110,9 +2297,23 @@ static inline int alloc_kmem_cache_cpus(
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+	/*
+	 * Must align to double word boundary for the double cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+	/* Regular alignment is sufficient */
 	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+	if (!s->cpu_slab)
+		return 0;
 
-	return s->cpu_slab != NULL;
+	init_kmem_cache_cpus(s);
+
+	return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
@ 2011-01-06 21:08   ` Mathieu Desnoyers
  2011-01-06 21:43     ` Christoph Lameter
  2011-01-06 22:05   ` H. Peter Anvin
  1 sibling, 1 reply; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-06 21:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin

* Christoph Lameter (cl@linux.com) wrote:
[...]
> + * cmpxchg_double replaces two adjacent scalars at once. The first two
> + * parameters are per cpu variables which have to be of the same size.
> + * A truth value is returned to indicate success or
> + * failure (since a double register result is difficult to handle).
> + * There is very limited hardware support for these operations. So only certain
> + * sizes may work.

What's the issue with returning the value read by cmpxchg_double in addition to
the boolean ? "returning" it per se might be an issue, but you could add 2 more
parameters to the macros that take the addresses of these outputs. Returning the
values read by cmpxchg instead of just the boolean result helps removing the
extra reads from cmpxchg loops, which is why I think it's a shame to just return
the boolean result.

Thoughts ?

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-06 21:08   ` Mathieu Desnoyers
@ 2011-01-06 21:43     ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-06 21:43 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin

On Thu, 6 Jan 2011, Mathieu Desnoyers wrote:

> * Christoph Lameter (cl@linux.com) wrote:
> [...]
> > + * cmpxchg_double replaces two adjacent scalars at once. The first two
> > + * parameters are per cpu variables which have to be of the same size.
> > + * A truth value is returned to indicate success or
> > + * failure (since a double register result is difficult to handle).
> > + * There is very limited hardware support for these operations. So only certain
> > + * sizes may work.
>
> What's the issue with returning the value read by cmpxchg_double in addition to
> the boolean ? "returning" it per se might be an issue, but you could add 2 more
> parameters to the macros that take the addresses of these outputs. Returning the
> values read by cmpxchg instead of just the boolean result helps removing the
> extra reads from cmpxchg loops, which is why I think it's a shame to just return
> the boolean result.
>
> Thoughts ?

In the use cases so far the cmpxchg_double is usually successful. If
cmpxchg_double succeeds then we know what was in those memory locations
before the operation.

Returning the data is only interesting for the case where the
cmpxchg_double fails. We'd be optimizing for a very rare case.

cmpxchg_double is here not used for locking (where the mismatching value
is of importance in the failure case and where we spin frequently) but
for other types of serialization like the transactions id here. Note that
this matches *two* types of information. Each mismatch on its own would
require specific handling if we want to consider operating on the
mismatched values.

this_cpu_cmpxchg also includes the relocation relative to the current per
cpu area which is a third data item to keep in mind. A failure may be due
to having been rescheduled to another cpu. Sorting all of that out is not
worth it IMHO.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
  2011-01-06 21:08   ` Mathieu Desnoyers
@ 2011-01-06 22:05   ` H. Peter Anvin
  2011-01-07 15:15     ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-06 22:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On 01/06/2011 12:45 PM, Christoph Lameter wrote:
> Introduce this_cpu_cmpxchg_double. this_cpu_cmpxchg_double() allows the
> comparison between two consecutive words and replaces them if there is
> a match.
> 
> 	bool this_cpu_cmpxchg_double(pcp1, pcp2,
> 		old_word1, old_word2, new_word1, new_word2)
> 
> this_cpu_cmpxchg_double does not return the old value (difficult since
> there are two words) but a boolean indicating if the operation was
> successful.
> 
> The first percpu variable must be double word aligned!

I really truly hate this interface.  The whole notion of passing two
pointers where only one is really used, is just painful.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-06 22:05   ` H. Peter Anvin
@ 2011-01-07 15:15     ` Christoph Lameter
  2011-01-07 18:04       ` Mathieu Desnoyers
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-07 15:15 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

On Thu, 6 Jan 2011, H. Peter Anvin wrote:

> On 01/06/2011 12:45 PM, Christoph Lameter wrote:
> > Introduce this_cpu_cmpxchg_double. this_cpu_cmpxchg_double() allows the
> > comparison between two consecutive words and replaces them if there is
> > a match.
> >
> > 	bool this_cpu_cmpxchg_double(pcp1, pcp2,
> > 		old_word1, old_word2, new_word1, new_word2)
> >
> > this_cpu_cmpxchg_double does not return the old value (difficult since
> > there are two words) but a boolean indicating if the operation was
> > successful.
> >
> > The first percpu variable must be double word aligned!
>
> I really truly hate this interface.  The whole notion of passing two
> pointers where only one is really used, is just painful.

Well the second cpu variable is just there for correctness checking. That
way we can verify that the types are compatible for the second word and
that the second word was properly placed relative to the first one. It
also helps the reader in the source because it shows explicitly which two
words are modified by the operation.

If you look at the prior patch series at the use case you will see that
s->cpu_slab->tid was not referred to in the cmpxchg operation but
implicitly modified. That is bad and was the initial motivation to require
both to be specified.

During debugging I had a couple of problems with the way the compiler
placed those two fields that only became evident after painful debugging
sessions. With the checks possible only because both are specified there
is an explicit failure if either of the two cpu variables is specified in
such a way that the cmpxchg16b cannot work.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-07 15:15     ` Christoph Lameter
@ 2011-01-07 18:04       ` Mathieu Desnoyers
  2011-01-07 18:41         ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-07 18:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet

* Christoph Lameter (cl@linux.com) wrote:
> On Thu, 6 Jan 2011, H. Peter Anvin wrote:
> 
> > On 01/06/2011 12:45 PM, Christoph Lameter wrote:
> > > Introduce this_cpu_cmpxchg_double. this_cpu_cmpxchg_double() allows the
> > > comparision between two consecutive words and replaces them if there is
> > > a match.
> > >
> > > 	bool this_cpu_cmpxchg_double(pcp1, pcp2,
> > > 		old_word1, old_word2, new_word1, new_word2)
> > >
> > > this_cpu_cmpxchg_double does not return the old value (difficult since
> > > there are two words) but a boolean indicating if the operation was
> > > successful.
> > >
> > > The first percpu variable must be double word aligned!
> >
> > I really truly hate this interface.  The whole notion of passing two
> > pointers where only one is really used, is just painful.
> 
> Well the second cpu variable is just there for correctness checking. That
> way we can verify that the types are compatible for the second word and
> that the second word was properly placed relative to the first one. It
> also helps the reader in the source because it shows explicitly which two
> words are modified by the operation.
> 
> If you look at the prior patch series at the use case you will see that
> s->cpu_slab->tid was not referred to in the cmpxchg operation but
> implicitly modified. That is bad and was the initial motivation to require
> both to be specified.
> 
> During debugging I had a couple of problems with the way the compiler
> placed those two fields that only became evident after painful debugging
> sessions. With the checks possible only because both are specified there
> is an explicit failure if either of the two cpu variables is specified in
> such a way that the cmpxchg16b cannot work.

I have to admit that I also hate the current interface for two reasons:

a) the way it splits the target address in two.

b) the loss of the value read (the fact that the only current user of this API
   does not need the value returned seems like a very weak argument to define an
   API).

I'm probably missing important compiler trickiness here, but why are we not
rather relying on forced compiler alignment to declare the target address type ?
e.g.:

typedef struct {
        unsigned long v[2];
} __attribute__((aligned(2 * sizeof(long)))) cmpxchg_double_t;

For the usage you are doing within the allocator code, you can use an union (the
bitfield sizes are completely arbitrary).

union alloc_cd {
        cmpxchg_double_t cd;
        struct {
                void *ptr;
                unsigned long seq:48;
                unsigned long cpu:16;
        } v;
};

So this_cpu_cmpxchg_double would expect the type "cmpxchg_double_t *" as its
first argument.

One way to manage to return the value read would be to pass a pointer to
"old_word1-2" as argument rather than the value itself, and return the value
read in these locations. Anyway, we won't need the old words for comparison with
the return value, because the returned "bool" takes care of telling us if they
differ.

Adding the types just for clarity sake (should be handled differently in your
macro scheme of course), this_cpu_cmpxchg_double would look like:

typedef struct {
        unsigned long v[2];
} __attribute__((aligned(2 * sizeof(long)))) cmpxchg_double_t;

bool this_cpu_cmpxchg_double(cmpxchg_double_t *pcp,
                             unsigned long *old_ret_word1,
                             unsigned long *old_ret_word2,
                             unsigned long new_word1,
                             unsigned long new_word2)

Thoughts ?

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-07 18:04       ` Mathieu Desnoyers
@ 2011-01-07 18:41         ` Christoph Lameter
  2011-01-08 17:24           ` Tejun Heo
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-07 18:41 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: H. Peter Anvin, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet

On Fri, 7 Jan 2011, Mathieu Desnoyers wrote:

> I have to admit that I also hate the current interface for two reasons:
>
> a) the way it splits the target address in two.

We are handling two different entities that are placed next to each other.

> b) the loss of the value read (the fact that the only current user of this API
>    does not need the value returned seems like a very weak argument to define an
>    API).

The other user of cmpxchg_double that I have in my tree also does not have
the need. Repeatability is not as simple to implement as with a single
word cmpxchg.

> I'm probably missing important compiler trickiness here, but why are we not
> rather relying on forced compiler alignment to declare the target address type ?
> e.g.:
>
> typedef struct {
>         unsigned long v[2];
> } __attribute__((aligned(2 * sizeof(long)))) cmpxchg_double_t;
>
> For the usage you are doing within the allocator code, you can use an union (the
> bitfield sizes are completely arbitrary).
>
> union alloc_cd {
>         cmpxchg_double_t cd;
>         struct {
>                 void *ptr;
>                 unsigned long seq:48;
>                 unsigned long cpu:16;
>         } v;
> };
>
> So this_cpu_cmpxchg_double would expect the type "cmpxchg_double_t *" as its
> first argument.

That requires conversions back and forth between words and this
type. We only need the bonding between the two per cpu variables for the
single cmpxchg_double operation. There is no need to do this for the two
old values or the two new values.

> One way to manage to return the value read would be to pass a pointer to
> "old_word1-2" as argument rather than the value itself, and return the value
> read in these locations. Anyway, we won't need the old words for comparison with
> the return value, because the returned "bool" takes care of telling us if they
> differ.

That would require allocating storage for the two word variable required,
passing an address etc. All not necessary right now. Would increase the
latency of the inner loop. Saving the returned values also would increase
the code uselessly.

> Adding the types just for clarity sake (should be handled differently in your
> macro scheme of course), this_cpu_cmpxchg_double would look like:
>
> typedef struct {
>         unsigned long v[2];
> } __attribute__((aligned(2 * sizeof(long)))) cmpxchg_double_t;
>
> bool this_cpu_cmpxchg_double(cmpxchg_double_t *pcp,
>                              unsigned long *old_ret_word1,
>                              unsigned long *old_ret_word2,
>                              unsigned long new_word1,
>                              unsigned long new_word2)
>
> Thoughts ?

Looks not symmetric. The proposed approach in the patch has a clear
association of the first and second percpu variable with the first and
second old and new values.

Also cmpxchg_double may be used with different types and different object
sizes. Are we going to define cmpxchg_double_t for all types supported
and maybe created by the user? this_cpu_cmpxchg_double also supports two 4
byte fields.

Plus you can have mixed types. In the current use case you have one
integer and one pointer. So the cmpxchg_double_t would not fit in cleanly.
We would have to define custom types. And with that we will then have
difficulties coding methods to check the compatibility of types.

I tried something like this before I came up with the current
solution. It was significantly more code and the code was difficult
to understand. It was never as clean as this one.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-07 18:41         ` Christoph Lameter
@ 2011-01-08 17:24           ` Tejun Heo
  2011-01-09  8:33             ` Pekka Enberg
  2011-01-21  7:31             ` Pekka Enberg
  0 siblings, 2 replies; 44+ messages in thread
From: Tejun Heo @ 2011-01-08 17:24 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Mathieu Desnoyers, H. Peter Anvin, akpm, Pekka Enberg,
	linux-kernel, Eric Dumazet

Hello,

On Fri, Jan 07, 2011 at 12:41:58PM -0600, Christoph Lameter wrote:
> On Fri, 7 Jan 2011, Mathieu Desnoyers wrote:
> 
> > I have to admit that I also hate the current interface for two reasons:

Call me weird but I like this one better than the others.  It sure is ugly but
the operation itself isn't particularly pretty, so it kinda matches.
Also, this one is the least error prone and more consistent with other
cpu ops.

> > b) the loss of the value read (the fact that the only current user of this API
> >    does not need the value returned seems like a very weak argument to define an
> >    API).
> 
> The other user of cmpxchg_double that I have in my tree also does not have
> the need. Repeatability is not as simple to implement as with a single
> word cmpxchg.

Yeah, even in regular cmpxchg, the read value on failure isn't of very
high value.  The cpu usually has to go retry anyway && likely to have
the cacheline already, so it's not gonna cost much.

So, yeah, of the proposed ones, this is my favorite.  Peter and
Mathieu don't like it.  What do others think?  Pekka, Eric, Andrew,
what do you guys think?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-08 17:24           ` Tejun Heo
@ 2011-01-09  8:33             ` Pekka Enberg
  2011-01-21  7:31             ` Pekka Enberg
  1 sibling, 0 replies; 44+ messages in thread
From: Pekka Enberg @ 2011-01-09  8:33 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Mathieu Desnoyers, H. Peter Anvin, akpm,
	linux-kernel, Eric Dumazet

On Sat, 2011-01-08 at 12:24 -0500, Tejun Heo wrote:
> Hello,
> 
> On Fri, Jan 07, 2011 at 12:41:58PM -0600, Christoph Lameter wrote:
> > On Fri, 7 Jan 2011, Mathieu Desnoyers wrote:
> > 
> > > I have to admit that I also hate the current interface for two reasons:
> 
> Call me weird but I like this one than others.  It sure is ugly but
> the operation itself isn't a particularly pretty so it kinda matches.
> Also, this one is the least error prone and more consistent with other
> cpu ops.
> 
> > > b) the loss of the value read (the fact that the only current user of this API
> > >    does not need the value returned seems like a very weak argument to define an
> > >    API).
> > 
> > The other user of cmpxchg_double that I have in my tree also does not have
> > the need. Repeatability is not as simple to implement as with a single
> > word cmpxchg.
> 
> Yeah, even in regular cmpxchg, the read value on failure isn't of very
> high value.  The cpu usually has to go retry anyway && likely to have
> the cacheline already, so it's not gonna cost much.
> 
> So, yeah, of the proposed ones, this is my favorite.  Peter and
> Mathieu don't like it.  What do others think?  Pekka, Eric, Andrew,
> what do you guys think?

I'm not a huge fan of the API either but like you, I like it best from
the proposed ones.

			Pekka


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-08 17:24           ` Tejun Heo
  2011-01-09  8:33             ` Pekka Enberg
@ 2011-01-21  7:31             ` Pekka Enberg
  2011-01-21  9:26               ` Tejun Heo
  1 sibling, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2011-01-21  7:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Mathieu Desnoyers, H. Peter Anvin, akpm,
	linux-kernel, Eric Dumazet

On 1/8/11 7:24 PM, Tejun Heo wrote:
> Hello,
>
> On Fri, Jan 07, 2011 at 12:41:58PM -0600, Christoph Lameter wrote:
>> On Fri, 7 Jan 2011, Mathieu Desnoyers wrote:
>>
>>> I have to admit that I also hate the current interface for two reasons:
>
> Call me weird but I like this one than others.  It sure is ugly but
> the operation itself isn't a particularly pretty so it kinda matches.
> Also, this one is the least error prone and more consistent with other
> cpu ops.

So what are we going to do about this patch? I'd love to merge 
Christoph's SLUB patches for linux-next now that .38-rc1 is out.

			Pekka

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21  7:31             ` Pekka Enberg
@ 2011-01-21  9:26               ` Tejun Heo
  2011-01-21 15:31                 ` H. Peter Anvin
  0 siblings, 1 reply; 44+ messages in thread
From: Tejun Heo @ 2011-01-21  9:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Christoph Lameter, Mathieu Desnoyers, H. Peter Anvin, akpm,
	linux-kernel, Eric Dumazet

Hello,

On Fri, Jan 21, 2011 at 09:31:02AM +0200, Pekka Enberg wrote:
> On 1/8/11 7:24 PM, Tejun Heo wrote:
> >Call me weird but I like this one than others.  It sure is ugly but
> >the operation itself isn't a particularly pretty so it kinda matches.
> >Also, this one is the least error prone and more consistent with other
> >cpu ops.
> 
> So what are we going to do about this patch? I'd love to merge
> Christoph's SLUB patches for linux-next now that .38-rc1 is out.

At least you like it, which is good.

I don't think the currently proposed one with two separate parameters
is significantly better than other alternatives and vice-versa.  They
all have slightly different ugliness and error proneness issues.

That said, I still like the double parameter one best, and, unless
there are distinctively good reasons to choose another one, I'm gonna
commit it to percpu tree in a few days so that merge can proceed.  So,
if you have something to say, now would be a good time to assert it.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21  9:26               ` Tejun Heo
@ 2011-01-21 15:31                 ` H. Peter Anvin
  2011-01-21 15:48                   ` Tejun Heo
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-21 15:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pekka Enberg, Christoph Lameter, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

On 01/21/2011 01:26 AM, Tejun Heo wrote:
> Hello,
> 
> On Fri, Jan 21, 2011 at 09:31:02AM +0200, Pekka Enberg wrote:
>> On 1/8/11 7:24 PM, Tejun Heo wrote:
>>> Call me weird but I like this one than others.  It sure is ugly but
>>> the operation itself isn't a particularly pretty so it kinda matches.
>>> Also, this one is the least error prone and more consistent with other
>>> cpu ops.
>>
>> So what are we going to do about this patch? I'd love to merge
>> Christoph's SLUB patches for linux-next now that .38-rc1 is out.
> 
> At least you like it, which is good.
> 
> I don't think the currently proposed one with two separate parameters
> is significantly better than other alternatives and vice-versa.  They
> all have slightly different ugliness and error proneness issues.
> 
> That said, I still like the double parameter one best, and, unless
> there are distinctively good reasons to choose another one, I'm gonna
> commit it to percpu tree in a few days so that merge can proceed.  So,
> if you have something to say, now would be a good time to assert it.
> 

I really object to passing two pointers where one of them has to be a
fixed offset to the other.  That really doesn't make any sense.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 15:31                 ` H. Peter Anvin
@ 2011-01-21 15:48                   ` Tejun Heo
  2011-01-21 16:30                     ` H. Peter Anvin
  2011-01-21 16:54                     ` Mathieu Desnoyers
  0 siblings, 2 replies; 44+ messages in thread
From: Tejun Heo @ 2011-01-21 15:48 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Pekka Enberg, Christoph Lameter, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

Hello, Peter.

On Fri, Jan 21, 2011 at 07:31:55AM -0800, H. Peter Anvin wrote:
> I really object to passing two pointers where one of them has to be a
> fixed offset to the other.  That really doesn't make any sense.

Yeah, I hear you, but it really comes down to which ugliness disgusts
one the most.  That, unfortunately, is inherently very subjective when
there's no significantly better choice.

For me, the double parameter thing at least seems to have the
advantages of being able to verify the two intended memory locations
to be used actually are together and looking ugly reflecting its true
nature.

The inherent ugliness stems from the fact that we don't have the
built-in data type to properly deal with this.  Array of length two
might be better fit, but I can see as many downsides with that too.

So, if anyone can give something clearly better for technical reasons,
I'll be more than happy to take it, but as it currently stands, it
seems we'll have to choose one among uglies and not everyone would be
happy about the choice.  :-(

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 15:48                   ` Tejun Heo
@ 2011-01-21 16:30                     ` H. Peter Anvin
  2011-01-21 16:34                       ` Tejun Heo
  2011-01-21 16:54                     ` Mathieu Desnoyers
  1 sibling, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-21 16:30 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pekka Enberg, Christoph Lameter, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

There is a technical argument: any noninline version will have actual code overhead.

"Tejun Heo" <tj@kernel.org> wrote:

>Hello, Peter.
>
>On Fri, Jan 21, 2011 at 07:31:55AM -0800, H. Peter Anvin wrote:
>> I really object to passing two pointers where one of them has to be a
>> fixed offset to the other.  That really doesn't make any sense.
>
>Yeah, I hear you, but it really comes down to which ugliness disgusts
>one the most.  That, unfortunately, is inherently very subjective when
>there's no significantly better choice.
>
>For me, the double parameter thing at least seems to have the
>advantages of being able to verify the two intended memory locations
>to be used actually are together and looking ugly reflecting its true
>nature.
>
>The inherent ugliness stems from the fact that we don't have the
>built-in data type to properly deal with this.  Array of length two
>might be better fit, but I can see as many downsides with that too.
>
>So, if anyone can give something clearly better for technical reasons,
>I'll be more than happy to take it, but as it currently stands, it
>seems we'll have to choose one among uglies and not everyone would be
>happy about the choice.  :-(
>
>Thanks.
>
>-- 
>tejun

-- 
Sent from my mobile phone.  Please pardon any lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 16:30                     ` H. Peter Anvin
@ 2011-01-21 16:34                       ` Tejun Heo
  0 siblings, 0 replies; 44+ messages in thread
From: Tejun Heo @ 2011-01-21 16:34 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Pekka Enberg, Christoph Lameter, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

On Fri, Jan 21, 2011 at 08:30:41AM -0800, H. Peter Anvin wrote:
> There is a technical argument: any noninline version will have
> actual code overhead.

But the type checking and all those stuff have to be macros anyway.
If out-of-line implementation is necessary, it can take whatever
parameter to optimize performance from the frontend macro.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 15:48                   ` Tejun Heo
  2011-01-21 16:30                     ` H. Peter Anvin
@ 2011-01-21 16:54                     ` Mathieu Desnoyers
  2011-01-21 17:07                       ` Christoph Lameter
  2011-01-21 17:08                       ` Tejun Heo
  1 sibling, 2 replies; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 16:54 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Pekka Enberg, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

* Tejun Heo (tj@kernel.org) wrote:
> Hello, Peter.
> 
> On Fri, Jan 21, 2011 at 07:31:55AM -0800, H. Peter Anvin wrote:
> > I really object to passing two pointers where one of them has to be a
> > fixed offset to the other.  That really doesn't make any sense.
> 
> Yeah, I hear you, but it really comes down to which ugliness disgusts
> one the most.  That, unfortunately, is inherently very subjective when
> there's no significantly better choice.
> 
> For me, the double parameter thing at least seems to have the
> advantages of being able to verify the two intended memory locations
> to be used actually are together and looking ugly reflecting its true
> nature.
> 
> The inherent ugliness stems from the fact that we don't have the
> built-in data type to properly deal with this.  Array of length two
> might be better fit, but I can see as many downsides with that too.
> 
> So, if anyone can give something clearly better for technical reasons,
> I'll be more than happy to take it, but as it currently stands, it
> seems we'll have to choose one among uglies and not everyone would be
> happy about the choice.  :-(

Quoting Christoph, from the previous exchange:

"The single large 128 bit scalar does not work. Having to define an
additional structure it also a bit clumsy. I think its best to get another
patchset out that also duplicates the first parameter and makes the percpu
variable specs conform to the other this_cpu ops."

I'm again probably missing something, but what is "clumsy" about defining a
structure like the following to ensure proper alignment of the target
pointer (instead of adding a runtime test) ?

struct cmpxchg_double {
#if __BYTE_ORDER == __LITTLE_ENDIAN
        unsigned long low, high;
#else
        unsigned long high, low;
#endif
} __attribute__((packed, aligned(2 * sizeof(unsigned long))));

(note: packed here along with "aligned" does _not_ generate ugly bytewise
read/write memory ops like "packed" alone. The use of "packed" is to let the
compiler down-align the structure to the value requested, instead of uselessly
aligning it on 32-byte if it chooses to.)

The prototype could then look like:

bool __this_cpu_generic_cmpxchg_double(pcp, oval_low, oval_high, nval_low, nval_high);

With:
  struct cmpxchg_double *pcp

I think Christoph's point is that he wants to alias this with a pointer. Well,
this can be done cleanly with:

union {
        struct cmpxchg_double casdbl;
        struct {
                void *ptr;
                unsigned long cpuid_tid;
        } t;
}

So by keeping distinct variables for the oval/nal arguments, we let the compiler
use registers (instead of the mandatory stack use that would be required if we
pass union or structures as oval/nval arguments), but we ensure proper alignment
(and drop the unneeded second pointer, as well as the runtime pointer alignment
checks) by passing one single pcp pointer of a fixed type with a known
alignment.

Thoughts ?

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 16:54                     ` Mathieu Desnoyers
@ 2011-01-21 17:07                       ` Christoph Lameter
  2011-01-21 17:50                         ` Mathieu Desnoyers
  2011-01-21 17:08                       ` Tejun Heo
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 17:07 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Tejun Heo, H. Peter Anvin, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:

> I'm again probably missing something, but what is "clumsy" about defining a
> structure like the following to ensure proper alignment of the target
> pointer (instead of adding a runtime test) ?
>
> struct cmpxchg_double {
> #if __BYTE_ORDER == __LITTLE_ENDIAN
>         unsigned long low, high;
> #else
>         unsigned long high, low;
> #endif
> } __attribute__((packed, aligned(2 * sizeof(unsigned long))));
>
> (note: packed here along with "aligned" does _not_ generate ugly bytewise
> read/write memory ops like "packed" alone. The use of "packed" is to let the
> compiler down-align the structure to the value requested, instead of uselessly
> aligning it on 32-byte if it chooses to.)
>
> The prototype could then look like:
>
> bool __this_cpu_generic_cmpxchg_double(pcp, oval_low, oval_high, nval_low, nval_high);
>
> With:
>   struct cmpxchg_double *pcp

That does not conform to the parameter conventions in other this_cpu_ops.
The first parameter is a variable because the notion of a pointer is
problematic given that percpu operations use a segment prefix to relocate
pointers. You would be implicitly passing a 128 bit argument although the
compiler may not need to generate code for that.

> I think Christoph's point is that he wants to alias this with a pointer. Well,
> this can be done cleanly with:
>
> union {
>         struct cmpxchg_double casdbl;
>         struct {
>                 void *ptr;
>                 unsigned long cpuid_tid;
>         } t;
> }

There is no need for aliases with the existing implementation.

How will the macro check the parameters now?

> Thoughts ?

Could you actually try to write a patch instead of running through points
that we have discussed earlier?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 16:54                     ` Mathieu Desnoyers
  2011-01-21 17:07                       ` Christoph Lameter
@ 2011-01-21 17:08                       ` Tejun Heo
  2011-01-21 17:13                         ` H. Peter Anvin
  2011-01-21 19:32                         ` Mathieu Desnoyers
  1 sibling, 2 replies; 44+ messages in thread
From: Tejun Heo @ 2011-01-21 17:08 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: H. Peter Anvin, Pekka Enberg, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

Hello,

On Fri, Jan 21, 2011 at 11:54:25AM -0500, Mathieu Desnoyers wrote:
> "The single large 128 bit scalar does not work. Having to define an
> additional structure it also a bit clumsy. I think its best to get
> another patchset out that also duplicates the first parameter and
> makes the percpu variable specs conform to the other this_cpu ops."
>
> I'm again probably missing something, but what is "clumsy" about
> defining a structure like the following to ensure proper alignment
> of the target pointer (instead of adding a runtime test) ?
> 
> struct cmpxchg_double {
> #if __BYTE_ORDER == __LITTLE_ENDIAN
>         unsigned long low, high;
> #else
>         unsigned long high, low;
> #endif
> } __attribute__((packed, aligned(2 * sizeof(unsigned long))));
>
> (note: packed here along with "aligned" does _not_ generate ugly
> bytewise read/write memory ops like "packed" alone. The use of
> "packed" is to let the compiler down-align the structure to the
> value requested, instead of uselessly aligning it on 32-byte if it
> chooses to.)

Yeah, good point.  :-)

> The prototype could then look like:
> 
> bool __this_cpu_generic_cmpxchg_double(pcp, oval_low, oval_high, nval_low, nval_high);
> 
> With:
>   struct cmpxchg_double *pcp
> 
> I think Christoph's point is that he wants to alias this with a pointer. Well,
> this can be done cleanly with:
> 
> union {
>         struct cmpxchg_double casdbl;
>         struct {
>                 void *ptr;
>                 unsigned long cpuid_tid;
>         } t;
> }
> 
> So by keeping distinct variables for the oval/nal arguments, we let
> the compiler use registers (instead of the mandatory stack use that
> would be required if we pass union or structures as oval/nval
> arguments), but we ensure proper alignment (and drop the unneeded
> second pointer, as well as the runtime pointer alignment checks) by
> passing one single pcp pointer of a fixed type with a known
> alignment.

Actually, we might be able to work around the stack usage regardless
of the passed type.  We should be able to decompose the structure or
ulonglong using macro and then pass the decomposed parameters to a
function if necessary.

Hmm, if we do ulonglong, we don't even need a separate interface and
can simply use cmpxchg().  If someone can come up with something which
uses ulonglong that way, it would be great; then, we wouldn't need to
worry about alignment either.  Am I missing something?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:08                       ` Tejun Heo
@ 2011-01-21 17:13                         ` H. Peter Anvin
  2011-01-21 17:19                           ` Tejun Heo
  2011-01-21 17:24                           ` Christoph Lameter
  2011-01-21 19:32                         ` Mathieu Desnoyers
  1 sibling, 2 replies; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-21 17:13 UTC (permalink / raw)
  To: Tejun Heo, Mathieu Desnoyers
  Cc: Pekka Enberg, Christoph Lameter, akpm, linux-kernel, Eric Dumazet

We could do cmpxchg with a structure... the problem with a long int type is that Christoph ran into bugs with __int128 on 64 bits.

"Tejun Heo" <tj@kernel.org> wrote:

>Hello,
>
>On Fri, Jan 21, 2011 at 11:54:25AM -0500, Mathieu Desnoyers wrote:
>> "The single large 128 bit scalar does not work. Having to define an
>> additional structure it also a bit clumsy. I think its best to get
>> another patchset out that also duplicates the first parameter and
>> makes the percpu variable specs conform to the other this_cpu ops."
>>
>> I'm again probably missing something, but what is "clumsy" about
>> defining a structure like the following to ensure proper alignment
>> of the target pointer (instead of adding a runtime test) ?
>> 
>> struct cmpxchg_double {
>> #if __BYTE_ORDER == __LITTLE_ENDIAN
>>         unsigned long low, high;
>> #else
>>         unsigned long high, low;
>> #endif
>> } __attribute__((packed, aligned(2 * sizeof(unsigned long))));
>>
>> (note: packed here along with "aligned" does _not_ generate ugly
>> bytewise read/write memory ops like "packed" alone. The use of
>> "packed" is to let the compiler down-align the structure to the
>> value requested, instead of uselessly aligning it on 32-byte if it
>> chooses to.)
>
>Yeah, good point.  :-)
>
>> The prototype could then look like:
>> 
>> bool __this_cpu_generic_cmpxchg_double(pcp, oval_low, oval_high,
>nval_low, nval_high);
>> 
>> With:
>>   struct cmpxchg_double *pcp
>> 
>> I think Christoph's point is that he wants to alias this with a
>pointer. Well,
>> this can be done cleanly with:
>> 
>> union {
>>         struct cmpxchg_double casdbl;
>>         struct {
>>                 void *ptr;
>>                 unsigned long cpuid_tid;
>>         } t;
>> }
>> 
>> So by keeping distinct variables for the oval/nal arguments, we let
>> the compiler use registers (instead of the mandatory stack use that
>> would be required if we pass union or structures as oval/nval
>> arguments), but we ensure proper alignment (and drop the unneeded
>> second pointer, as well as the runtime pointer alignment checks) by
>> passing one single pcp pointer of a fixed type with a known
>> alignment.
>
>Actually, we might be able to work around the stack usage regardless
>of the passed type.  We should be able to decompose the structure or
>ulonglong using macro and then pass the decomposed parameters to a
>function if necessary.
>
>Hmm, if we do ulonglong, we don't even need a separate interface and
>can simply use cmpxchg().  If someone can come up with something which
>uses ulonglong that way, it would be great; then, we wouldn't need to
>worry about alignment either.  Am I missing something?
>
>Thanks.
>
>-- 
>tejun

-- 
Sent from my mobile phone.  Please pardon any lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:13                         ` H. Peter Anvin
@ 2011-01-21 17:19                           ` Tejun Heo
  2011-01-24  6:01                             ` H. Peter Anvin
  2011-01-21 17:24                           ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Tejun Heo @ 2011-01-21 17:19 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Mathieu Desnoyers, Pekka Enberg, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

On Fri, Jan 21, 2011 at 09:13:41AM -0800, H. Peter Anvin wrote:
> We could do cmpxchg with a structure... the problem with a lon int
> type is that Cristoph ran into bugs with __int128 on 64 bits.

But, IIRC, the problem with int128 was with passing it as parameter
and return value.  We don't have to do that.  We'll be just using it
as a data storage / container type.  Or even that is broken?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:13                         ` H. Peter Anvin
  2011-01-21 17:19                           ` Tejun Heo
@ 2011-01-21 17:24                           ` Christoph Lameter
  2011-01-21 17:42                             ` Mathieu Desnoyers
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 17:24 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, Mathieu Desnoyers, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, H. Peter Anvin wrote:

> We could do cmpxchg with a structure... the problem with a lon int type is that Cristoph ran into bugs with __int128 on 64 bits.

We also would need to pass the structure by value (well its really a
variable but its like passing by value) in order to be similar to the
other this_cpu_ops

You'd want either

DEFINE_PERCPU(struct mycustomdoublestruct, percpu_dd)

this_cpu_cmpxchg_double(percpu_dd, oldword1, oldword2, newword1, newword2)

with the problem of type checking

or

this_cpu_cmpxchg_double(percpu_dd, old_dd, new_dd)

with the problem of 128 bit constants/structs passed by value.





^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:24                           ` Christoph Lameter
@ 2011-01-21 17:42                             ` Mathieu Desnoyers
  2011-01-21 17:50                               ` Christoph Lameter
  2011-01-21 18:31                               ` H. Peter Anvin
  0 siblings, 2 replies; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 17:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, Tejun Heo, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

* Christoph Lameter (cl@linux.com) wrote:
> On Fri, 21 Jan 2011, H. Peter Anvin wrote:
> 
> > We could do cmpxchg with a structure... the problem with a lon int type is that Cristoph ran into bugs with __int128 on 64 bits.
> 
> We also would need to be pass the structure by value (well its really a
> variable but its like passing by value) in order to be similar to the
> other this_cpu_ops
> 
> You'd want either
> 
> DEFINE_PERCPU(struct mycustomdoublestruct, percpu_dd)
> 
> this_cpu_cmpxchg_double(percpu_dd, oldword1, oldword2, newword1, newword2)
> 
> with the problem of type checking

What is the problem with type checking here ?

We could use a gcc builtin like the following to check if the alignment of
"percpu_dd" meets the double-cas requirements:

#define testmacro(a, b) \
        __builtin_choose_expr(__alignof__(a) >= 2 * sizeof(unsigned long), \
                              ((a).low) = (b), \    /* success */
                              ((a).low) = (void) 0) /* compile-error */

> or
> 
> this_cpu_cmpxchg_double(percpu_dd, old_dd, new_dd)
> 
> with the problem of 128 bit constants/structs passed by value.

Yeah, I guess trying to deal with 128-bit value might be a bit tricky. But
having something sane and with compile-time-checked alignment for the percpu_dd
type is not to be looked over.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:42                             ` Mathieu Desnoyers
@ 2011-01-21 17:50                               ` Christoph Lameter
  2011-01-21 18:10                                 ` Mathieu Desnoyers
  2011-01-21 18:31                               ` H. Peter Anvin
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 17:50 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: H. Peter Anvin, Tejun Heo, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:

> > this_cpu_cmpxchg_double(percpu_dd, oldword1, oldword2, newword1, newword2)
> >
> > with the problem of type checking
>
> What is the problem with type checking here ?

You need to know the fields in the struct to do the type checking with
each of the other parameters.

> We could use a gcc builtin like the following to check if the alignment of
> "percpu_dd" meets the double-cas requirements:
>
> #define testmacro(a, b) \
>         __builtin_choose_expr(__alignof__(a) >= 2 * sizeof(unsigned long), \
>                               ((a).low) = (b), \    /* success */
>                               ((a).low) = (void) 0) /* compile-error */
>
> > or
> >
> > this_cpu_cmpxchg_double(percpu_dd, old_dd, new_dd)
> >
> > with the problem of 128 bit constants/structs passed by value.
>
> Yeah, I guess trying to deal with 128-bit value might be a bit tricky. But
> having something sane and with compile-time-checked alignment for the percpu_dd
> type is not to be looked over.

The existing implementation could be equipped to do a compile time check
for the proper alignment if the pointer passed is constant.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:07                       ` Christoph Lameter
@ 2011-01-21 17:50                         ` Mathieu Desnoyers
  2011-01-21 18:06                           ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 17:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, H. Peter Anvin, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

* Christoph Lameter (cl@linux.com) wrote:
> On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:
> 
> > I'm again probably missing something, but what is "clumsy" about defining a
> > structure like the following to ensure proper alignment of the target
> > pointer (instead of adding a runtime test) ?
> >
> > struct cmpxchg_double {
> > #if __BYTE_ORDER == __LITTLE_ENDIAN
> >         unsigned long low, high;
> > #else
> >         unsigned long high, low;
> > #endif
> > } __attribute__((packed, aligned(2 * sizeof(unsigned long))));
> >
> > (note: packed here along with "aligned" does _not_ generate ugly bytewise
> > read/write memory ops like "packed" alone. The use of "packed" is to let the
> > compiler down-align the structure to the value requested, instead of uselessly
> > aligning it on 32-byte if it chooses to.)
> >
> > The prototype could then look like:
> >
> > bool __this_cpu_generic_cmpxchg_double(pcp, oval_low, oval_high, nval_low, nval_high);
> >
> > With:
> >   struct cmpxchg_double *pcp
> 
> That does not conform to the parameter conventions in other this_cpu_ops.
> The first parameter is a variable because the notion of a pointer is
> problematic given that percpu operations use a segment prefix to relocate
> pointers.

So the first argument could be along the lines of:

struct cmpxchg_double pcp

then.

> You would be implicitly passing a 128 bit argument although the
> compiler may not need to generate code for that.

Sorry, I don't understand this last statement. Does it still apply if we pass
pcp as I just proposed ? (without the pointer, with a __builtin_choose_expr
check on __alignof__ of the pcp parameter)

> 
> > I think Christoph's point is that he wants to alias this with a pointer. Well,
> > this can be done cleanly with:
> >
> > union {
> >         struct cmpxchg_double casdbl;
> >         struct {
> >                 void *ptr;
> >                 unsigned long cpuid_tid;
> >         } t;
> > }
> 
> There is no need for aliases with the existing implementation.
>
> How will the macro check the parameters now?

Well, my last proposal to check __alignof__ within a __builtin_choose_expr
check wouldn't need this union actually, which would be much better I think.

> 
> > Thoughts ?
> 
> Could you actually try to write a patch instead running through points
> that we have discussed earlier?

I'm going back to the points that have been previously dismissed with a rather
large degree of handwaving. ;) I'm OK with looking into the API and providing
code snippets to show the basic implementation behind the ideas, but I
unfortunately don't have the bandwidth to stop everything I'm currently working
on and start working on double-cas patches. Sorry, I wish I could do more.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:50                         ` Mathieu Desnoyers
@ 2011-01-21 18:06                           ` Christoph Lameter
  2011-01-21 18:37                             ` Mathieu Desnoyers
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 18:06 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Tejun Heo, H. Peter Anvin, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:

> > > With:
> > >   struct cmpxchg_double *pcp
> >
> > That does not conform to the parameter conventions in other this_cpu_ops.
> > The first parameter is a variable because the notion of a pointer is
> > problematic given that percpu operations use a segment prefix to relocate
> > pointers.
>
> So the first argument could be along the lines of:
>
> struct cmpxchg_double pcp
>
> then.

Ok then you would pass a struct by value? Or use a non-scalar as a
variable passed to a this_cpu_op? So far per cpu scalars have been the
only variables allowed to be specified in this_cpu operations.

> > >         struct cmpxchg_double casdbl;
> > >         struct {
> > >                 void *ptr;
> > >                 unsigned long cpuid_tid;
> > >         } t;
> > > }
> >
> > There is no need for aliases with the existing implementation.
> >
> > How will the macro check the parameters now?
>
> Well, my last proposal to check __alignof__ within a __builtin_choose_expr
> check wouldn't need this union actually, which would be much better I think.

The existing implementation has a check for alignment. That is not the
problem. The typechecking would need to be addressed. I.e. if I pass a
pointer for old and an ulong for the new value then I'd like to see the
compiler complain. Or if the first parameter is a long but the type of the
first word is a pointer etc etc.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:50                               ` Christoph Lameter
@ 2011-01-21 18:10                                 ` Mathieu Desnoyers
  2011-01-21 18:42                                   ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 18:10 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, Tejun Heo, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

* Christoph Lameter (cl@linux.com) wrote:
> On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:
> 
> > > this_cpu_cmpxchg_double(percpu_dd, oldword1, oldword2, newword1, newword2)
> > >
> > > with the problem of type checking
> >
> > What is the problem with type checking here ?
> 
> You need to know the fields in the struct to do the type checking with
> each of the other parameters.

Isn't that a bit much to try to match the type of each oldword/newword
parameter to the structure fields ? Having separated word 1-2 parameter is just
an artefact caused by the inability of some gcc to deal with int128; were we to
use int128, we would have none of this type-checking whatsoever.

We could simply check that the first parameter alignment is >= 2 * sizeof(long)
and that its size == 2 * sizeof(long), so that the layout in memory fits the
cmpxchg_double requirements. This should work both for structure and array
parameters.

Now if the user needs to map "oldword1, oldword2" to the actual percpu_dd
fields, we could ensure that the order of these two parameters actually match
the structure field or array index order. This would, of course, be documented
above this_cpu_cmpxchg_double().

> 
> > We could use a gcc builtin like the following to check if the alignment of
> > "percpu_dd" meets the double-cas requirements:
> >
> > #define testmacro(a, b) \
> >         __builtin_choose_expr(__alignof__(a) >= 2 * sizeof(unsigned long), \
> >                               ((a).low) = (b), \    /* success */
> >                               ((a).low) = (void) 0) /* compile-error */
> >
> > > or
> > >
> > > this_cpu_cmpxchg_double(percpu_dd, old_dd, new_dd)
> > >
> > > with the problem of 128 bit constants/structs passed by value.
> >
> > Yeah, I guess trying to deal with 128-bit value might be a bit tricky. But
> > having something sane and with compile-time-checked alignment for the percpu_dd
> > type is not to be looked over.
> 
> The existing implementation could be equipped to do a compile time check
> for the proper alignment if the pointer passed is constant.

"if the pointer passed is constant" -> if you use the actual type of percpu_dd
to check the alignment, then you can do an alignment check at compile-time even
for a non-const parameter. The requirement imposed on typing will take care to
make sure that even a non-const pointer will have the proper alignment.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:42                             ` Mathieu Desnoyers
  2011-01-21 17:50                               ` Christoph Lameter
@ 2011-01-21 18:31                               ` H. Peter Anvin
  2011-01-21 18:46                                 ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-21 18:31 UTC (permalink / raw)
  To: Mathieu Desnoyers, Christoph Lameter
  Cc: Tejun Heo, Pekka Enberg, akpm, linux-kernel, Eric Dumazet

What is the problem with passing 128-bit structures by value?  I'm quite sure *that* is okay.

"Mathieu Desnoyers" <mathieu.desnoyers@efficios.com> wrote:

>* Christoph Lameter (cl@linux.com) wrote:
>> On Fri, 21 Jan 2011, H. Peter Anvin wrote:
>> 
>> > We could do cmpxchg with a structure... the problem with a lon int
>type is that Cristoph ran into bugs with __int128 on 64 bits.
>> 
>> We also would need to be pass the structure by value (well its really
>a
>> variable but its like passing by value) in order to be similar to the
>> other this_cpu_ops
>> 
>> You'd want either
>> 
>> DEFINE_PERCPU(struct mycustomdoublestruct, percpu_dd)
>> 
>> this_cpu_cmpxchg_double(percpu_dd, oldword1, oldword2, newword1,
>newword2)
>> 
>> with the problem of type checking
>
>What is the problem with type checking here ?
>
>We could use a gcc builtin like the following to check if the alignment
>of
>"percpu_dd" meets the double-cas requirements:
>
>#define testmacro(a, b) \
>   __builtin_choose_expr(__alignof__(a) >= 2 * sizeof(unsigned long), \
>                              ((a).low) = (b), \    /* success */
>                              ((a).low) = (void) 0) /* compile-error */
>
>> or
>> 
>> this_cpu_cmpxchg_double(percpu_dd, old_dd, new_dd)
>> 
>> with the problem of 128 bit constants/structs passed by value.
>
>Yeah, I guess trying to deal with 128-bit value might be a bit tricky.
>But
>having something sane and with compile-time-checked alignment for the
>percpu_dd
>type is not to be looked over.
>
>Thanks,
>
>Mathieu
>
>-- 
>Mathieu Desnoyers
>Operating System Efficiency R&D Consultant
>EfficiOS Inc.
>http://www.efficios.com

-- 
Sent from my mobile phone.  Please pardon any lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 18:06                           ` Christoph Lameter
@ 2011-01-21 18:37                             ` Mathieu Desnoyers
  0 siblings, 0 replies; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 18:37 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, H. Peter Anvin, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

* Christoph Lameter (cl@linux.com) wrote:
> On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:
> 
> > > > With:
> > > >   struct cmpxchg_double *pcp
> > >
> > > That does not conform to the parameter conventions in other this_cpu_ops.
> > > The first parameter is a variable because the notion of a pointer is
> > > problematic given that percpu operations use a segment prefix to relocate
> > > pointers.
> >
> > So the first argument could be along the lines of:
> >
> > struct cmpxchg_double pcp
> >
> > then.
> 
> Ok then you would pass a struct by value? Or use a non-scalar as a
> variable passed to a this_cpu_op? So far per cpu scalars have been the
> only variables allowed to be specified in this_cpu operations.

What I have in mind is that the struct passed would be non-scalar for this
specific operation. I'm not sure about the distinction between "pass a struct by
value" and "use a non-scalar as a variable passed to a this_cpu_op" -- I feel
I'm missing an important detail in what you say, because I see these as being
the same thing.

> 
> > > >         struct cmpxchg_double casdbl;
> > > >         struct {
> > > >                 void *ptr;
> > > >                 unsigned long cpuid_tid;
> > > >         } t;
> > > > }
> > >
> > > There is no need for aliases with the existing implementation.
> > >
> > > How will the macro check the parameters now?
> >
> > Well, my last proposal to check __alignof__ within a __builtin_choose_expr
> > check wouldn't need this union actually, which would be much better I think.
> 
> The existing implementation has a check for alignment. That is not the
> problem.

It's a dynamic check right ? (based on VM_BUG_ON() if I remember well) It adds
code and runtime conditions, which would go away if we let the alignment check
be done at compile-time.

> The typechecking would need to be addressed. I.e. if I pass a
> pointer for old and an ulong for the new value then I'd like to see the
> compiler complain. Or if the first parameter is a long but the type of the
> first word is a pointer etc etc.

Hrm. Then the only solution I see would be to require that the structure
used as percpu_dd parameter have fixed field names (yeah, that's a bit odd, but
could not come up with a more elegant solution at the moment):

struct mycustomdoublestruct {
        sometype word1;
        someothertype word2;
}

So we can access percpu_dd.word1 and percpu_dd.word2 within
this_cpu_cmpxchg_double for the type checking.

Mathieu


-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 18:10                                 ` Mathieu Desnoyers
@ 2011-01-21 18:42                                   ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 18:42 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: H. Peter Anvin, Tejun Heo, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, Mathieu Desnoyers wrote:

> "if the pointer passed is constant" -> if you use the actual type of percpu_dd
> to check the alignment, then you can do an alignment check at compile-time even
> for a non-const parameter. The requirement imposed on typing will take care to
> make sure that even a non-const pointer will have the proper alignment.

True that would be a benefit if we used something like percpu_dd. If
somehow we can marry that to a parameter list that only includes single
word entities then we would have the best of both.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 18:31                               ` H. Peter Anvin
@ 2011-01-21 18:46                                 ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-01-21 18:46 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Mathieu Desnoyers, Tejun Heo, Pekka Enberg, akpm, linux-kernel,
	Eric Dumazet

On Fri, 21 Jan 2011, H. Peter Anvin wrote:

> What is the problem with passing 128-bit structures by value?  I'm quite sure *that* is okay.

You mentioned that earlier and I tried to pass 128 bit structures. My
compiler did not do any of that (gcc 4.4.5). Could you come up with
some example code to make yourself certain?



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:08                       ` Tejun Heo
  2011-01-21 17:13                         ` H. Peter Anvin
@ 2011-01-21 19:32                         ` Mathieu Desnoyers
  2011-01-23 18:00                           ` H. Peter Anvin
  1 sibling, 1 reply; 44+ messages in thread
From: Mathieu Desnoyers @ 2011-01-21 19:32 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Pekka Enberg, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

* Tejun Heo (tj@kernel.org) wrote:
[...]
> > (note: packed here along with "aligned" does _not_ generate ugly
> > bytewise read/write memory ops like "packed" alone. The use of
> > "packed" is to let the compiler down-align the structure to the
> > value requested, instead of uselessly aligning it on 32-byte if it
> > chooses to.)
> 
> Yeah, good point.  :-)

For the records, I just noticed that "packed, aligned(8)" can generate unaligned
accesses on sparc64 by removing the padding between a "int" and a following
pointer. So we should not use it.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 19:32                         ` Mathieu Desnoyers
@ 2011-01-23 18:00                           ` H. Peter Anvin
  0 siblings, 0 replies; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-23 18:00 UTC (permalink / raw)
  To: Mathieu Desnoyers, Tejun Heo
  Cc: Pekka Enberg, Christoph Lameter, akpm, linux-kernel, Eric Dumazet

Don't apply "packed" to a structure which needs padding!  That's just user error.

"Mathieu Desnoyers" <mathieu.desnoyers@efficios.com> wrote:

>* Tejun Heo (tj@kernel.org) wrote:
>[...]
>> > (note: packed here along with "aligned" does _not_ generate ugly
>> > bytewise read/write memory ops like "packed" alone. The use of
>> > "packed" is to let the compiler down-align the structure to the
>> > value requested, instead of uselessly aligning it on 32-byte if it
>> > chooses to.)
>> 
>> Yeah, good point.  :-)
>
>For the record, I just noticed that "packed, aligned(8)" can generate
>unaligned
>accesses on sparc64 by removing the padding between an "int" and a
>following
>pointer. So we should not use it.
>
>Thanks,
>
>Mathieu
>
>-- 
>Mathieu Desnoyers
>Operating System Efficiency R&D Consultant
>EfficiOS Inc.
>http://www.efficios.com

-- 
Sent from my mobile phone.  Please pardon any lack of formatting.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-21 17:19                           ` Tejun Heo
@ 2011-01-24  6:01                             ` H. Peter Anvin
  2011-02-25 13:09                               ` Pekka Enberg
  0 siblings, 1 reply; 44+ messages in thread
From: H. Peter Anvin @ 2011-01-24  6:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Mathieu Desnoyers, Pekka Enberg, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

On 01/21/2011 09:19 AM, Tejun Heo wrote:
> On Fri, Jan 21, 2011 at 09:13:41AM -0800, H. Peter Anvin wrote:
>> We could do cmpxchg with a structure... the problem with a long int
>> type is that Christoph ran into bugs with __int128 on 64 bits.
>
> But, IIRC, the problem with int128 was with passing it as parameter
> and return value.  We don't have to do that.  We'll be just using it
> as a data storage / container type.  Or even that is broken?

Well, part of the point was to pass in registers.

No idea on the data storage type.

	-hpa

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-01-24  6:01                             ` H. Peter Anvin
@ 2011-02-25 13:09                               ` Pekka Enberg
  2011-02-25 13:19                                 ` Tejun Heo
  0 siblings, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2011-02-25 13:09 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, Mathieu Desnoyers, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

On Fri, Jan 21, 2011 at 09:13:41AM -0800, H. Peter Anvin wrote:
>>> We could do cmpxchg with a structure... the problem with a long int
>>> type is that Christoph ran into bugs with __int128 on 64 bits.

On 01/21/2011 09:19 AM, Tejun Heo wrote:
>> But, IIRC, the problem with int128 was with passing it as parameter
>> and return value. We don't have to do that. We'll be just using it
>> as a data storage / container type. Or even that is broken?

On 1/24/11 8:01 AM, H. Peter Anvin wrote:
> Well, part of the point was to pass in registers.
>
> No idea on the data storage type.

Ping? The current situation is that we're unable to merge a perfectly 
good SLUB performance optimization because we can't seem to agree on the 
this_cpu_cmpxchg_double() API.

			Pekka

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 13:09                               ` Pekka Enberg
@ 2011-02-25 13:19                                 ` Tejun Heo
  2011-02-25 16:26                                   ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Tejun Heo @ 2011-02-25 13:19 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: H. Peter Anvin, Mathieu Desnoyers, Christoph Lameter, akpm,
	linux-kernel, Eric Dumazet

On Fri, Feb 25, 2011 at 03:09:26PM +0200, Pekka Enberg wrote:
> On Fri, Jan 21, 2011 at 09:13:41AM -0800, H. Peter Anvin wrote:
> >>>We could do cmpxchg with a structure... the problem with a long int
> >>>type is that Christoph ran into bugs with __int128 on 64 bits.
> 
> On 01/21/2011 09:19 AM, Tejun Heo wrote:
> >>But, IIRC, the problem with int128 was with passing it as parameter
> >>and return value. We don't have to do that. We'll be just using it
> >>as a data storage / container type. Or even that is broken?
> 
> On 1/24/11 8:01 AM, H. Peter Anvin wrote:
> >Well, part of the point was to pass in registers.
> >
> >No idea on the data storage type.
> 
> Ping? The current situation is that we're unable to merge a
> perfectly good SLUB performance optimization because we can't seem
> to agree on the this_cpu_cmpxchg_double() API.

I thought cl was preparing new version of the patchset.  Christoph?

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 13:19                                 ` Tejun Heo
@ 2011-02-25 16:26                                   ` Christoph Lameter
  2011-02-25 16:37                                     ` Tejun Heo
  2011-02-25 16:38                                     ` Eric Dumazet
  0 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-02-25 16:26 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pekka Enberg, H. Peter Anvin, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

On Fri, 25 Feb 2011, Tejun Heo wrote:

> > Ping? The current situation is that we're unable to merge a
> > perfectly good SLUB performance optimization because we can't seem
> > to agree on the this_cpu_cmpxchg_double() API.
>
> I thought cl was preparing new version of the patchset.  Christoph?

In order to do that I would have to see a way to improve the
patches. None of the suggestions so far seem to be realizable.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 16:26                                   ` Christoph Lameter
@ 2011-02-25 16:37                                     ` Tejun Heo
  2011-02-25 16:43                                       ` Christoph Lameter
  2011-02-25 16:38                                     ` Eric Dumazet
  1 sibling, 1 reply; 44+ messages in thread
From: Tejun Heo @ 2011-02-25 16:37 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, H. Peter Anvin, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

On Fri, Feb 25, 2011 at 10:26:43AM -0600, Christoph Lameter wrote:
> On Fri, 25 Feb 2011, Tejun Heo wrote:
> 
> > > Ping? The current situation is that we're unable to merge a
> > > perfectly good SLUB performance optimization because we can't seem
> > > to agree on the this_cpu_cmpxchg_double() API.
> >
> > I thought cl was preparing new version of the patchset.  Christoph?
> 
> In order to do that I would have to see a way to improve the
> patches. None of the suggestions so far seem to be realizable.

I see.  For some reason, I thought you and hpa agreed on something and
you were writing up the patches.  I'll read the thread again.  Let's
just pick one.  It's not like this is an API which will be used
throughout the tree.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 16:26                                   ` Christoph Lameter
  2011-02-25 16:37                                     ` Tejun Heo
@ 2011-02-25 16:38                                     ` Eric Dumazet
  2011-02-25 16:45                                       ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Eric Dumazet @ 2011-02-25 16:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, Pekka Enberg, H. Peter Anvin, Mathieu Desnoyers, akpm,
	linux-kernel

Le vendredi 25 février 2011 à 10:26 -0600, Christoph Lameter a écrit :
> On Fri, 25 Feb 2011, Tejun Heo wrote:
> 
> > > Ping? The current situation is that we're unable to merge a
> > > perfectly good SLUB performance optimization because we can't seem
> > > to agree on the this_cpu_cmpxchg_double() API.
> >
> > I thought cl was preparing new version of the patchset.  Christoph?
> 
> In order to do that I would have to see a way to improve the
> patches. None of the suggestions so far seem to be realizable.
> 
> 

Just respin your patches then.

I suspect we'll have so few users of this API, that we can easily change
it later if really someone brings something better.

"Le mieux est l'ennemi du bien." (The perfect is the enemy of the good).



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 16:37                                     ` Tejun Heo
@ 2011-02-25 16:43                                       ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-02-25 16:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pekka Enberg, H. Peter Anvin, Mathieu Desnoyers, akpm,
	linux-kernel, Eric Dumazet

On Fri, 25 Feb 2011, Tejun Heo wrote:

> On Fri, Feb 25, 2011 at 10:26:43AM -0600, Christoph Lameter wrote:
> > On Fri, 25 Feb 2011, Tejun Heo wrote:
> >
> > > > Ping? The current situation is that we're unable to merge a
> > > > perfectly good SLUB performance optimization because we can't seem
> > > > to agree on the this_cpu_cmpxchg_double() API.
> > >
> > > I thought cl was preparing new version of the patchset.  Christoph?
> >
> > In order to do that I would have to see a way to improve the
> > patches. None of the suggestions so far seem to be realizable.
>
> I see.  For some reason, I thought you and hpa agreed on something and
> you were writing up the patches.  I'll read the thread again.  Let's
> just pick one.  It's not like this is an API which will be used
> throughout the tree.

We were discussing some hypothetical changes but as far as I can see all
those scenarios lead to more issues.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double
  2011-02-25 16:38                                     ` Eric Dumazet
@ 2011-02-25 16:45                                       ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2011-02-25 16:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tejun Heo, Pekka Enberg, H. Peter Anvin, Mathieu Desnoyers, akpm,
	linux-kernel

On Fri, 25 Feb 2011, Eric Dumazet wrote:

> Just respin your patches then.

Ok. Will do that now.


^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2011-02-25 16:45 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-06 20:45 [cpuops cmpxchg double V2 0/4] this_cpu_cmpxchg_double support Christoph Lameter
2011-01-06 20:45 ` [cpuops cmpxchg double V2 1/4] Generic support for this_cpu_cmpxchg_double Christoph Lameter
2011-01-06 21:08   ` Mathieu Desnoyers
2011-01-06 21:43     ` Christoph Lameter
2011-01-06 22:05   ` H. Peter Anvin
2011-01-07 15:15     ` Christoph Lameter
2011-01-07 18:04       ` Mathieu Desnoyers
2011-01-07 18:41         ` Christoph Lameter
2011-01-08 17:24           ` Tejun Heo
2011-01-09  8:33             ` Pekka Enberg
2011-01-21  7:31             ` Pekka Enberg
2011-01-21  9:26               ` Tejun Heo
2011-01-21 15:31                 ` H. Peter Anvin
2011-01-21 15:48                   ` Tejun Heo
2011-01-21 16:30                     ` H. Peter Anvin
2011-01-21 16:34                       ` Tejun Heo
2011-01-21 16:54                     ` Mathieu Desnoyers
2011-01-21 17:07                       ` Christoph Lameter
2011-01-21 17:50                         ` Mathieu Desnoyers
2011-01-21 18:06                           ` Christoph Lameter
2011-01-21 18:37                             ` Mathieu Desnoyers
2011-01-21 17:08                       ` Tejun Heo
2011-01-21 17:13                         ` H. Peter Anvin
2011-01-21 17:19                           ` Tejun Heo
2011-01-24  6:01                             ` H. Peter Anvin
2011-02-25 13:09                               ` Pekka Enberg
2011-02-25 13:19                                 ` Tejun Heo
2011-02-25 16:26                                   ` Christoph Lameter
2011-02-25 16:37                                     ` Tejun Heo
2011-02-25 16:43                                       ` Christoph Lameter
2011-02-25 16:38                                     ` Eric Dumazet
2011-02-25 16:45                                       ` Christoph Lameter
2011-01-21 17:24                           ` Christoph Lameter
2011-01-21 17:42                             ` Mathieu Desnoyers
2011-01-21 17:50                               ` Christoph Lameter
2011-01-21 18:10                                 ` Mathieu Desnoyers
2011-01-21 18:42                                   ` Christoph Lameter
2011-01-21 18:31                               ` H. Peter Anvin
2011-01-21 18:46                                 ` Christoph Lameter
2011-01-21 19:32                         ` Mathieu Desnoyers
2011-01-23 18:00                           ` H. Peter Anvin
2011-01-06 20:45 ` [cpuops cmpxchg double V2 2/4] x86: this_cpu_cmpxchg_double() support Christoph Lameter
2011-01-06 20:45 ` [cpuops cmpxchg double V2 3/4] slub: Get rid of slab_free_hook_irq() Christoph Lameter
2011-01-06 20:45 ` [cpuops cmpxchg double V2 4/4] Lockless (and preemptless) fastpaths for slub Christoph Lameter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.