* [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations
@ 2010-12-08 17:55 Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 1/4] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 17:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

Add cmpxchg and xchg operations to the cpu ops and use them for irq handling
and for vm statistics.



* [cpuops cmpxchg V1 1/4] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support
  2010-12-08 17:55 [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations Christoph Lameter
@ 2010-12-08 17:55 ` Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 17:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_generic --]
[-- Type: text/plain, Size: 6174 bytes --]

Generic code to provide new per cpu atomic features

	this_cpu_cmpxchg
	this_cpu_xchg

The generic fallbacks disable preemption (or interrupts, for the irqsafe
variants) to ensure correct per cpu atomicity.

Falling back to regular cmpxchg and xchg is not possible since per cpu
atomic semantics include the guarantee that the current cpu's per cpu data
is accessed. Using regular cmpxchg and xchg would require determining the
address of the per cpu data first, and that address calculation cannot be
made atomic with the xchg or cmpxchg itself.
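
For illustration only, a minimal sketch of how a caller might use the new
operations on a hypothetical per cpu counter (the variable, function names
and values below are made up and not part of this patch):

	#include <linux/percpu.h>
	#include <linux/printk.h>

	static DEFINE_PER_CPU(int, my_counter);	/* hypothetical */

	static void bump_if_zero(void)
	{
		/* Atomically (vs. preemption) replace 0 with 1 on this cpu */
		if (this_cpu_cmpxchg(my_counter, 0, 1) == 0)
			pr_debug("counter was zero on this cpu\n");
	}

	static int drain_counter(void)
	{
		/* Read and reset the current cpu's counter in one step */
		return this_cpu_xchg(my_counter, 0);
	}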

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |  130 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 129 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-12-06 12:21:16.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-12-06 13:45:01.000000000 -0600
@@ -322,6 +322,106 @@ do {									\
 # define this_cpu_read(pcp)	__pcpu_size_call_return(this_cpu_read_, (pcp))
 #endif
 
+#define __this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	ret__;								\
+})
+
+#ifndef __this_cpu_xchg
+# ifndef __this_cpu_xchg_1
+#  define __this_cpu_xchg_1(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_2
+#  define __this_cpu_xchg_2(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_4
+#  define __this_cpu_xchg_4(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_8
+#  define __this_cpu_xchg_8(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# define __this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_xchg
+# ifndef this_cpu_xchg_1
+#  define this_cpu_xchg_1(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_2
+#  define this_cpu_xchg_2(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_4
+#  define this_cpu_xchg_4(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_8
+#  define this_cpu_xchg_8(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg
+# ifndef this_cpu_cmpxchg_1
+#  define this_cpu_cmpxchg_1(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_2
+#  define this_cpu_cmpxchg_2(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_4
+#  define this_cpu_cmpxchg_4(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_8
+#  define this_cpu_cmpxchg_8(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+#define __this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	ret__;								\
+})
+
+#ifndef __this_cpu_cmpxchg
+# ifndef __this_cpu_cmpxchg_1
+#  define __this_cpu_cmpxchg_1(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_2
+#  define __this_cpu_cmpxchg_2(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_4
+#  define __this_cpu_cmpxchg_4(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_8
+#  define __this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define __this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(\
+				__this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -610,7 +710,7 @@ do {									\
  * IRQ safe versions of the per cpu RMW operations. Note that these operations
  * are *not* safe against modification of the same variable from another
  * processors (which one gets when using regular atomic operations)
- . They are guaranteed to be atomic vs. local interrupts and
+ * They are guaranteed to be atomic vs. local interrupts and
  * preemption only.
  */
 #define irqsafe_cpu_generic_to_op(pcp, val, op)				\
@@ -697,4 +797,32 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif
 
+#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg
+# ifndef irqsafe_cpu_cmpxchg_1
+#  define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_2
+#  define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_4
+#  define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_8
+#  define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define irqsafe_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
 #endif /* __LINUX_PERCPU_H */



* [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-08 17:55 [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 1/4] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
@ 2010-12-08 17:55 ` Christoph Lameter
  2010-12-08 18:08   ` Christoph Lameter
  2010-12-08 22:20   ` Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 3/4] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 17:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_x86 --]
[-- Type: text/plain, Size: 7011 bytes --]

Provide support as far as the hardware capabilities of the x86 cpus
allow.

Define CONFIG_CMPXCHG_LOCAL in Kconfig.cpu to allow core code to test for
fast cpuops implementations.
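
For illustration, a sketch of the kind of conditional use this enables in
core code (the per cpu variable and function names are made up; patch 4/4
applies this pattern to vmstat):

	#include <linux/percpu.h>
	#include <linux/irqflags.h>

	static DEFINE_PER_CPU(long, example_counter);	/* hypothetical */

	#ifdef CONFIG_CMPXCHG_LOCAL
	/* Fast path: the arch provides a cheap this_cpu_cmpxchg */
	static void example_inc(void)
	{
		long old;

		do {
			old = this_cpu_read(example_counter);
		} while (this_cpu_cmpxchg(example_counter, old, old + 1) != old);
	}
	#else
	/* Fallback: serialize against interrupts instead */
	static void example_inc(void)
	{
		unsigned long flags;

		local_irq_save(flags);
		__this_cpu_inc(example_counter);
		local_irq_restore(flags);
	}
	#endif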

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/Kconfig.cpu          |    3 
 arch/x86/include/asm/percpu.h |  129 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 131 insertions(+), 1 deletion(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-08 11:33:48.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-08 11:34:44.000000000 -0600
@@ -212,6 +212,83 @@ do {									\
 	ret__;								\
 })
 
+/*
+ * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * full lock semantics even though they are not needed.
+ */
+#define percpu_xchg_op(var, nval)					\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __old = (oval);					\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -335,6 +412,16 @@ do {									\
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
 
+#define __this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
@@ -342,7 +429,39 @@ do {									\
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_4(pcp, val)		percpu_add_return_op(pcp, val)
-#endif
+
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
+#ifndef CONFIG_X86_64
+#ifdef CONFIG_X86_CMPXCHG64
+/* We can support a 8 byte cmpxchg with a special instruction on 32 bit */
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)				\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __old = (oval);					\
+	typeof(var) __new = (nval);					\
+	asm("cmpxchg8b %2, "__percpu_arg(1)				\
+	    : "=A" (__ret), "+m" (&pcp)					\
+	    : "b" (((u32)new), "c" ((u32)(new >> 32)),  "0" (__old)	\
+	    : "memory");						\
+	__ret;								\
+})
+
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+
+#endif /* CONFIG_X86_CMPXCHG64 */
+#endif /* !CONFIG_X86_64 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -370,6 +489,14 @@ do {									\
 #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/Kconfig.cpu
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig.cpu	2010-12-08 11:33:48.000000000 -0600
+++ linux-2.6/arch/x86/Kconfig.cpu	2010-12-08 11:33:53.000000000 -0600
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC



* [cpuops cmpxchg V1 3/4] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-08 17:55 [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 1/4] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
@ 2010-12-08 17:55 ` Christoph Lameter
  2010-12-08 17:55 ` [cpuops cmpxchg V1 4/4] vmstat: Use per cpu atomics to avoid interrupt disable / enable Christoph Lameter
  2010-12-08 22:22 ` cpuops cmpxchg: Provide 64 bit this_cpu_xx for 32 bit x86 using cmpxchg8b Christoph Lameter
  4 siblings, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 17:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Peter Zijlstra, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_irq --]
[-- Type: text/plain, Size: 2112 bytes --]

The irq work queue is a per cpu object and it is sufficient for
synchronization if per cpu atomics are used. Doing so simplifies the code
and reduces its overhead.

Before:

christoph@linux-2.6$ size kernel/irq_work.o
   text	   data	    bss	    dec	    hex	filename
    451	      8	      1	    460	    1cc	kernel/irq_work.o

After:

christoph@linux-2.6$ size kernel/irq_work.o 
   text	   data	    bss	    dec	    hex	filename
    438	      8	      1	    447	    1bf	kernel/irq_work.o

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Christoph Lameter <cl@linux.com>

---
 kernel/irq_work.c |   18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

Index: linux-2.6/kernel/irq_work.c
===================================================================
--- linux-2.6.orig/kernel/irq_work.c	2010-12-07 10:24:30.000000000 -0600
+++ linux-2.6/kernel/irq_work.c	2010-12-07 10:26:45.000000000 -0600
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
  */
 static void __irq_work_queue(struct irq_work *entry)
 {
-	struct irq_work **head, *next;
+	struct irq_work *next;
 
-	head = &get_cpu_var(irq_work_list);
+	preempt_disable();
 
 	do {
-		next = *head;
+		next = __this_cpu_read(irq_work_list);
 		/* Can assign non-atomic because we keep the flags set. */
 		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(head, next, entry) != next);
+	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
 
 	/* The list was empty, raise self-interrupt to start processing. */
 	if (!irq_work_next(entry))
 		arch_irq_work_raise();
 
-	put_cpu_var(irq_work_list);
+	preempt_enable();
 }
 
 /*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list, **head;
+	struct irq_work *list;
 
-	head = &__get_cpu_var(irq_work_list);
-	if (*head == NULL)
+	if (this_cpu_read(irq_work_list) == NULL)
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = xchg(head, NULL);
+	list = this_cpu_xchg(irq_work_list, NULL);
+
 	while (list != NULL) {
 		struct irq_work *entry = list;
 



* [cpuops cmpxchg V1 4/4] vmstat: Use per cpu atomics to avoid interrupt disable / enable
  2010-12-08 17:55 [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations Christoph Lameter
                   ` (2 preceding siblings ...)
  2010-12-08 17:55 ` [cpuops cmpxchg V1 3/4] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
@ 2010-12-08 17:55 ` Christoph Lameter
  2010-12-08 22:22 ` cpuops cmpxchg: Provide 64 bit this_cpu_xx for 32 bit x86 using cmpxchg8b Christoph Lameter
  4 siblings, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 17:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_vmstat --]
[-- Type: text/plain, Size: 4660 bytes --]

Currently the operations to increment vm counters must disable interrupts
in order to not corrupt the per cpu counter housekeeping.

So use this_cpu_cmpxchg() to avoid the overhead. Since we can no longer
count on preemption being disabled we still have some minor issues:
the fetching of the counter thresholds is racy. A threshold from another
cpu may be applied if we happen to be rescheduled on another cpu.
However, the following vmstat operation will then bring the counter
back under the threshold limit.

The operations for __xxx_zone_state are not changed since the caller
has taken care of the synchronization needs (and therefore the cycle
count is even less than the optimized version for the irq disable case
provided here).

The optimization using this_cpu_cmpxchg is only used if the arch supports
efficient this_cpu ops (CONFIG_CMPXCHG_LOCAL must be set).

The use of this_cpu_cmpxchg reduces the cycle count for the counter
operations by about 80% (inc_zone_page_state goes from 170 cycles to 32).
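
To make the counter folding in mod_state() below concrete, here is a worked
example with made up numbers (overstep_mode = 1, i.e. overstep by half the
threshold):

	t  = this_cpu_read(pcp->stat_threshold) = 32
	o  = this_cpu_read(*p)                  = 31
	n  = delta + o                          = 4 + 31 = 35  (exceeds t)
	os = overstep_mode * (t >> 1)           = 16
	z  = n + os                             = 51  -> folded into the zone counter
	n  = -os                                = -16 -> new per cpu differential

The logical value (zone counter + per cpu differential) goes from Z + 31 to
(Z + 51) - 16 = Z + 35, i.e. it changed by exactly delta = 4.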

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/vmstat.c |  101 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 14 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-12-01 10:04:19.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-12-01 10:09:06.000000000 -0600
@@ -185,20 +185,6 @@ void __mod_zone_page_state(struct zone *
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-					int delta)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_zone_page_state(zone, item, delta);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
@@ -265,6 +251,92 @@ void __dec_zone_page_state(struct page *
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+ */
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1);
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -295,6 +367,7 @@ void dec_zone_page_state(struct page *pa
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif
 
 /*
  * Update the zone counters for one cpu.



* Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-08 17:55 ` [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
@ 2010-12-08 18:08   ` Christoph Lameter
  2010-12-08 18:17     ` Mathieu Desnoyers
  2010-12-08 22:20   ` Christoph Lameter
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 18:08 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

Alternate approach: Could also use cmpxchg for xchg..


Subject: cpuops: Use cmpxchg for xchg to avoid lock semantics

Cmpxchg has a lower cycle count due to the implied lock semantics of xchg.

Simulate the xchg through cmpxchg for the cpu ops.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   68 +++++++-----------------------------------
 1 file changed, 12 insertions(+), 56 deletions(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-08 11:43:50.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-08 12:00:21.000000000 -0600
@@ -212,48 +212,6 @@ do {									\
 	ret__;								\
 })

-/*
- * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
- * full lock semantics even though they are not needed.
- */
-#define percpu_xchg_op(var, nval)					\
-({									\
-	typeof(var) __ret;						\
-	typeof(var) __new = (nval);					\
-	switch (sizeof(var)) {						\
-	case 1:								\
-		asm("xchgb %2, "__percpu_arg(1)			\
-			    : "=a" (__ret), "+m" (var)			\
-			    : "q" (__new)				\
-			    : "memory");				\
-		break;							\
-	case 2:								\
-		asm("xchgw %2, "__percpu_arg(1)			\
-			    : "=a" (__ret), "+m" (var)			\
-			    : "r" (__new)				\
-			    : "memory");				\
-		break;							\
-	case 4:								\
-		asm("xchgl %2, "__percpu_arg(1)			\
-			    : "=a" (__ret), "+m" (var)			\
-			    : "r" (__new)				\
-			    : "memory");				\
-		break;							\
-	case 8:								\
-		asm("xchgq %2, "__percpu_arg(1)			\
-			    : "=a" (__ret), "+m" (var)			\
-			    : "r" (__new)				\
-			    : "memory");				\
-		break;							\
-	default: __bad_percpu_size();					\
-	}								\
-	__ret;								\
-})
-
-/*
- * cmpxchg has no such implied lock semantics as a result it is much
- * more efficient for cpu local operations.
- */
 #define percpu_cmpxchg_op(var, oval, nval)				\
 ({									\
 	typeof(var) __ret;						\
@@ -412,16 +370,6 @@ do {									\
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)

-#define __this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define __this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define __this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
-
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
@@ -489,16 +437,24 @@ do {									\
 #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)

-#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-
 #define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)

 #endif

+#define this_cpu_xchg(pcp, val) \
+({									\
+	typeof(val) __o;						\
+	do {								\
+	 	__o = __this_cpu_read(pcp);				\
+	} while (this_cpu_cmpxchg(pcp, __o, val) != __o);		\
+	__o;								\
+})
+
+#define __this_cpu_xchg this_cpu_xchg
+#define irqsafe_cpu_xchg this_cpu_xchg
+
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define x86_test_and_clear_bit_percpu(bit, var)				\
 ({									\



* Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-08 18:08   ` Christoph Lameter
@ 2010-12-08 18:17     ` Mathieu Desnoyers
  2010-12-09  6:26       ` H. Peter Anvin
  0 siblings, 1 reply; 11+ messages in thread
From: Mathieu Desnoyers @ 2010-12-08 18:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Arjan van de Ven, hpa

* Christoph Lameter (cl@linux.com) wrote:
> Alternate approach: Could also use cmpxchg for xchg..
> 
> 
> Subject: cpuops: Use cmpxchg for xchg to avoid lock semantics
> 
> Cmpxchg has a lower cycle count due to the implied lock semantics of xchg.
> 
> Simulate the xchg through cmpxchg for the cpu ops.

Hi Christoph,

Can you show if this provides savings in terms of:

- instruction cache footprint
- cycles required to run
- large-scale impact on the branch prediction buffers

Given that this targets per-cpu data only, the additional impact on cache-line
exchange traffic of using cmpxchg over xchg (cache-line not grabbed as exclusive
by the initial read) should not really matter.

I'm CCing Arjan and HPA, because they might have some interesting insight into
the performance impact of lock-prefixed xchg vs using local cmpxchg in a loop.

Thanks,

Mathieu


> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  arch/x86/include/asm/percpu.h |   68 +++++++-----------------------------------
>  1 file changed, 12 insertions(+), 56 deletions(-)
> 
> Index: linux-2.6/arch/x86/include/asm/percpu.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-08 11:43:50.000000000 -0600
> +++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-08 12:00:21.000000000 -0600
> @@ -212,48 +212,6 @@ do {									\
>  	ret__;								\
>  })
> 
> -/*
> - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> - * full lock semantics even though they are not needed.
> - */
> -#define percpu_xchg_op(var, nval)					\
> -({									\
> -	typeof(var) __ret;						\
> -	typeof(var) __new = (nval);					\
> -	switch (sizeof(var)) {						\
> -	case 1:								\
> -		asm("xchgb %2, "__percpu_arg(1)			\
> -			    : "=a" (__ret), "+m" (var)			\
> -			    : "q" (__new)				\
> -			    : "memory");				\
> -		break;							\
> -	case 2:								\
> -		asm("xchgw %2, "__percpu_arg(1)			\
> -			    : "=a" (__ret), "+m" (var)			\
> -			    : "r" (__new)				\
> -			    : "memory");				\
> -		break;							\
> -	case 4:								\
> -		asm("xchgl %2, "__percpu_arg(1)			\
> -			    : "=a" (__ret), "+m" (var)			\
> -			    : "r" (__new)				\
> -			    : "memory");				\
> -		break;							\
> -	case 8:								\
> -		asm("xchgq %2, "__percpu_arg(1)			\
> -			    : "=a" (__ret), "+m" (var)			\
> -			    : "r" (__new)				\
> -			    : "memory");				\
> -		break;							\
> -	default: __bad_percpu_size();					\
> -	}								\
> -	__ret;								\
> -})
> -
> -/*
> - * cmpxchg has no such implied lock semantics as a result it is much
> - * more efficient for cpu local operations.
> - */
>  #define percpu_cmpxchg_op(var, oval, nval)				\
>  ({									\
>  	typeof(var) __ret;						\
> @@ -412,16 +370,6 @@ do {									\
>  #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
>  #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
> 
> -#define __this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define __this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define __this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
> -
>  #ifndef CONFIG_M386
>  #define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
>  #define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
> @@ -489,16 +437,24 @@ do {									\
>  #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
>  #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
> 
> -#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
> -#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
> -
>  #define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
>  #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
>  #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
> 
>  #endif
> 
> +#define this_cpu_xchg(pcp, val) \
> +({									\
> +	typeof(val) __o;						\
> +	do {								\
> +	 	__o = __this_cpu_read(pcp);				\
> +	} while (this_cpu_cmpxchg(pcp, __o, val) != __o);		\
> +	__o;								\
> +})
> +
> +#define __this_cpu_xchg this_cpu_xchg
> +#define irqsafe_cpu_xchg this_cpu_xchg
> +
>  /* This is not atomic against other CPUs -- CPU preemption needs to be off */
>  #define x86_test_and_clear_bit_percpu(bit, var)				\
>  ({									\
> 

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com


* Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-08 17:55 ` [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
  2010-12-08 18:08   ` Christoph Lameter
@ 2010-12-08 22:20   ` Christoph Lameter
  1 sibling, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 22:20 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

Argh. I included __this_cpu_cmpxchg_8 done with cmpxchg8b for 32 bit. That
has some issues....


Subject: Fixup __this_cpu_cmpxchg_8

__this_cpu_cmpxchg_8 can cause compilation failures since it is expanded
for any this_cpu_cmpxchg use, independent of the size or type of the
argument. However, __this_cpu_cmpxchg_8 only generates code for the 8 byte
case. Use a union to get around the compilation failures.
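
For what it's worth, a standalone compile-only sketch of the type problem
the union sidesteps (the SPLIT_HI32 macro and types here are made up, plain
userspace C rather than the kernel macro): the 8 byte branch is expanded,
but never executed, even when the argument is e.g. a 4 byte pointer. A
pointer cannot be shifted directly, while a u64 view through a union can:

	#include <stdint.h>

	/* Hypothetical helper mirroring the union trick in the patch below. */
	#define SPLIT_HI32(val)						\
	({								\
		union { __typeof__(val) n; uint64_t m; } __u = { .n = (val) }; \
		(uint32_t)(__u.m >> 32);				\
	})

	struct foo;

	uint32_t hi_of_pointer(struct foo *p)
	{
		/* (uint32_t)(p >> 32) would not compile at all; this does.
		 * For a 4 byte type the result is meaningless, but such a
		 * branch is dead code and only has to get past the compiler. */
		return SPLIT_HI32(p);
	}

	uint32_t hi_of_u64(uint64_t v)
	{
		return SPLIT_HI32(v);	/* e.g. 0x12345678 for 0x12345678aabbccddULL */
	}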

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-08 14:48:02.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-08 16:01:24.000000000 -0600
@@ -391,15 +391,23 @@ do {									\

 #ifndef CONFIG_X86_64
 #ifdef CONFIG_X86_CMPXCHG64
-/* We can support a 8 byte cmpxchg with a special instruction on 32 bit */
+/*
+ * We can support an 8 byte cmpxchg with a special instruction on 32 bit.
+ *
+ * Note that some of the strangeness here with the __new variable is due to
+ * the need to expand and compile this macro for arbitrary types even
+ * though the code will not be included in the .o file if used for a smaller
+ * type.
+ */
 #define __this_cpu_cmpxchg_8(pcp, oval, nval)				\
 ({									\
-	typeof(var) __ret;						\
-	typeof(var) __old = (oval);					\
-	typeof(var) __new = (nval);					\
+	typeof(pcp) __ret;						\
+	typeof(pcp) __old = (oval);					\
+	union x { typeof(pcp) n;u64 m; } __new;				\
+	__new.n = (nval);						\
 	asm("cmpxchg8b %2, "__percpu_arg(1)				\
-	    : "=A" (__ret), "+m" (&pcp)					\
-	    : "b" (((u32)new), "c" ((u32)(new >> 32)),  "0" (__old)	\
+	    : "=A" (__ret), "+m" (pcp)					\
+	    : "b" ((u32)__new.m), "c" ((u32)(__new.m >> 32)),  "0" (__old)	\
 	    : "memory");						\
 	__ret;								\
 })



* cpuops cmpxchg: Provide 64 bit this_cpu_xx for 32 bit x86 using cmpxchg8b
  2010-12-08 17:55 [cpuops cmpxchg V1 0/4] Cmpxchg and xchg operations Christoph Lameter
                   ` (3 preceding siblings ...)
  2010-12-08 17:55 ` [cpuops cmpxchg V1 4/4] vmstat: Use per cpu atomics to avoid interrupt disable / enable Christoph Lameter
@ 2010-12-08 22:22 ` Christoph Lameter
  4 siblings, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-08 22:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

Another patch in the series. This may allow better 64 bit counter
management and avoid preempt disable/enable. Then again, maybe not, since
we add an additional branch and load.

Subject: cpuops: Provide 64 bit this_cpu_xx for 32 bit x86

The 64 bit this_cpu_cmpxchg can be used to create a set of 64 bit operations
so that 64 bit entities can also be handled by the this_cpu ops on 32 bit.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   64 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)


Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-08 16:02:39.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-08 16:12:48.000000000 -0600
@@ -405,7 +405,7 @@ do {									\
 	typeof(pcp) __old = (oval);					\
 	union x { typeof(pcp) n;u64 m; } __new;				\
 	__new.n = (nval);						\
-	asm("cmpxchg8b %2, "__percpu_arg(1)				\
+	asm("cmpxchg8b "__percpu_arg(1)					\
 	    : "=A" (__ret), "+m" (pcp)					\
 	    : "b" ((u32)__new.m), "c" ((u32)(__new.m >> 32)),  "0" (__old)	\
 	    : "memory");						\
@@ -415,6 +415,68 @@ do {									\
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
 #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)

+/*
+ * cmpxchg_8 can be used to create support for the 64 bit operations
+ * that are missing on 32 bit.
+ */
+
+#define this_cpu_write_8(pcp, val)					\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __v) != __o);		\
+})
+
+#define this_cpu_add_8(pcp, val)					\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __o + __v) != __o);	\
+})
+
+#define this_cpu_and_8(pcp, val)					\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __o & __v) != __o);	\
+})
+
+#define this_cpu_or_8(pcp, val)						\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __o | __v) != __o);		\
+})
+
+#define this_cpu_xor_8(pcp, val)					\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __o ^ __v) != __o);		\
+})
+
+#define this_cpu_inc_return_8(pcp, val)				\
+({									\
+	typeof(val) __v = (val);					\
+	typeof(val) __o;						\
+	typeof(val) __r;						\
+	do {								\
+		__o = this_cpu_read_8(pcp);				\
+		__r = __o  + __v;					\
+	} while (this_cpu_cmpxchg_8(pcp, __o, __r) != __o);		\
+	__r;								\
+})
+
 #endif /* CONFIG_X86_CMPXCHG64 */
 #endif /* !CONFIG_X86_64 */




* Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-08 18:17     ` Mathieu Desnoyers
@ 2010-12-09  6:26       ` H. Peter Anvin
  2010-12-09 23:40         ` Christoph Lameter
  0 siblings, 1 reply; 11+ messages in thread
From: H. Peter Anvin @ 2010-12-09  6:26 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Christoph Lameter, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, Arjan van de Ven

On 12/08/2010 10:17 AM, Mathieu Desnoyers wrote:
> 
> Hi Christoph,
> 
> Can you show if this provides savings in terms of:
> 
> - instruction cache footprint
> - cycles required to run
> - large-scale impact on the branch prediction buffers
> 
> Given that this targets per-cpu data only, the additional impact on cache-line
> exchange traffic of using cmpxchg over xchg (cache-line not grabbed as exclusive
> by the initial read) should not really matter.
> 
> I'm CCing Arjan and HPA, because they might have some interesting insight into
> the performance impact of lock-prefixed xchg vs using local cmpxchg in a loop.
> 

XCHG is always locked; it doesn't need the prefix.  Unfortunately,
unlike on the 8086 on modern processors locks have a real cost.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



* Re: [cpuops cmpxchg V1 2/4] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-09  6:26       ` H. Peter Anvin
@ 2010-12-09 23:40         ` Christoph Lameter
  0 siblings, 0 replies; 11+ messages in thread
From: Christoph Lameter @ 2010-12-09 23:40 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Mathieu Desnoyers, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, Arjan van de Ven

On Wed, 8 Dec 2010, H. Peter Anvin wrote:

> On 12/08/2010 10:17 AM, Mathieu Desnoyers wrote:
> >
> > Hi Christoph,
> >
> > Can you show if this provides savings in terms of:
> >
> > - instruction cache footprint
> > - cycles required to run
> > - large-scale impact on the branch prediction buffers
> >
> > Given that this targets per-cpu data only, the additional impact on cache-line
> > exchange traffic of using cmpxchg over xchg (cache-line not grabbed as exclusive
> > by the initial read) should not really matter.
> >
> > I'm CCing Arjan and HPA, because they might have some interesting insight into
> > the performance impact of lock-prefixed xchg vs using local cmpxchg in a loop.
> >
>
> XCHG is always locked; it doesn't need the prefix.  Unfortunately,
> unlike on the 8086 on modern processors locks have a real cost.

So should we use xchg, or a loop using prefixless cmpxchg instead, when
referring to per cpu data and requiring only per cpu atomicity?


