linux-kernel.vger.kernel.org archive mirror
* [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations
@ 2010-12-14 16:28 Christoph Lameter
  2010-12-14 16:28 ` [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
                   ` (4 more replies)
  0 siblings, 5 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

Add cmpxchg and xchg operations to the cpu ops and use them for irq work
handling and for the vm statistics counters.

Requires the per cpu patches that added the add_return cpu ops.
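
For readers new to these operations, a minimal usage sketch (editorial
illustration, not part of the series; the per cpu variable and helper
names below are made up):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned int, hits);	/* hypothetical counter */

	/* Atomically (vs. this cpu) take and clear the count. */
	static unsigned int take_hits(void)
	{
		return this_cpu_xchg(hits, 0);
	}

	/* Increment, saturating at limit, without disabling interrupts. */
	static void count_hit(unsigned int limit)
	{
		unsigned int o, n;

		do {
			o = this_cpu_read(hits);
			n = o < limit ? o + 1 : o;
		} while (this_cpu_cmpxchg(hits, o, n) != o);
	}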

V1->V2:
	- Determine that cmpxchg without LOCK is faster than xchg with
	 	implied lock (numbers are in the patch description).
	- Drop 64 bit cpuops operations on 32 bit x86 (resulted in various
		nasty things such as 64 bit alignment requirements for 64 bit
		entities on 32 bit, sizable inline code and very complex macros)



^ permalink raw reply	[flat|nested] 49+ messages in thread

* [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support
  2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
@ 2010-12-14 16:28 ` Christoph Lameter
  2010-12-17 14:55   ` Tejun Heo
  2010-12-14 16:28 ` [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_generic --]
[-- Type: text/plain, Size: 7164 bytes --]

Generic code to provide new per cpu atomic features

	this_cpu_cmpxchg
	this_cpu_xchg

The generic fallback functions disable preemption (or interrupts for the
irqsafe variants) to ensure correct per cpu atomicity.

Fallback to regular cmpxchg and xchg is not possible since the per cpu
atomic semantics guarantee that the current cpu's instance of the per cpu
data is accessed atomically. A regular cmpxchg or xchg would first require
determining the address of that per cpu data, and the address calculation
cannot be folded atomically into the xchg or cmpxchg without a segment
override.
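
For illustration (editorial sketch, not part of the patch), the difference
described above, using a hypothetical per cpu variable "counter":

	#include <linux/percpu.h>
	#include <asm/system.h>		/* cmpxchg() on kernels of this era */

	static DEFINE_PER_CPU(int, counter);	/* hypothetical */

	static void contrast(int oldval, int newval)
	{
		int *p;

		/*
		 * Not per cpu atomic: the task can be migrated between the
		 * address calculation and the cmpxchg, so the cmpxchg may
		 * end up operating on another cpu's instance of counter.
		 */
		p = this_cpu_ptr(&counter);
		cmpxchg(p, oldval, newval);

		/*
		 * Per cpu atomic: address selection and the cmpxchg form a
		 * single unit (a segment override on x86, or preempt/irq
		 * disabling in the generic fallback), so the current cpu's
		 * instance is always the one operated on.
		 */
		this_cpu_cmpxchg(counter, oldval, newval);
	}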

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |  142 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 135 insertions(+), 7 deletions(-)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-12-08 13:16:22.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-12-08 14:43:46.000000000 -0600
@@ -242,21 +242,21 @@ extern void __bad_size_call_parameter(vo
 
 #define __pcpu_size_call_return2(stem, pcp, ...)			\
 ({									\
-	typeof(pcp) ret__;						\
+	typeof(pcp) pscr2_ret__;					\
 	__verify_pcpu_ptr(&(pcp));					\
 	switch(sizeof(pcp)) {						\
-	case 1: ret__ = stem##1(pcp, __VA_ARGS__);			\
+	case 1: pscr2_ret__ = stem##1(pcp, __VA_ARGS__);		\
 		break;							\
-	case 2: ret__ = stem##2(pcp, __VA_ARGS__);			\
+	case 2: pscr2_ret__ = stem##2(pcp, __VA_ARGS__);		\
 		break;							\
-	case 4: ret__ = stem##4(pcp, __VA_ARGS__);			\
+	case 4: pscr2_ret__ = stem##4(pcp, __VA_ARGS__);		\
 		break;							\
-	case 8: ret__ = stem##8(pcp, __VA_ARGS__);			\
+	case 8: pscr2_ret__ = stem##8(pcp, __VA_ARGS__);		\
 		break;							\
 	default:							\
 		__bad_size_call_parameter();break;			\
 	}								\
-	ret__;								\
+	pscr2_ret__;							\
 })
 
 #define __pcpu_size_call(stem, variable, ...)				\
@@ -322,6 +322,106 @@ do {									\
 # define this_cpu_read(pcp)	__pcpu_size_call_return(this_cpu_read_, (pcp))
 #endif
 
+#define __this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	ret__;								\
+})
+
+#ifndef __this_cpu_xchg
+# ifndef __this_cpu_xchg_1
+#  define __this_cpu_xchg_1(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_2
+#  define __this_cpu_xchg_2(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_4
+#  define __this_cpu_xchg_4(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef __this_cpu_xchg_8
+#  define __this_cpu_xchg_8(pcp, nval)	__this_cpu_generic_xchg(pcp, nval)
+# endif
+# define __this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(__this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_xchg(pcp, nval)				\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	__this_cpu_write(pcp, nval);					\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_xchg
+# ifndef this_cpu_xchg_1
+#  define this_cpu_xchg_1(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_2
+#  define this_cpu_xchg_2(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_4
+#  define this_cpu_xchg_4(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# ifndef this_cpu_xchg_8
+#  define this_cpu_xchg_8(pcp, nval)	_this_cpu_generic_xchg(pcp, nval)
+# endif
+# define this_cpu_xchg(pcp, nval)	__pcpu_size_call_return2(this_cpu_xchg_, (pcp), nval)
+#endif
+
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg
+# ifndef this_cpu_cmpxchg_1
+#  define this_cpu_cmpxchg_1(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_2
+#  define this_cpu_cmpxchg_2(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_4
+#  define this_cpu_cmpxchg_4(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_8
+#  define this_cpu_cmpxchg_8(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+#endif
+
+#define __this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	ret__;								\
+})
+
+#ifndef __this_cpu_cmpxchg
+# ifndef __this_cpu_cmpxchg_1
+#  define __this_cpu_cmpxchg_1(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_2
+#  define __this_cpu_cmpxchg_2(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_4
+#  define __this_cpu_cmpxchg_4(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_8
+#  define __this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define __this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(\
+				__this_cpu_cmpxchg_, pcp, oval, nval)
+#endif
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -610,7 +710,7 @@ do {									\
  * IRQ safe versions of the per cpu RMW operations. Note that these operations
  * are *not* safe against modification of the same variable from another
  * processors (which one gets when using regular atomic operations)
- . They are guaranteed to be atomic vs. local interrupts and
+ * They are guaranteed to be atomic vs. local interrupts and
  * preemption only.
  */
 #define irqsafe_cpu_generic_to_op(pcp, val, op)				\
@@ -697,4 +797,32 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif
 
+#define irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({									\
+	typeof(pcp) ret__;						\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cpu_cmpxchg
+# ifndef irqsafe_cpu_cmpxchg_1
+#  define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_2
+#  define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_4
+#  define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef irqsafe_cpu_cmpxchg_8
+#  define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	irqsafe_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define irqsafe_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(irqsafe_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
 #endif /* __LINUX_PERCPU_H */


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
  2010-12-14 16:28 ` [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
@ 2010-12-14 16:28 ` Christoph Lameter
  2010-12-17 15:22   ` Tejun Heo
  2010-12-14 16:28 ` [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_x86 --]
[-- Type: text/plain, Size: 6324 bytes --]

Provide this_cpu_cmpxchg and this_cpu_xchg support as far as the hardware
capabilities of the x86 cpus allow.

Define CONFIG_CMPXCHG_LOCAL in Kconfig.cpu so that core code can test for
a fast cpuops implementation.
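
For illustration (editorial sketch, not part of the patch), the kind of
compile time selection this enables; the per cpu variable and function
names are hypothetical, and patch 4 applies the same idea to the vmstat
counters:

	#include <linux/percpu.h>
	#include <linux/irqflags.h>

	static DEFINE_PER_CPU(int, hypo_count);	/* hypothetical */

	#ifdef CONFIG_CMPXCHG_LOCAL
	/* Fast path: a per cpu cmpxchg loop, no interrupt disabling. */
	static void hypo_inc(void)
	{
		int old;

		do {
			old = this_cpu_read(hypo_count);
		} while (this_cpu_cmpxchg(hypo_count, old, old + 1) != old);
	}
	#else
	/* Fallback: serialize the read-modify-write with irq disable. */
	static void hypo_inc(void)
	{
		unsigned long flags;

		local_irq_save(flags);
		__this_cpu_inc(hypo_count);
		local_irq_restore(flags);
	}
	#endif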

V1->V2:
	- Take out the definition for this_cpu_cmpxchg_8 and move it into
	  a separate patch.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/Kconfig.cpu          |    3 +
 arch/x86/include/asm/percpu.h |  109 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 111 insertions(+), 1 deletion(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-10 12:18:46.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-10 12:42:18.000000000 -0600
@@ -212,6 +212,83 @@ do {									\
 	ret__;								\
 })
 
+/*
+ * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * full lock semantics even though they are not needed.
+ */
+#define percpu_xchg_op(var, nval)					\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new)				\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
+/*
+ * cmpxchg has no such implied lock semantics; as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __old = (oval);					\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -335,6 +412,17 @@ do {									\
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
 
+/*
+ * Generic fallback operations for __this_cpu_xchg are okay and much faster
+ * than an xchg with forced lock semantics.
+ */
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
@@ -342,7 +430,18 @@ do {									\
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_4(pcp, val)		percpu_add_return_op(pcp, val)
-#endif
+
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -370,6 +469,14 @@ do {									\
 #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/Kconfig.cpu
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig.cpu	2010-12-10 12:18:46.000000000 -0600
+++ linux-2.6/arch/x86/Kconfig.cpu	2010-12-10 12:18:49.000000000 -0600
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
  2010-12-14 16:28 ` [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
  2010-12-14 16:28 ` [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
@ 2010-12-14 16:28 ` Christoph Lameter
  2010-12-15 16:32   ` Tejun Heo
  2010-12-18 15:32   ` Tejun Heo
  2010-12-14 16:28 ` [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable Christoph Lameter
  2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
  4 siblings, 2 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Peter Zijlstra, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_irq --]
[-- Type: text/plain, Size: 2112 bytes --]

The irq work queue is a per cpu object and it is sufficient for
synchronization if per cpu atomics are used. Doing so simplifies
the code and reduces the overhead of the code.

Before:

christoph@linux-2.6$ size kernel/irq_work.o
   text	   data	    bss	    dec	    hex	filename
    451	      8	      1	    460	    1cc	kernel/irq_work.o

After:

christoph@linux-2.6$ size kernel/irq_work.o 
   text	   data	    bss	    dec	    hex	filename
    438	      8	      1	    447	    1bf	kernel/irq_work.o

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Christoph Lameter <cl@linux.com>

---
 kernel/irq_work.c |   18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

Index: linux-2.6/kernel/irq_work.c
===================================================================
--- linux-2.6.orig/kernel/irq_work.c	2010-12-07 10:24:30.000000000 -0600
+++ linux-2.6/kernel/irq_work.c	2010-12-07 10:26:45.000000000 -0600
@@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)
  */
 static void __irq_work_queue(struct irq_work *entry)
 {
-	struct irq_work **head, *next;
+	struct irq_work *next;
 
-	head = &get_cpu_var(irq_work_list);
+	preempt_disable();
 
 	do {
-		next = *head;
+		next = __this_cpu_read(irq_work_list);
 		/* Can assign non-atomic because we keep the flags set. */
 		entry->next = next_flags(next, IRQ_WORK_FLAGS);
-	} while (cmpxchg(head, next, entry) != next);
+	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
 
 	/* The list was empty, raise self-interrupt to start processing. */
 	if (!irq_work_next(entry))
 		arch_irq_work_raise();
 
-	put_cpu_var(irq_work_list);
+	preempt_enable();
 }
 
 /*
@@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
  */
 void irq_work_run(void)
 {
-	struct irq_work *list, **head;
+	struct irq_work *list;
 
-	head = &__get_cpu_var(irq_work_list);
-	if (*head == NULL)
+	if (this_cpu_read(irq_work_list) == NULL)
 		return;
 
 	BUG_ON(!in_irq());
 	BUG_ON(!irqs_disabled());
 
-	list = xchg(head, NULL);
+	list = this_cpu_xchg(irq_work_list, NULL);
+
 	while (list != NULL) {
 		struct irq_work *entry = list;
 


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable
  2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
                   ` (2 preceding siblings ...)
  2010-12-14 16:28 ` [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
@ 2010-12-14 16:28 ` Christoph Lameter
  2010-12-15 16:45   ` Tejun Heo
  2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
  4 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_cmpxchg_vmstat --]
[-- Type: text/plain, Size: 4660 bytes --]

Currently the operations to increment vm counters must disable interrupts
in order to not mess up their housekeeping of counters.

So use this_cpu_cmpxchg() to avoid the overhead. Since we can no longer
count on preemption being disabled we still have some minor issues.
The fetching of the counter thresholds is racy.
A threshold from another cpu may be applied if we happen to be
rescheduled on another cpu.  However, the following vmstat operation
will then bring the counter again under the threshold limit.

The operations for __xxx_zone_state are not changed since the caller
has taken care of the synchronization needs (and therefore the cycle
count is even less than the optimized version for the irq disable case
provided here).

The optimization using this_cpu_cmpxchg will only be used if the arch
supports efficient this_cpu_ops (must have CONFIG_CMPXCHG_LOCAL set!)

The use of this_cpu_cmpxchg reduces the cycle count for the counter
operations by 80% (inc_zone_page_state goes from 170 cycles to 32).
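
For illustration (editorial note, not part of the patch), a worked example
of the overstep handling in mod_state() below, assuming a stat_threshold
t = 32, a per cpu differential o = 31, delta = 2 and overstep_mode = 1:

	n  = o + delta    = 33	-> exceeds t
	os = 1 * (t >> 1) = 16
	z  = n + os       = 49	-> folded into the global zone counter
	n  = -os          = -16	-> new per cpu differential

The total is preserved (z + n = 33), and since the differential restarts
half a threshold below zero, the next fold into the zone counter is pushed
further out.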

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/vmstat.c |  101 +++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 87 insertions(+), 14 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-12-01 10:04:19.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-12-01 10:09:06.000000000 -0600
@@ -185,20 +185,6 @@ void __mod_zone_page_state(struct zone *
 EXPORT_SYMBOL(__mod_zone_page_state);
 
 /*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-					int delta)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_zone_page_state(zone, item, delta);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
  * Optimized increment and decrement functions.
  *
  * These are only for a single page and therefore can take a struct page *
@@ -265,6 +251,92 @@ void __dec_zone_page_state(struct page *
 }
 EXPORT_SYMBOL(__dec_zone_page_state);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+ */
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1);
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
 	unsigned long flags;
@@ -295,6 +367,7 @@ void dec_zone_page_state(struct page *pa
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(dec_zone_page_state);
+#endif
 
 /*
  * Update the zone counters for one cpu.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
                   ` (3 preceding siblings ...)
  2010-12-14 16:28 ` [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable Christoph Lameter
@ 2010-12-14 16:28 ` Christoph Lameter
  2010-12-14 16:35   ` Mathieu Desnoyers
                     ` (2 more replies)
  4 siblings, 3 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:28 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

[-- Attachment #1: cpuops_xchg_with_cmpxchg --]
[-- Type: text/plain, Size: 2614 bytes --]

Use cmpxchg instead of xchg to realize this_cpu_xchg.

xchg will cause LOCK overhead since LOCK is always implied but cmpxchg
will not.

Baselines:

xchg()		= 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle

(simulated using this_cpu_read/write, two prefixes. Looks like the
cpu can use loop optimization to get rid of most of the overhead)

Cycles before:

this_cpu_xchg	 = 37 cycles (segment prefix and LOCK (implied by xchg))

After:

this_cpu_xchg	= 11 cycles (using cmpxchg without lock semantics)
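
(Editorial sketch, not part of the patch: one way such per operation cycle
counts can be gathered in the kernel, using get_cycles(); this is not
necessarily the harness used for the numbers above, and the names
bench_var and bench_xchg are made up.)

	#include <linux/kernel.h>
	#include <linux/percpu.h>
	#include <linux/timex.h>	/* get_cycles() */

	static DEFINE_PER_CPU(int, bench_var);

	static void bench_xchg(void)
	{
		cycles_t start, end;
		int i;

		start = get_cycles();
		for (i = 0; i < 1000; i++)
			this_cpu_xchg(bench_var, i);
		end = get_cycles();

		/* Rough cycles per operation, loop overhead included. */
		pr_info("this_cpu_xchg: %llu cycles/op\n",
			(unsigned long long)(end - start) / 1000);
	}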

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-10 12:46:31.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-10 13:25:21.000000000 -0600
@@ -213,8 +213,9 @@ do {									\
 })
 
 /*
- * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
- * full lock semantics even though they are not needed.
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix. The processor cannot prefetch
+ * cachelines if xchg is used.
  */
 #define percpu_xchg_op(var, nval)					\
 ({									\
@@ -222,25 +223,33 @@ do {									\
 	typeof(var) __new = (nval);					\
 	switch (sizeof(var)) {						\
 	case 1:								\
-		asm("xchgb %2, "__percpu_arg(1)			\
+		asm("\n1:mov "__percpu_arg(1)",%%al"			\
+		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "q" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 2:								\
-		asm("xchgw %2, "__percpu_arg(1)			\
+		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
+		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 4:								\
-		asm("xchgl %2, "__percpu_arg(1)			\
+		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
+		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 8:								\
-		asm("xchgq %2, "__percpu_arg(1)			\
+		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
+		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
@ 2010-12-14 16:35   ` Mathieu Desnoyers
  2010-12-14 16:44   ` Eric Dumazet
  2010-12-15 16:47   ` Tejun Heo
  2 siblings, 0 replies; 49+ messages in thread
From: Mathieu Desnoyers @ 2010-12-14 16:35 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin

* Christoph Lameter (cl@linux.com) wrote:
> Use cmpxchg instead of xchg to realize this_cpu_xchg.
> 
> xchg will cause LOCK overhead since LOCK is always implied but cmpxchg
> will not.
> 
> Baselines:
> 
> xchg()		= 18 cycles (no segment prefix, LOCK semantics)
> __this_cpu_xchg = 1 cycle
> 
> (simulated using this_cpu_read/write, two prefixes. Looks like the
> cpu can use loop optimization to get rid of most of the overhead)
> 
> Cycles before:
> 
> this_cpu_xchg	 = 37 cycles (segment prefix and LOCK (implied by xchg))
> 
> After:
> 
> this_cpu_xchg	= 11 cycles (using cmpxchg without lock semantics)

Cool! Thanks for benchmarking these, it's really worth it.

Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  arch/x86/include/asm/percpu.h |   21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
> 
> Index: linux-2.6/arch/x86/include/asm/percpu.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-10 12:46:31.000000000 -0600
> +++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-10 13:25:21.000000000 -0600
> @@ -213,8 +213,9 @@ do {									\
>  })
>  
>  /*
> - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> - * full lock semantics even though they are not needed.
> + * xchg is implemented using cmpxchg without a lock prefix. xchg is
> + * expensive due to the implied lock prefix. The processor cannot prefetch
> + * cachelines if xchg is used.
>   */
>  #define percpu_xchg_op(var, nval)					\
>  ({									\
> @@ -222,25 +223,33 @@ do {									\
>  	typeof(var) __new = (nval);					\
>  	switch (sizeof(var)) {						\
>  	case 1:								\
> -		asm("xchgb %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%al"			\
> +		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "q" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 2:								\
> -		asm("xchgw %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
> +		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 4:								\
> -		asm("xchgl %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
> +		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 8:								\
> -		asm("xchgq %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
> +		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
> 

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
  2010-12-14 16:35   ` Mathieu Desnoyers
@ 2010-12-14 16:44   ` Eric Dumazet
  2010-12-14 16:55     ` Christoph Lameter
  2010-12-15 16:47   ` Tejun Heo
  2 siblings, 1 reply; 49+ messages in thread
From: Eric Dumazet @ 2010-12-14 16:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, H. Peter Anvin,
	Mathieu Desnoyers

On Tuesday 14 December 2010 at 10:28 -0600, Christoph Lameter wrote:
> plain text document attachment (cpuops_xchg_with_cmpxchg)
> Use cmpxchg instead of xchg to realize this_cpu_xchg.
> 
> xchg will cause LOCK overhead since LOCK is always implied but cmpxchg
> will not.
> 
> Baselines:
> 
> xchg()		= 18 cycles (no segment prefix, LOCK semantics)
> __this_cpu_xchg = 1 cycle
> 
> (simulated using this_cpu_read/write, two prefixes. Looks like the
> cpu can use loop optimization to get rid of most of the overhead)
> 
> Cycles before:
> 
> this_cpu_xchg	 = 37 cycles (segment prefix and LOCK (implied by xchg))
> 
> After:
> 
> this_cpu_xchg	= 11 cycles (using cmpxchg without lock semantics)
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  arch/x86/include/asm/percpu.h |   21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
> 
> Index: linux-2.6/arch/x86/include/asm/percpu.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-10 12:46:31.000000000 -0600
> +++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-10 13:25:21.000000000 -0600
> @@ -213,8 +213,9 @@ do {									\
>  })
>  
>  /*
> - * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> - * full lock semantics even though they are not needed.
> + * xchg is implemented using cmpxchg without a lock prefix. xchg is
> + * expensive due to the implied lock prefix. The processor cannot prefetch
> + * cachelines if xchg is used.
>   */
>  #define percpu_xchg_op(var, nval)					\
>  ({									\
> @@ -222,25 +223,33 @@ do {									\
>  	typeof(var) __new = (nval);					\
>  	switch (sizeof(var)) {						\
>  	case 1:								\
> -		asm("xchgb %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%al"			\
> +		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\


You should use the fact that a failed cmpxchg loads the current value
into al/ax/eax/rax, so:

	"\n\tmov "__percpu_arg(1)",%%al"
	"\n1:\tcmpxchgb %2, "__percpu_arg(1)
	"\n\tjnz 1b"

(No need to reload the value again)


>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "q" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 2:								\
> -		asm("xchgw %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
> +		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 4:								\
> -		asm("xchgl %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
> +		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
>  		break;							\
>  	case 8:								\
> -		asm("xchgq %2, "__percpu_arg(1)			\
> +		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
> +		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
> +		    "\n\tjnz 1b"					\
>  			    : "=a" (__ret), "+m" (var)			\
>  			    : "r" (__new)				\
>  			    : "memory");				\
> 



^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:44   ` Eric Dumazet
@ 2010-12-14 16:55     ` Christoph Lameter
  2010-12-14 17:00       ` H. Peter Anvin
  0 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 16:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, H. Peter Anvin,
	Mathieu Desnoyers

On Tue, 14 Dec 2010, Eric Dumazet wrote:

> You should use the fact that a failed cmpxchg loads the current value
> into al/ax/eax/rax, so:
>
> 	"\n\tmov "__percpu_arg(1)",%%al"
> 	"\n1:\tcmpxchgb %2, "__percpu_arg(1)
> 	"\n\tjnz 1b"
>
> (No need to reload the value again)

Subject: cpuops xchg: Exploit the fact that failed cmpxchges return old value

We do not need to reload the value as pointed out by Eric. It is already in
the correct register so just rerun the cmpxchg without the load.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-14 10:51:57.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-14 10:54:46.000000000 -0600
@@ -223,32 +223,32 @@ do {									\
 	typeof(var) __new = (nval);					\
 	switch (sizeof(var)) {						\
 	case 1:								\
-		asm("\n1:mov "__percpu_arg(1)",%%al"			\
-		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%al"			\
+		    "\n1:cmpxchgb %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "q" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 2:								\
-		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
-		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%ax"			\
+		    "\n1:cmpxchgw %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 4:								\
-		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
-		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%eax"			\
+		    "\n1:cmpxchgl %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 8:								\
-		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
-		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
+		asm("\n\tmov "__percpu_arg(1)",%%rax"			\
+		    "\n1:cmpxchgq %2, "__percpu_arg(1)		\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:55     ` Christoph Lameter
@ 2010-12-14 17:00       ` H. Peter Anvin
  2010-12-14 17:19         ` Christoph Lameter
  0 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-14 17:00 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

On 12/14/2010 08:55 AM, Christoph Lameter wrote:
> 
> We do not need to reload the value as pointed out by Eric. It is already in
> the correct register so just rerun the cmpxchg without the load.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 

Is it genuinely faster to do the pre-load mov, or can we drop that too?
 My guess would be that yes it is, but if it happens not to be it would
be nice to reduce the code size.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 17:00       ` H. Peter Anvin
@ 2010-12-14 17:19         ` Christoph Lameter
  2010-12-14 17:22           ` H. Peter Anvin
  0 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:19 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Eric Dumazet, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers



On Tue, 14 Dec 2010, H. Peter Anvin wrote:

> On 12/14/2010 08:55 AM, Christoph Lameter wrote:
> >
> > We do not need to reload the value as pointed out by Eric. It is already in
> > the correct register so just rerun the cmpxchg without the load.
> >
> > Signed-off-by: Christoph Lameter <cl@linux.com>
> >
>
> Is it genuinely faster to do the pre-load mov, or can we drop that too?
>  My guess would be that yes it is, but if it happens not to be it would
> be nice to reduce the code size.

Dropping the load increases the cycle count from 11 to 16.

---
 arch/x86/include/asm/percpu.h |   12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-12-14 11:07:18.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-12-14 11:14:37.000000000 -0600
@@ -223,32 +223,28 @@ do {									\
 	typeof(var) __new = (nval);					\
 	switch (sizeof(var)) {						\
 	case 1:								\
-		asm("\n1:mov "__percpu_arg(1)",%%al"			\
-		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
+		asm("\n1:cmpxchgb %2, "__percpu_arg(1)			\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "q" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 2:								\
-		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
-		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
+		asm("\n1:cmpxchgw %2, "__percpu_arg(1)			\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 4:								\
-		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
-		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
+		asm("\n1:cmpxchgl %2, "__percpu_arg(1)			\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\
 			    : "memory");				\
 		break;							\
 	case 8:								\
-		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
-		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
+		asm("\n1:cmpxchgq %2, "__percpu_arg(1)			\
 		    "\n\tjnz 1b"					\
 			    : "=a" (__ret), "+m" (var)			\
 			    : "r" (__new)				\

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 17:19         ` Christoph Lameter
@ 2010-12-14 17:22           ` H. Peter Anvin
  2010-12-14 17:29             ` Tejun Heo
  0 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-14 17:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

On 12/14/2010 09:19 AM, Christoph Lameter wrote:
>>
>> Is it genuinely faster to do the pre-load mov, or can we drop that too?
>>  My guess would be that yes it is, but if it happens not to be it would
>> be nice to reduce the code size.
> 
> Dropping the load increases the cycle count from 11 to 16.
> 

Great, that answers that!  I'll pick up the patch hopefully today (I'm
finally ramping back up on arch/x86 again after having been diverted to
an internal project for a while...)

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 17:22           ` H. Peter Anvin
@ 2010-12-14 17:29             ` Tejun Heo
  2010-12-14 17:35               ` Christoph Lameter
  2010-12-15  1:06               ` H. Peter Anvin
  0 siblings, 2 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-14 17:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

Hello,

On 12/14/2010 06:22 PM, H. Peter Anvin wrote:
> On 12/14/2010 09:19 AM, Christoph Lameter wrote:
>>>
>>> Is it genuinely faster to do the pre-load mov, or can we drop that too?
>>>  My guess would be that yes it is, but if it happens not to be it would
>>> be nice to reduce the code size.
>>
>> Dropping the load increases the cycle count from 11 to 16.
> 
> Great, that answers that!  I'll pick up the patch hopefully today (I'm
> finally ramping back up on arch/x86 again after having been diverted to
> an internal project for a while...)

How do you want to route these?  All patches before this series are
already in the percpu tree.  I can pull the generic bits and leave out
the x86 bits so that x86 tree can pull in percpu bits and then put x86
stuff on top of it.  If you wanna go that way, I would drop all x86
related patches from the previous patchsets too.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 17:29             ` Tejun Heo
@ 2010-12-14 17:35               ` Christoph Lameter
  2010-12-15  1:06               ` H. Peter Anvin
  1 sibling, 0 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-14 17:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

On Tue, 14 Dec 2010, Tejun Heo wrote:

> > Great, that answers that!  I'll pick up the patch hopefully today (I'm
> > finally ramping back up on arch/x86 again after having been diverted to
> > an internal project for a while...)
>
> How do you want to route these?  All patches before this series are
> already in the percpu tree.  I can pull the generic bits and leave out
> the x86 bits so that x86 tree can pull in percpu bits and then put x86
> stuff on top of it.  If you wanna go that way, I would drop all x86
> related patches from the previous patchsets too.

I think it is better to merge the generic pieces and x86 at the same time.
Since this has worked so well with the percpu tree so far I'd say we
continue that way.

Regardless of how this will be merged: It would be good if hpa would take
a look at the patches we have accumulated so far.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 17:29             ` Tejun Heo
  2010-12-14 17:35               ` Christoph Lameter
@ 2010-12-15  1:06               ` H. Peter Anvin
  2010-12-15 16:29                 ` Tejun Heo
  1 sibling, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-15  1:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/14/2010 09:29 AM, Tejun Heo wrote:
> Hello,
> 
> On 12/14/2010 06:22 PM, H. Peter Anvin wrote:
>> On 12/14/2010 09:19 AM, Christoph Lameter wrote:
>>>>
>>>> Is it genuinely faster to do the pre-load mov, or can we drop that too?
>>>>  My guess would be that yes it is, but if it happens not to be it would
>>>> be nice to reduce the code size.
>>>
>>> Dropping the load increases the cycle count from 11 to 16.
>>
>> Great, that answers that!  I'll pick up the patch hopefully today (I'm
>> finally ramping back up on arch/x86 again after having been diverted to
>> an internal project for a while...)
> 
> How do you want to route these?  All patches before this series are
> already in the percpu tree.  I can pull the generic bits and leave out
> the x86 bits so that x86 tree can pull in percpu bits and then put x86
> stuff on top of it.  If you wanna go that way, I would drop all x86
> related patches from the previous patchsets too.
> 

It's probably easiest if you just add them with my Acked-by: then.
Alternatively, I could pick them up so they go into -tip and the tip
test machinery.

	-hpa

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-15  1:06               ` H. Peter Anvin
@ 2010-12-15 16:29                 ` Tejun Heo
  2010-12-15 16:35                   ` H. Peter Anvin
  0 siblings, 1 reply; 49+ messages in thread
From: Tejun Heo @ 2010-12-15 16:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

Hello,

On 12/15/2010 02:06 AM, H. Peter Anvin wrote:
> It's probably easiest if you just add them with my Acked-by: then.
> Alternatively, I could pick them up so they go into -tip and the tip
> test machinery.

All patches accepted into percpu till now are in the following branch.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git for-next

The tree is based on v2.6.37-rc5 and v2.6.37-rc5..for-next would give
21 patches.  I think most of these don't fit x86 tree but there are
several which would fit there much better.  d80aadf9 (x86: Replace
uses of current_cpu_data with this_cpu ops) and e5195e91 (x86: Use
this_cpu_inc_return for nmi counter).

As it would be beneficial to push these through -tip testing anyway,
how about the following?

* If you review and ack the x86 related bits in the series, I'll
  regenerate and add the ACKs.

* It would be better if the two commits mentioned above get routed
  through x86 tree rather than percpu tree, so I'll drop the above two
  from percpu tree and you can pull percpu into x86 and then apply
  those in x86 tree.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-14 16:28 ` [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
@ 2010-12-15 16:32   ` Tejun Heo
  2010-12-15 16:34     ` H. Peter Anvin
  2010-12-15 16:50     ` Peter Zijlstra
  2010-12-18 15:32   ` Tejun Heo
  1 sibling, 2 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-15 16:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Peter Zijlstra, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> The irq work queue is a per cpu object and it is sufficient for
> synchronization if per cpu atomics are used. Doing so simplifies
> the code and reduces the overhead of the code.
> 
> Before:
> 
> christoph@linux-2.6$ size kernel/irq_work.o
>    text	   data	    bss	    dec	    hex	filename
>     451	      8	      1	    460	    1cc	kernel/irq_work.o
> 
> After:
> 
> christoph@linux-2.6$ size kernel/irq_work.o 
>    text	   data	    bss	    dec	    hex	filename
>     438	      8	      1	    447	    1bf	kernel/irq_work.o
> 
> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>

Peter, can you please ack this one?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 16:32   ` Tejun Heo
@ 2010-12-15 16:34     ` H. Peter Anvin
  2010-12-15 16:50     ` Peter Zijlstra
  1 sibling, 0 replies; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-15 16:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Peter Zijlstra, Pekka Enberg,
	linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 12/15/2010 08:32 AM, Tejun Heo wrote:
> On 12/14/2010 05:28 PM, Christoph Lameter wrote:
>> The irq work queue is a per cpu object and it is sufficient for
>> synchronization if per cpu atomics are used. Doing so simplifies
>> the code and reduces the overhead of the code.
>>
>> Before:
>>
>> christoph@linux-2.6$ size kernel/irq_work.o
>>    text	   data	    bss	    dec	    hex	filename
>>     451	      8	      1	    460	    1cc	kernel/irq_work.o
>>
>> After:
>>
>> christoph@linux-2.6$ size kernel/irq_work.o 
>>    text	   data	    bss	    dec	    hex	filename
>>     438	      8	      1	    447	    1bf	kernel/irq_work.o
>>
>> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> 
> Peter, can you please ack this one?
> 
> Thanks.
> 

Acked-by: H. Peter Anvin <hpa@zytor.com>

... for this whole series.

	-hpa


-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-15 16:29                 ` Tejun Heo
@ 2010-12-15 16:35                   ` H. Peter Anvin
  2010-12-15 16:39                     ` Tejun Heo
  0 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-15 16:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/15/2010 08:29 AM, Tejun Heo wrote:
> Hello,
> 
> On 12/15/2010 02:06 AM, H. Peter Anvin wrote:
>> It's probably easiest if you just add them with my Acked-by: then.
>> Alternatively, I could pick them up so they go into -tip and the tip
>> test machinery.
> 
> All patches accepted into percpu till now are in the following branch.
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git for-next
> 
> The tree is based on v2.6.37-rc5 and v2.6.37-rc5..for-next would give
> 21 patches.  I think most of these don't fit x86 tree but there are
> several which would fit there much better.  d80aadf9 (x86: Replace
> uses of current_cpu_data with this_cpu ops) and e5195e91 (x86: Use
> this_cpu_inc_return for nmi counter).
> 
> As it would be beneficial to push these through -tip testing anyway,
> how about the following?
> 
> * If you review and ack the x86 related bits in the series, I'll
>   regenerate and add the ACKs.
> 
> * It would be better if the two commits mentioned above get routed
>   through x86 tree rather than percpu tree, so I'll drop the above two
>   from percpu tree and you can pull percpu into x86 and then apply
>   those in x86 tree.
> 

I think we can do that... let me look through the tree before officially
committing.  Alternatively, I can pull the patches from your tree and
individually review them and add them to a tip branch, if you prefer.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-15 16:35                   ` H. Peter Anvin
@ 2010-12-15 16:39                     ` Tejun Heo
  2010-12-16 16:14                       ` Tejun Heo
  0 siblings, 1 reply; 49+ messages in thread
From: Tejun Heo @ 2010-12-15 16:39 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

Hello,

On 12/15/2010 05:35 PM, H. Peter Anvin wrote:
> I think we can do that... let me look through the tree before officially
> committing.  Alternatively, I can pull the patches from your tree and
> individually review them and add them to a tip branch, if you prefer.

I'd prefer percpu things going through percpu tree, if for nothing
else for git history's sake, but I don't think it really matters.  The
series is spread all over the place anyway.  As long as each
maintainer is properly alerted about the changes, it should be okay.
Please let me know whether you agree with the changes currently queued
in percpu#for-next.  I'll update the tree with your Acked-by's and
freeze it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable
  2010-12-14 16:28 ` [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable Christoph Lameter
@ 2010-12-15 16:45   ` Tejun Heo
  2010-12-15 17:01     ` Christoph Lameter
  0 siblings, 1 reply; 49+ messages in thread
From: Tejun Heo @ 2010-12-15 16:45 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> Currently the operations to increment vm counters must disable interrupts
> in order to not mess up their housekeeping of counters.
> 
> So use this_cpu_cmpxchg() to avoid the overhead. Since we can no longer
> count on preemption being disabled we still have some minor issues.
> The fetching of the counter thresholds is racy.
> A threshold from another cpu may be applied if we happen to be
> rescheduled on another cpu.  However, the following vmstat operation
> will then bring the counter again under the threshold limit.
> 
> The operations for __xxx_zone_state are not changed since the caller
> has taken care of the synchronization needs (and therefore the cycle
> count is even less than the optimized version for the irq disable case
> provided here).
> 
> The optimization using this_cpu_cmpxchg will only be used if the arch
> supports efficient this_cpu_ops (must have CONFIG_CMPXCHG_LOCAL set!)
> 
> The use of this_cpu_cmpxchg reduces the cycle count for the counter
> operations by 80% (inc_zone_page_state goes from 170 cycles to 32).
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
>
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong the cpu if we get
+		 * rescheduled while executing here. However, the following
+		 * will apply the threshold again and therefore bring the
+		 * counter under the threshold.
+		 */

What does "the following" mean here?  Later executions of the
function?  It seems like the counter can go out of the threshold at
least temporarily, which probably is okay but I think the comment can
be improved a bit.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
  2010-12-14 16:35   ` Mathieu Desnoyers
  2010-12-14 16:44   ` Eric Dumazet
@ 2010-12-15 16:47   ` Tejun Heo
  2 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-15 16:47 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> Use cmpxchg instead of xchg to realize this_cpu_xchg.
> 
> xchg will cause LOCK overhead since LOCK is always implied but cmpxchg
> will not.
> 
> Baselines:
> 
> xchg()		= 18 cycles (no segment prefix, LOCK semantics)
> __this_cpu_xchg = 1 cycle
> 
> (simulated using this_cpu_read/write, two prefixes. Looks like the
> cpu can use loop optimization to get rid of most of the overhead)
> 
> Cycles before:
> 
> this_cpu_xchg	 = 37 cycles (segment prefix and LOCK (implied by xchg))
> 
> After:
> 
> this_cpu_xchg	= 11 cycles (using cmpxchg without lock semantics)
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
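
For illustration, the xchg-via-cmpxchg idea boils down to a small retry
loop like the sketch below; my_this_cpu_xchg is a made-up name for this
example, and the real x86 implementation lives behind the
segment-prefixed percpu cmpxchg ops rather than this generic form:

#define my_this_cpu_xchg(pcp, nval)					\
({									\
	typeof(pcp) old__;						\
	/* retry until the unlocked cmpxchg on this cpu's copy wins */	\
	do {								\
		old__ = this_cpu_read(pcp);				\
	} while (this_cpu_cmpxchg(pcp, old__, nval) != old__);		\
	old__;								\
})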

It's not a bad idea to keep this patch separate from the original one
but as both are not applied yet, it probably is better to put this
right after the original addition if you end up re-posting the series;
otherwise, I'll just reorder it when I apply.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 16:32   ` Tejun Heo
  2010-12-15 16:34     ` H. Peter Anvin
@ 2010-12-15 16:50     ` Peter Zijlstra
  2010-12-15 17:04       ` Christoph Lameter
  1 sibling, 1 reply; 49+ messages in thread
From: Peter Zijlstra @ 2010-12-15 16:50 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, H. Peter Anvin, Mathieu Desnoyers

On Wed, 2010-12-15 at 17:32 +0100, Tejun Heo wrote:
> On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> > The irq work queue is a per cpu object and it is sufficient for
> > synchronization if per cpu atomics are used. Doing so simplifies
> > the code and reduces the overhead of the code.
> > 
> > Before:
> > 
> > christoph@linux-2.6$ size kernel/irq_work.o
> >    text	   data	    bss	    dec	    hex	filename
> >     451	      8	      1	    460	    1cc	kernel/irq_work.o
> > 
> > After:
> > 
> > christoph@linux-2.6$ size kernel/irq_work.o 
> >    text	   data	    bss	    dec	    hex	filename
> >     438	      8	      1	    447	    1bf	kernel/irq_work.o
> > 
> > Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> 
> Peter, can you please ack this one?

I guess so, I don't much like the bare preempt_disable/enable there, and
I'm wondering, aren't %fs prefixed insn slower than regular insn? Does
it really pay to avoid this one address computation if there are multiple
users in a function? %fs prefixes do take another byte, so it will also
result in larger code at some point.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable
  2010-12-15 16:45   ` Tejun Heo
@ 2010-12-15 17:01     ` Christoph Lameter
  0 siblings, 0 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-15 17:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On Wed, 15 Dec 2010, Tejun Heo wrote:

> +		/*
> +		 * The fetching of the stat_threshold is racy. We may apply
> +		 * a counter threshold to the wrong the cpu if we get
> +		 * rescheduled while executing here. However, the following
> +		 * will apply the threshold again and therefore bring the
> +		 * counter under the threshold.
> +		 */
>
> What does "the following" mean here?  Later executions of the
> function?  It seems like the counter can go out of the threshold at
> least temporarily, which probably is okay but I think the comment can
> be improved a bit.

I meant later execution of the function. I will fix the comment.

Subject: vmstat comment fix

Clarify comments for the threshold a bit.

Signed-off-by: Christoh Lameter <cl@linux.com>

---
 mm/vmstat.c |    9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-12-15 10:57:44.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-12-15 10:58:46.000000000 -0600
@@ -277,9 +277,12 @@ static inline void mod_state(struct zone
 		/*
 		 * The fetching of the stat_threshold is racy. We may apply
 		 * a counter threshold to the wrong the cpu if we get
-		 * rescheduled while executing here. However, the following
-		 * will apply the threshold again and therefore bring the
-		 * counter under the threshold.
+		 * rescheduled while executing here. However, the next
+		 * counter update will apply the threshold again and
+		 * therefore bring the counter under the threshold again.
+		 *
+		 * Most of the time the thresholds are the same anyways
+		 * for all cpus in a zone.
 		 */
 		t = this_cpu_read(pcp->stat_threshold);


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 16:50     ` Peter Zijlstra
@ 2010-12-15 17:04       ` Christoph Lameter
  2010-12-15 17:18         ` Peter Zijlstra
  0 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-15 17:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

On Wed, 15 Dec 2010, Peter Zijlstra wrote:

> On Wed, 2010-12-15 at 17:32 +0100, Tejun Heo wrote:
> > On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> > > The irq work queue is a per cpu object and it is sufficient for
> > > synchronization if per cpu atomics are used. Doing so simplifies
> > > the code and reduces the overhead of the code.
> > >
> > > Before:
> > >
> > > christoph@linux-2.6$ size kernel/irq_work.o
> > >    text	   data	    bss	    dec	    hex	filename
> > >     451	      8	      1	    460	    1cc	kernel/irq_work.o
> > >
> > > After:
> > >
> > > christoph@linux-2.6$ size kernel/irq_work.o
> > >    text	   data	    bss	    dec	    hex	filename
> > >     438	      8	      1	    447	    1bf	kernel/irq_work.o
> > >
> > > Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> >
> > Peter, can you please ack this one?
>
> I guess so, I don't much like the bare preempt_disable/enable there, and
> I'm wondering, aren't %fs prefixed insn slower than regular insn? Does
> it really pay to avoid this one address computation if there are multiple
> users in a function? %fs prefixes do take another byte, so it will also
> result in larger code at some point.

Prefixes are faster than explicit address calculations. A prefix allows
you to integrate the per cpu address calculation into an arithmetic
operation.

A prefix is one byte, which is less than the multiple arithmetic operations
needed to calculate an address.
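
As a concrete sketch (the demo_* names are invented for this example),
a per-cpu counter bump is a single prefixed add with this_cpu_add(),
while the open-coded variant first computes the address and also needs
preemption protection:

static DEFINE_PER_CPU(int, demo_counter);

static void demo_prefixed(void)
{
	/* one add instruction carrying a %fs/%gs segment prefix */
	this_cpu_add(demo_counter, 1);
}

static void demo_open_coded(void)
{
	int *p;

	preempt_disable();
	p = this_cpu_ptr(&demo_counter);	/* address computed here */
	(*p)++;					/* plain increment */
	preempt_enable();
}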

I am not sure that the preempt_disable/enable is needed. They are just
there because you had a get/put_cpu there.

If the code is run from hardirq context then preempt is already disabled.
We can just drop those then.






^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 17:04       ` Christoph Lameter
@ 2010-12-15 17:18         ` Peter Zijlstra
  2010-12-15 17:31           ` H. Peter Anvin
  2010-12-15 17:32           ` Christoph Lameter
  0 siblings, 2 replies; 49+ messages in thread
From: Peter Zijlstra @ 2010-12-15 17:18 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

On Wed, 2010-12-15 at 11:04 -0600, Christoph Lameter wrote:

> Prefixes are faster than explicit address calculations. A prefix allows
> you to integrate the per cpu address calculation into an arithmetic
> operation.

Well, depends on how often you need that address I'd think. If you'd
have a per-cpu struct and need to frob lots of variables in that struct
it might be cheaper to simply compute the struct address once and then
use relative addresses than to prefix everything with %fs.

> A prefix is one byte which is less that multiple arithmetic operations to
> calculate an address.

I thought you'd only need a single arithmetic op to calculate the
address, anyway at some point those 1 byte prefixes will add up to more
than the ops saved.

In the current code you add 2 bytes (although you save one from losing
the LOCK prefix, but that could have been achieved by using
cmpxchg_local() as well). These 2 bytes are probably less than the
address computation for head (and not needing the head pointer again
saves on register pressure), so it's probably a win here.
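
A sketch of that trade-off (struct and field names invented for the
example): touching several members of one per-cpu struct through a
single pointer costs one address computation plus a
preempt_disable/enable pair instead of a prefix on every access:

struct demo_stats {
	unsigned long a;
	unsigned long b;
	unsigned long c;
};
static DEFINE_PER_CPU(struct demo_stats, demo_stats);

static void demo_bump_all(void)
{
	struct demo_stats *s;

	preempt_disable();
	s = this_cpu_ptr(&demo_stats);	/* one address computation */
	s->a++;
	s->b++;
	s->c++;
	preempt_enable();
}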

Still, none of this is really fast-path code, so I really wonder why
we're optimizing this over keeping the code obvious.

> I am not sure that the preempt_disable/enable is needed. They are just
> there because you had a get/put_cpu there.
> 
> If the code is run from hardirq context then preempt is already disabled.
> We can just drop those then.

Afaik the current callers are all from IRQ/NMI context, but I don't want
to mandate callers be from such contexts.

The problem is that we need to guarantee we raise the self-IPI on the
same cpu we queued the worklet on.

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 17:18         ` Peter Zijlstra
@ 2010-12-15 17:31           ` H. Peter Anvin
  2010-12-15 17:32           ` Christoph Lameter
  1 sibling, 0 replies; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-15 17:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Tejun Heo, akpm, Pekka Enberg, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers

On 12/15/2010 09:18 AM, Peter Zijlstra wrote:
> On Wed, 2010-12-15 at 11:04 -0600, Christoph Lameter wrote:
> 
>> Prefixes are faster than explicit address calculations. A prefix allows
>> you to integrate the per cpu address calculation into an arithmetic
>> operation.
> 
> Well, depends on how often you need that address I'd think. If you'd
> have a per-cpu struct and need to frob lots of variables in that struct
> it might be cheaper to simply compute the struct address once and then
> use relative addresses than to prefix everything with %fs.
> 

Let's just make it clear -- current x86 CPUs generally do not have a
penalty for prefixes (it might be that under very unusual pipeline
conditions they do, I am not 100% sure.)  In fact, we changed patching
LOCK prefixes from NOP to %ds: because it made the code faster.

Some older CPUs do, but those are no longer relevant for performance
decisions.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-15 17:18         ` Peter Zijlstra
  2010-12-15 17:31           ` H. Peter Anvin
@ 2010-12-15 17:32           ` Christoph Lameter
  1 sibling, 0 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-15 17:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

On Wed, 15 Dec 2010, Peter Zijlstra wrote:

> > A prefix is one byte which is less that multiple arithmetic operations to
> > calculate an address.
>
> I thought you'd only need a single arithmetic op to calculate the
> address, anyway at some point those 1 byte prefixes will add up to more
> than the ops saved.

Yes if you have enough of those then it may no longer be worth it. But
here you only have two. And the segment address calculation is efficiently
implemented in the processors.

> Still, non of this is really fast-path code, so I really wonder why
> we're optimizing this over keeping the code obvious.

I was looking for use cases that already exist for cmpxchg (because the
patches introduce this_cpu_cmpxchg also for other uses) and found this one
here, and it looked like an easy way to use the prefixed operation.  It's
not that high a priority, but it seems that we are saving code and
cycles.

The code is cleaner IMHO. this_cpu_xx documents that the actions here
only have local effects and that there is no need of serialization with
other processors.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-15 16:39                     ` Tejun Heo
@ 2010-12-16 16:14                       ` Tejun Heo
  2010-12-16 18:13                         ` x86: Use this_cpu_has for thermal_interrupt Christoph Lameter
                                           ` (5 more replies)
  0 siblings, 6 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-16 16:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

Hey, again.

On 12/15/2010 05:39 PM, Tejun Heo wrote:
> I'd prefer percpu things going through percpu tree, if for nothing
> else for git history's sake, but I don't think it really matters.  The
> series is spread all over the place anyway.  As long as each
> maintainer is properly alerted about the changes, it should be okay.
> Please let me know whether you agree with the changes currently queued
> in percpu#for-next.  I'll update the tree with your Acked-by's and
> freeze it.

Are you okay with the patches currently in percpu#for-next?  If so,
I'll regenerate patches with your acked-by and pop the two previously
mentioned commits and proceed with the rest of the series.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* x86: Use this_cpu_has for thermal_interrupt
  2010-12-16 16:14                       ` Tejun Heo
@ 2010-12-16 18:13                         ` Christoph Lameter
  2010-12-18 15:35                           ` Tejun Heo
  2010-12-16 18:14                         ` x86: udelay: Use this_cpu_read to avoid address calculation Christoph Lameter
                                           ` (4 subsequent siblings)
  5 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-16 18:13 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

It is more effective to use a segment prefix instead of calculating the
address of the current cpu area and then testing flags.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/kernel/cpu/mcheck/therm_throt.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

Index: linux-2.6/arch/x86/kernel/cpu/mcheck/therm_throt.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/therm_throt.c	2010-12-16 11:41:48.000000000 -0600
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/therm_throt.c	2010-12-16 11:43:42.000000000 -0600
@@ -317,7 +317,6 @@ device_initcall(thermal_throttle_init_de
 static void intel_thermal_interrupt(void)
 {
 	__u64 msr_val;
-	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);

@@ -326,19 +325,19 @@ static void intel_thermal_interrupt(void
 				CORE_LEVEL) != 0)
 		mce_log_therm_throt_event(CORE_THROTTLED | msr_val);

-	if (cpu_has(c, X86_FEATURE_PLN))
+	if (this_cpu_has(X86_FEATURE_PLN))
 		if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
 					POWER_LIMIT_EVENT,
 					CORE_LEVEL) != 0)
 			mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);

-	if (cpu_has(c, X86_FEATURE_PTS)) {
+	if (this_cpu_has(X86_FEATURE_PTS)) {
 		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
 		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
 					THERMAL_THROTTLING_EVENT,
 					PACKAGE_LEVEL) != 0)
 			mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
-		if (cpu_has(c, X86_FEATURE_PLN))
+		if (this_cpu_has(X86_FEATURE_PLN))
 			if (therm_throt_process(msr_val &
 					PACKAGE_THERM_STATUS_POWER_LIMIT,
 					POWER_LIMIT_EVENT,


^ permalink raw reply	[flat|nested] 49+ messages in thread

* x86: udelay: Use this_cpu_read to avoid address calculation
  2010-12-16 16:14                       ` Tejun Heo
  2010-12-16 18:13                         ` x86: Use this_cpu_has for thermal_interrupt Christoph Lameter
@ 2010-12-16 18:14                         ` Christoph Lameter
  2010-12-16 18:15                         ` gameport: use this_cpu_read instead of lookup Christoph Lameter
                                           ` (3 subsequent siblings)
  5 siblings, 0 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-16 18:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

The code will use a segment prefix instead of doing the lookup and calculation.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/lib/delay.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6/arch/x86/lib/delay.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/delay.c	2010-12-16 11:45:12.000000000 -0600
+++ linux-2.6/arch/x86/lib/delay.c	2010-12-16 11:45:37.000000000 -0600
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long
 	asm("mull %%edx"
 		:"=d" (xloops), "=&a" (d0)
 		:"1" (xloops), "0"
-		(cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+		(this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));

 	__delay(++xloops);
 }


^ permalink raw reply	[flat|nested] 49+ messages in thread

* gameport: use this_cpu_read instead of lookup
  2010-12-16 16:14                       ` Tejun Heo
  2010-12-16 18:13                         ` x86: Use this_cpu_has for thermal_interrupt Christoph Lameter
  2010-12-16 18:14                         ` x86: udelay: Use this_cpu_read to avoid address calculation Christoph Lameter
@ 2010-12-16 18:15                         ` Christoph Lameter
  2010-12-18 15:34                           ` Tejun Heo
  2010-12-16 18:16                         ` acpi throttling: Use this_cpu_has and simplify code Christoph Lameter
                                           ` (2 subsequent siblings)
  5 siblings, 1 reply; 49+ messages in thread
From: Christoph Lameter @ 2010-12-16 18:15 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers



Signed-off-by: Christoph Lameter <cl@linux.com>

---
 drivers/input/gameport/gameport.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6/drivers/input/gameport/gameport.c
===================================================================
--- linux-2.6.orig/drivers/input/gameport/gameport.c	2010-12-16 11:46:18.000000000 -0600
+++ linux-2.6/drivers/input/gameport/gameport.c	2010-12-16 11:46:56.000000000 -0600
@@ -123,7 +123,7 @@ static int gameport_measure_speed(struct
 	}

 	gameport_close(gameport);
-	return (cpu_data(raw_smp_processor_id()).loops_per_jiffy *
+	return (this_cpu_read(cpu_info.loops_per_jiffy) *
 		(unsigned long)HZ / (1000 / 50)) / (tx < 1 ? 1 : tx);

 #else


^ permalink raw reply	[flat|nested] 49+ messages in thread

* acpi throttling: Use this_cpu_has and simplify code
  2010-12-16 16:14                       ` Tejun Heo
                                           ` (2 preceding siblings ...)
  2010-12-16 18:15                         ` gameport: use this_cpu_read instead of lookup Christoph Lameter
@ 2010-12-16 18:16                         ` Christoph Lameter
  2010-12-18 15:50                           ` Tejun Heo
  2010-12-21  4:28                           ` Len Brown
  2010-12-16 18:19                         ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics H. Peter Anvin
  2010-12-16 20:42                         ` H. Peter Anvin
  5 siblings, 2 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-16 18:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers


With the this_cpu_xx we no longer need to pass an acpi
structure to the msr management code. Simplifies code and improves
performance.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 drivers/acpi/processor_throttling.c |   32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

Index: linux-2.6/drivers/acpi/processor_throttling.c
===================================================================
--- linux-2.6.orig/drivers/acpi/processor_throttling.c	2010-12-16 11:56:57.000000000 -0600
+++ linux-2.6/drivers/acpi/processor_throttling.c	2010-12-16 12:00:17.000000000 -0600
@@ -662,20 +662,14 @@ static int acpi_processor_get_throttling
 }

 #ifdef CONFIG_X86
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
-					u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
 {
-	struct cpuinfo_x86 *c;
 	u64 msr_high, msr_low;
-	unsigned int cpu;
 	u64 msr = 0;
 	int ret = -1;

-	cpu = pr->id;
-	c = &cpu_data(cpu);
-
-	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
-		!cpu_has(c, X86_FEATURE_ACPI)) {
+	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+		!this_cpu_has(X86_FEATURE_ACPI)) {
 		printk(KERN_ERR PREFIX
 			"HARDWARE addr space,NOT supported yet\n");
 	} else {
@@ -690,18 +684,13 @@ static int acpi_throttling_rdmsr(struct
 	return ret;
 }

-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
 {
-	struct cpuinfo_x86 *c;
-	unsigned int cpu;
 	int ret = -1;
 	u64 msr;

-	cpu = pr->id;
-	c = &cpu_data(cpu);
-
-	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
-		!cpu_has(c, X86_FEATURE_ACPI)) {
+	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
+		!this_cpu_has(X86_FEATURE_ACPI)) {
 		printk(KERN_ERR PREFIX
 			"HARDWARE addr space,NOT supported yet\n");
 	} else {
@@ -713,15 +702,14 @@ static int acpi_throttling_wrmsr(struct
 	return ret;
 }
 #else
-static int acpi_throttling_rdmsr(struct acpi_processor *pr,
-				u64 *value)
+static int acpi_throttling_rdmsr(u64 *value)
 {
 	printk(KERN_ERR PREFIX
 		"HARDWARE addr space,NOT supported yet\n");
 	return -1;
 }

-static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
+static int acpi_throttling_wrmsr(u64 value)
 {
 	printk(KERN_ERR PREFIX
 		"HARDWARE addr space,NOT supported yet\n");
@@ -753,7 +741,7 @@ static int acpi_read_throttling_status(s
 		ret = 0;
 		break;
 	case ACPI_ADR_SPACE_FIXED_HARDWARE:
-		ret = acpi_throttling_rdmsr(pr, value);
+		ret = acpi_throttling_rdmsr(value);
 		break;
 	default:
 		printk(KERN_ERR PREFIX "Unknown addr space %d\n",
@@ -786,7 +774,7 @@ static int acpi_write_throttling_state(s
 		ret = 0;
 		break;
 	case ACPI_ADR_SPACE_FIXED_HARDWARE:
-		ret = acpi_throttling_wrmsr(pr, value);
+		ret = acpi_throttling_wrmsr(value);
 		break;
 	default:
 		printk(KERN_ERR PREFIX "Unknown addr space %d\n",


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-16 16:14                       ` Tejun Heo
                                           ` (3 preceding siblings ...)
  2010-12-16 18:16                         ` acpi throttling: Use this_cpu_has and simplify code Christoph Lameter
@ 2010-12-16 18:19                         ` H. Peter Anvin
  2010-12-16 18:55                           ` Tejun Heo
  2010-12-16 20:42                         ` H. Peter Anvin
  5 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-16 18:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/16/2010 08:14 AM, Tejun Heo wrote:
> 
> Are you okay with the patches currently in percpu#for-next?  If so,
> I'll regenerate patches with your acked-by and pop the two previously
> mentioned commits and proceed with the rest of the series.
> 
> Thank you.
> 

Still looking through them (sorry.)  I note that we probably do need to
get Christoph's followup patches into -tip, so we need to get it all
into tip; as such, even if it goes through your tree I'll need to pull
it into a tip branch.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-16 18:19                         ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics H. Peter Anvin
@ 2010-12-16 18:55                           ` Tejun Heo
  0 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-16 18:55 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

Hello,

On 12/16/2010 07:19 PM, H. Peter Anvin wrote:
> On 12/16/2010 08:14 AM, Tejun Heo wrote:
>>
>> Are you okay with the patches currently in percpu#for-next?  If so,
>> I'll regenerate patches with your acked-by and pop the two previously
>> mentioned commits and proceed with the rest of the series.
> 
> Still looking through them (sorry.)  I note that we probably do need to
> get Christoph's followup patches into -tip, so we need to get it all
> into tip; as such, even if it goes through your tree I'll need to pull
> it into a tip branch.

Yeah, no problem.  Pekka also wants to pull the essential part into
the memory allocator tree, so I think it would be best to keep at
least the proper percpu part and the x86 specific ops in the percpu
tree, though.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics
  2010-12-16 16:14                       ` Tejun Heo
                                           ` (4 preceding siblings ...)
  2010-12-16 18:19                         ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics H. Peter Anvin
@ 2010-12-16 20:42                         ` H. Peter Anvin
  5 siblings, 0 replies; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-16 20:42 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/16/2010 08:14 AM, Tejun Heo wrote:
> Hey, again.
> 
> On 12/15/2010 05:39 PM, Tejun Heo wrote:
>> I'd prefer percpu things going through percpu tree, if for nothing
>> else for git history's sake, but I don't think it really matters.  The
>> series is spread all over the place anyway.  As long as each
>> maintainer is properly alerted about the changes, it should be okay.
>> Please let me know whether you agree with the changes currently queued
>> in percpu#for-next.  I'll update the tree with your Acked-by's and
>> freeze it.
> 
> Are you okay with the patches currently in percpu#for-next?  If so,
> I'll regenerate patches with your acked-by and pop the two previously
> mentioned commits and proceed with the rest of the series.
> 

Just finished reviewing the patches in percpu#for-next.

Acked-by: H. Peter Anvin <hpa@zytor.com>

Look good.  Let me know when you have a baseline I can pull into a tip
branch so we can build the rest of the patches on top.

	-hpa

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support
  2010-12-14 16:28 ` [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
@ 2010-12-17 14:55   ` Tejun Heo
  0 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-17 14:55 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

Hello, Christoph.

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> Index: linux-2.6/include/linux/percpu.h
> ===================================================================
> --- linux-2.6.orig/include/linux/percpu.h	2010-12-08 13:16:22.000000000 -0600
> +++ linux-2.6/include/linux/percpu.h	2010-12-08 14:43:46.000000000 -0600
> @@ -242,21 +242,21 @@ extern void __bad_size_call_parameter(vo
>  
>  #define __pcpu_size_call_return2(stem, pcp, ...)			\
>  ({									\
> -	typeof(pcp) ret__;						\
> +	typeof(pcp) pscr2_ret__;					\
>  	__verify_pcpu_ptr(&(pcp));					\
>  	switch(sizeof(pcp)) {						\
> -	case 1: ret__ = stem##1(pcp, __VA_ARGS__);			\
> +	case 1: pscr2_ret__ = stem##1(pcp, __VA_ARGS__);		\
>  		break;							\
> -	case 2: ret__ = stem##2(pcp, __VA_ARGS__);			\
> +	case 2: pscr2_ret__ = stem##2(pcp, __VA_ARGS__);		\
>  		break;							\
> -	case 4: ret__ = stem##4(pcp, __VA_ARGS__);			\
> +	case 4: pscr2_ret__ = stem##4(pcp, __VA_ARGS__);		\
>  		break;							\
> -	case 8: ret__ = stem##8(pcp, __VA_ARGS__);			\
> +	case 8: pscr2_ret__ = stem##8(pcp, __VA_ARGS__);		\
>  		break;							\
>  	default:							\
>  		__bad_size_call_parameter();break;			\
>  	}								\
> -	ret__;								\
> +	pscr2_ret__;							\
>  })

This chunk doesn't belong here.  It's the change I've made while
applying your earlier patch.  Dropping this part.

Relocated the xchg and cmpxchg ops so that they're first grouped by
preemption safety and placed after this_cpu_add_return() and
friends.

>   * IRQ safe versions of the per cpu RMW operations. Note that these operations
>   * are *not* safe against modification of the same variable from another
>   * processors (which one gets when using regular atomic operations)
> - . They are guaranteed to be atomic vs. local interrupts and
> + * They are guaranteed to be atomic vs. local interrupts and

Noted this in the patch description.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations
  2010-12-14 16:28 ` [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
@ 2010-12-17 15:22   ` Tejun Heo
  0 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-17 15:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, H. Peter Anvin,
	Mathieu Desnoyers

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> Provide support as far as the hardware capabilities of the x86 cpus
> allow.
> 
> Define CONFIG_CMPXCHG_LOCAL in Kconfig.cpu to allow core code to test for
> fast cpuops implementations.
> 
> V1->V2:
> 	- Take out the definition for this_cpu_cmpxchg_8 and move it into
> 	  a separate patch.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

I reordered the ops so that they follow the organization better.  I'll
soon post the updated patch.  Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics
  2010-12-14 16:28 ` [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
  2010-12-15 16:32   ` Tejun Heo
@ 2010-12-18 15:32   ` Tejun Heo
  1 sibling, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-18 15:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Peter Zijlstra, Pekka Enberg, linux-kernel, Eric Dumazet,
	H. Peter Anvin, Mathieu Desnoyers

On 12/14/2010 05:28 PM, Christoph Lameter wrote:
> The irq work queue is a per cpu object and it is sufficient for
> synchronization if per cpu atomics are used. Doing so simplifies
> the code and reduces the overhead of the code.
> 
> Before:
> 
> christoph@linux-2.6$ size kernel/irq_work.o
>    text	   data	    bss	    dec	    hex	filename
>     451	      8	      1	    460	    1cc	kernel/irq_work.o
> 
> After:
> 
> christoph@linux-2.6$ size kernel/irq_work.o 
>    text	   data	    bss	    dec	    hex	filename
>     438	      8	      1	    447	    1bf	kernel/irq_work.o
> 
> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Signed-off-by: Christoph Lameter <cl@linux.com>

Applied 3 and 4 to percpu#for-2.6.38.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: gameport: use this_cpu_read instead of lookup
  2010-12-16 18:15                         ` gameport: use this_cpu_read instead of lookup Christoph Lameter
@ 2010-12-18 15:34                           ` Tejun Heo
  0 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-18 15:34 UTC (permalink / raw)
  To: Christoph Lameter, H. Peter Anvin
  Cc: Eric Dumazet, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On 12/16/2010 07:15 PM, Christoph Lameter wrote:
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

Acked-by: Tejun Heo <tj@kernel.org>

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: x86: Use this_cpu_has for thermal_interrupt
  2010-12-16 18:13                         ` x86: Use this_cpu_has for thermal_interrupt Christoph Lameter
@ 2010-12-18 15:35                           ` Tejun Heo
  2010-12-21  0:56                             ` H. Peter Anvin
  0 siblings, 1 reply; 49+ messages in thread
From: Tejun Heo @ 2010-12-18 15:35 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers

On 12/16/2010 07:13 PM, Christoph Lameter wrote:
> It is more effective to use a segment prefix instead of calculating the
> address of the current cpu area and then testing flags.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

For this and the other x86 patch.

Acked-by: Tejun Heo <tj@kernel.org>

I suppose these two x86 patches and the gameport one would go through
the x86 tree, right?

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: acpi throttling: Use this_cpu_has and simplify code
  2010-12-16 18:16                         ` acpi throttling: Use this_cpu_has and simplify code Christoph Lameter
@ 2010-12-18 15:50                           ` Tejun Heo
  2010-12-21  1:52                             ` ykzhao
  2010-12-21 22:43                             ` Christoph Lameter
  2010-12-21  4:28                           ` Len Brown
  1 sibling, 2 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-18 15:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers, rui.zhang, lenb, linux-acpi

(cc'ing ACPI ppl and quoting the whole body)

On 12/16/2010 07:16 PM, Christoph Lameter wrote:
> 
> With the this_cpu_xx we no longer need to pass an acpi
> structure to the msr management code. Simplifies code and improves
> performance.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  drivers/acpi/processor_throttling.c |   32 ++++++++++----------------------
>  1 file changed, 10 insertions(+), 22 deletions(-)
> 
> Index: linux-2.6/drivers/acpi/processor_throttling.c
> ===================================================================
> --- linux-2.6.orig/drivers/acpi/processor_throttling.c	2010-12-16 11:56:57.000000000 -0600
> +++ linux-2.6/drivers/acpi/processor_throttling.c	2010-12-16 12:00:17.000000000 -0600
> @@ -662,20 +662,14 @@ static int acpi_processor_get_throttling
>  }
> 
>  #ifdef CONFIG_X86
> -static int acpi_throttling_rdmsr(struct acpi_processor *pr,
> -					u64 *value)
> +static int acpi_throttling_rdmsr(u64 *value)
>  {
> -	struct cpuinfo_x86 *c;
>  	u64 msr_high, msr_low;
> -	unsigned int cpu;
>  	u64 msr = 0;
>  	int ret = -1;
> 
> -	cpu = pr->id;
> -	c = &cpu_data(cpu);
> -
> -	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
> -		!cpu_has(c, X86_FEATURE_ACPI)) {
> +	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
> +		!this_cpu_has(X86_FEATURE_ACPI)) {
>  		printk(KERN_ERR PREFIX
>  			"HARDWARE addr space,NOT supported yet\n");
>  	} else {
> @@ -690,18 +684,13 @@ static int acpi_throttling_rdmsr(struct
>  	return ret;
>  }
> 
> -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
> +static int acpi_throttling_wrmsr(u64 value)
>  {
> -	struct cpuinfo_x86 *c;
> -	unsigned int cpu;
>  	int ret = -1;
>  	u64 msr;
> 
> -	cpu = pr->id;
> -	c = &cpu_data(cpu);
> -
> -	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
> -		!cpu_has(c, X86_FEATURE_ACPI)) {
> +	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
> +		!this_cpu_has(X86_FEATURE_ACPI)) {
>  		printk(KERN_ERR PREFIX
>  			"HARDWARE addr space,NOT supported yet\n");
>  	} else {
> @@ -713,15 +702,14 @@ static int acpi_throttling_wrmsr(struct
>  	return ret;
>  }
>  #else
> -static int acpi_throttling_rdmsr(struct acpi_processor *pr,
> -				u64 *value)
> +static int acpi_throttling_rdmsr(u64 *value)
>  {
>  	printk(KERN_ERR PREFIX
>  		"HARDWARE addr space,NOT supported yet\n");
>  	return -1;
>  }
> 
> -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
> +static int acpi_throttling_wrmsr(u64 value)
>  {
>  	printk(KERN_ERR PREFIX
>  		"HARDWARE addr space,NOT supported yet\n");
> @@ -753,7 +741,7 @@ static int acpi_read_throttling_status(s
>  		ret = 0;
>  		break;
>  	case ACPI_ADR_SPACE_FIXED_HARDWARE:
> -		ret = acpi_throttling_rdmsr(pr, value);
> +		ret = acpi_throttling_rdmsr(value);
>  		break;
>  	default:
>  		printk(KERN_ERR PREFIX "Unknown addr space %d\n",
> @@ -786,7 +774,7 @@ static int acpi_write_throttling_state(s
>  		ret = 0;
>  		break;
>  	case ACPI_ADR_SPACE_FIXED_HARDWARE:
> -		ret = acpi_throttling_wrmsr(pr, value);
> +		ret = acpi_throttling_wrmsr(value);
>  		break;
>  	default:
>  		printk(KERN_ERR PREFIX "Unknown addr space %d\n",
> 

It's bothersome that these methods don't have any indication that
they're bound to the local CPU when they can't be called with @pr for
another CPU, as MSRs can only be accessed from the local CPU.

In the longer run, it would be nice if there's an indication that this
is only for the local CPU and maybe a WARN_ON_ONCE().  Maybe dropping
@pr and using this_cpu_*() is better for performance too?
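
A minimal sketch of such an indication (illustration only, not a
proposed patch; the wrapper name is invented here):

static int acpi_throttling_rdmsr_local(struct acpi_processor *pr,
				       u64 *value)
{
	/* the MSR access below is only valid on pr->id itself */
	WARN_ON_ONCE(pr->id != raw_smp_processor_id());
	return acpi_throttling_rdmsr(value);
}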

Anyways, the above doesn't make the situation any worse, so...

 Acked-by: Tejun Heo <tj@kernel.org>

I think this one too fits the x86 tree better.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: x86: Use this_cpu_has for thermal_interrupt
  2010-12-18 15:35                           ` Tejun Heo
@ 2010-12-21  0:56                             ` H. Peter Anvin
  2010-12-30 11:29                               ` Tejun Heo
  0 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-21  0:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/18/2010 07:35 AM, Tejun Heo wrote:
> On 12/16/2010 07:13 PM, Christoph Lameter wrote:
>> It is more effective to use a segment prefix instead of calculating the
>> address of the current cpu area and then testing flags.
>>
>> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> For this and the other x86 patch.
> 
> Acked-by: Tejun Heo <tj@kernel.org>
> 
> I suppose these two x86 patches and the gameport one would go through
> the x86 tree, right?
> 

Yes, as long as I can get a stable base to pull into -tip.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: acpi throttling: Use this_cpu_has and simplify code
  2010-12-18 15:50                           ` Tejun Heo
@ 2010-12-21  1:52                             ` ykzhao
  2010-12-21 22:43                             ` Christoph Lameter
  1 sibling, 0 replies; 49+ messages in thread
From: ykzhao @ 2010-12-21  1:52 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, H. Peter Anvin, Eric Dumazet, akpm,
	Pekka Enberg, linux-kernel, Mathieu Desnoyers, Zhang, Rui, lenb,
	linux-acpi

On Sat, 2010-12-18 at 23:50 +0800, Tejun Heo wrote:
> (cc'ing ACPI ppl and quoting the whole body)
> 
> On 12/16/2010 07:16 PM, Christoph Lameter wrote:
> > 
> > With the this_cpu_xx we no longer need to pass an acpi
> > structure to the msr management code. Simplifies code and improves
> > performance.
> > 
> > Signed-off-by: Christoph Lameter <cl@linux.com>
> > 
> > ---
> >  drivers/acpi/processor_throttling.c |   32 ++++++++++----------------------
> >  1 file changed, 10 insertions(+), 22 deletions(-)
> > 
> > Index: linux-2.6/drivers/acpi/processor_throttling.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/acpi/processor_throttling.c	2010-12-16 11:56:57.000000000 -0600
> > +++ linux-2.6/drivers/acpi/processor_throttling.c	2010-12-16 12:00:17.000000000 -0600
> > @@ -662,20 +662,14 @@ static int acpi_processor_get_throttling
> >  }
> > 
> >  #ifdef CONFIG_X86
> > -static int acpi_throttling_rdmsr(struct acpi_processor *pr,
> > -					u64 *value)
> > +static int acpi_throttling_rdmsr(u64 *value)
> >  {
> > -	struct cpuinfo_x86 *c;
> >  	u64 msr_high, msr_low;
> > -	unsigned int cpu;
> >  	u64 msr = 0;
> >  	int ret = -1;
> > 
> > -	cpu = pr->id;
> > -	c = &cpu_data(cpu);
> > -
> > -	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
> > -		!cpu_has(c, X86_FEATURE_ACPI)) {
> > +	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
> > +		!this_cpu_has(X86_FEATURE_ACPI)) {
> >  		printk(KERN_ERR PREFIX
> >  			"HARDWARE addr space,NOT supported yet\n");
> >  	} else {
> > @@ -690,18 +684,13 @@ static int acpi_throttling_rdmsr(struct
> >  	return ret;
> >  }
> > 
> > -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
> > +static int acpi_throttling_wrmsr(u64 value)
> >  {
> > -	struct cpuinfo_x86 *c;
> > -	unsigned int cpu;
> >  	int ret = -1;
> >  	u64 msr;
> > 
> > -	cpu = pr->id;
> > -	c = &cpu_data(cpu);
> > -
> > -	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
> > -		!cpu_has(c, X86_FEATURE_ACPI)) {
> > +	if ((this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_INTEL) ||
> > +		!this_cpu_has(X86_FEATURE_ACPI)) {
> >  		printk(KERN_ERR PREFIX
> >  			"HARDWARE addr space,NOT supported yet\n");
> >  	} else {
> > @@ -713,15 +702,14 @@ static int acpi_throttling_wrmsr(struct
> >  	return ret;
> >  }
> >  #else
> > -static int acpi_throttling_rdmsr(struct acpi_processor *pr,
> > -				u64 *value)
> > +static int acpi_throttling_rdmsr(u64 *value)
> >  {
> >  	printk(KERN_ERR PREFIX
> >  		"HARDWARE addr space,NOT supported yet\n");
> >  	return -1;
> >  }
> > 
> > -static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value)
> > +static int acpi_throttling_wrmsr(u64 value)
> >  {
> >  	printk(KERN_ERR PREFIX
> >  		"HARDWARE addr space,NOT supported yet\n");
> > @@ -753,7 +741,7 @@ static int acpi_read_throttling_status(s
> >  		ret = 0;
> >  		break;
> >  	case ACPI_ADR_SPACE_FIXED_HARDWARE:
> > -		ret = acpi_throttling_rdmsr(pr, value);
> > +		ret = acpi_throttling_rdmsr(value);
> >  		break;
> >  	default:
> >  		printk(KERN_ERR PREFIX "Unknown addr space %d\n",
> > @@ -786,7 +774,7 @@ static int acpi_write_throttling_state(s
> >  		ret = 0;
> >  		break;
> >  	case ACPI_ADR_SPACE_FIXED_HARDWARE:
> > -		ret = acpi_throttling_wrmsr(pr, value);
> > +		ret = acpi_throttling_wrmsr(value);
> >  		break;
> >  	default:
> >  		printk(KERN_ERR PREFIX "Unknown addr space %d\n",
> > 
> 
> It's bothersome that these methods don't have any indication that
> they're bound to the local CPU when they can't be called with @pr for
> another CPU, as MSRs can only be accessed from the local CPU.

The above functions may be called with irqs disabled. In that case, if
the corresponding MSR is accessed using the remote method, it will
trigger an oops. Maybe it would be safer to first switch to the local
CPU and then read/write the MSR register related to the throttling.

> 
> In the longer run, it would be nice if there's an indication that this
> is only for the local CPU and maybe a WARN_ON_ONCE().  Maybe dropping
> @pr and using this_cpu_*() is better for performance too?

It is also OK with me to drop the pr input argument of
acpi_throttling_rdmsr/wrmsr, as we have already switched to the local cpu.

> 
> Anyways, the above doesn't make the situation any worse, so...
> 
>  Acked-by: Tejun Heo <tj@kernel.org>
> 
> I think this one too fits the x86 tree better.
> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: acpi throttling: Use this_cpu_has and simplify code
  2010-12-16 18:16                         ` acpi throttling: Use this_cpu_has and simplify code Christoph Lameter
  2010-12-18 15:50                           ` Tejun Heo
@ 2010-12-21  4:28                           ` Len Brown
  1 sibling, 0 replies; 49+ messages in thread
From: Len Brown @ 2010-12-21  4:28 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

This patch is a valid cleanup, and I've applied it to the acpi-test tree.

The routines named *rdmsr() and *wrmsr() run on the local CPU
by definition.  I think that after this change to get rid of pr*,
that is even more clear.

The part about checking that you are on the right processor
should really be done in the caller.  Yakui should send a patch
for the driver to check for successful return from
set_cpus_allowed_ptr() before invoking this code.
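
A sketch of that caller-side check (illustration only; the function
name is invented and it assumes the processor_throttling.c context
after the cleanup above):

static int demo_throttling_read_on(struct acpi_processor *pr, u64 *value)
{
	int ret;

	/* migrate to the target cpu; bail out if that failed */
	ret = set_cpus_allowed_ptr(current, cpumask_of(pr->id));
	if (ret)
		return ret;

	return acpi_throttling_rdmsr(value);
}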

> With the this_cpu_xx we no longer need to pass an acpi
> structure to the msr management code. Simplifies code and improves
> performance.

Simplifies, yes.
Performance -- not the focus here.
If you are running routines to throttle the clock frequency,
then you have already decided that performance isn't your top priority:-)

thanks,
Len Brown, Intel Open Source Technology Center

> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  drivers/acpi/processor_throttling.c |   32 ++++++++++----------------------
>  1 file changed, 10 insertions(+), 22 deletions(-)


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: acpi throttling: Use this_cpu_has and simplify code
  2010-12-18 15:50                           ` Tejun Heo
  2010-12-21  1:52                             ` ykzhao
@ 2010-12-21 22:43                             ` Christoph Lameter
  1 sibling, 0 replies; 49+ messages in thread
From: Christoph Lameter @ 2010-12-21 22:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers, rui.zhang, lenb, linux-acpi

On Sat, 18 Dec 2010, Tejun Heo wrote:

> It's bothersome that these methods don't have any indication that
> they're bound to the local CPU when they can't be called with @pr for
> another CPU, as MSRs can only be accessed from the local CPU.

Right.

> In the longer run, it would be nice if there's an indication that this
> is only for the local CPU and maybe a WARN_ON_ONCE().  Maybe dropping
> @pr and using this_cpu_*() is better for performance too?

That is precisely the difficulty with many other functions that take
pointers to cpu_info. If we can establish that they are called *only* with
pointers to the current cpu then we can improve the function by dropping
the parameter and using this_cpu ops to access the cpu_info fields.

Using this_cpu ops instead of accessing a field of a struct pointed to by
an array avoids an addition of pointers and replaces it with an operation
that essentially takes a global address with a segment prefix. It reduces
register pressure and avoids the addition of pointers, but it will encode
the address in the instruction.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: x86: Use this_cpu_has for thermal_interrupt
  2010-12-21  0:56                             ` H. Peter Anvin
@ 2010-12-30 11:29                               ` Tejun Heo
  2010-12-30 18:19                                 ` H. Peter Anvin
  0 siblings, 1 reply; 49+ messages in thread
From: Tejun Heo @ 2010-12-30 11:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On Mon, Dec 20, 2010 at 04:56:19PM -0800, H. Peter Anvin wrote:
> On 12/18/2010 07:35 AM, Tejun Heo wrote:
> > On 12/16/2010 07:13 PM, Christoph Lameter wrote:
> >> It is more effective to use a segment prefix instead of calculating the
> >> address of the current cpu area and then testing flags.
> >>
> >> Signed-off-by: Christoph Lameter <cl@linux.com>
> > 
> > For this and the other x86 patch.
> > 
> > Acked-by: Tejun Heo <tj@kernel.org>
> > 
> > I suppose these two x86 patches and the gameport one would go through
> > the x86 tree, right?
> > 
> 
> Yes, as long as I can get a stable base to pull into -tip.

I suppose I should take the two x86 and the gameport patches here into
percpu tree with your ACKs too.  Would that be okay with you?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: x86: Use this_cpu_has for thermal_interrupt
  2010-12-30 11:29                               ` Tejun Heo
@ 2010-12-30 18:19                                 ` H. Peter Anvin
  2010-12-31 12:43                                   ` Tejun Heo
  0 siblings, 1 reply; 49+ messages in thread
From: H. Peter Anvin @ 2010-12-30 18:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On 12/30/2010 03:29 AM, Tejun Heo wrote:
> On Mon, Dec 20, 2010 at 04:56:19PM -0800, H. Peter Anvin wrote:
>> On 12/18/2010 07:35 AM, Tejun Heo wrote:
>>> On 12/16/2010 07:13 PM, Christoph Lameter wrote:
>>>> It is more effective to use a segment prefix instead of calculating the
>>>> address of the current cpu area and then testing flags.
>>>>
>>>> Signed-off-by: Christoph Lameter <cl@linux.com>
>>>
>>> For this and the other x86 patch.
>>>
>>> Acked-by: Tejun Heo <tj@kernel.org>
>>>
>>> I suppose these two x86 patches and the gameport one would go through
>>> the x86 tree, right?
>>>
>>
>> Yes, as long as I can get a stable base to pull into -tip.
> 
> I suppose I should take the two x86 and the gameport patches here into
> percpu tree with your ACKs too.  Would that be okay with you?
> 

Yes, please.

Acked-by: H. Peter Anvin <hpa@zytor.com>

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 49+ messages in thread

* Re: x86: Use this_cpu_has for thermal_interrupt
  2010-12-30 18:19                                 ` H. Peter Anvin
@ 2010-12-31 12:43                                   ` Tejun Heo
  0 siblings, 0 replies; 49+ messages in thread
From: Tejun Heo @ 2010-12-31 12:43 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Christoph Lameter, Eric Dumazet, akpm, Pekka Enberg,
	linux-kernel, Mathieu Desnoyers

On Thu, Dec 30, 2010 at 10:19:52AM -0800, H. Peter Anvin wrote:
> Acked-by: H. Peter Anvin <hpa@zytor.com>

Three patches committed to percpu#for-2.6.38.  Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 49+ messages in thread

end of thread, other threads:[~2010-12-31 12:43 UTC | newest]

Thread overview: 49+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-12-14 16:28 [cpuops cmpxchg V2 0/5] Cmpxchg and xchg operations Christoph Lameter
2010-12-14 16:28 ` [cpuops cmpxchg V2 1/5] percpu: Generic this_cpu_cmpxchg() and this_cpu_xchg support Christoph Lameter
2010-12-17 14:55   ` Tejun Heo
2010-12-14 16:28 ` [cpuops cmpxchg V2 2/5] x86: this_cpu_cmpxchg and this_cpu_xchg operations Christoph Lameter
2010-12-17 15:22   ` Tejun Heo
2010-12-14 16:28 ` [cpuops cmpxchg V2 3/5] irq_work: Use per cpu atomics instead of regular atomics Christoph Lameter
2010-12-15 16:32   ` Tejun Heo
2010-12-15 16:34     ` H. Peter Anvin
2010-12-15 16:50     ` Peter Zijlstra
2010-12-15 17:04       ` Christoph Lameter
2010-12-15 17:18         ` Peter Zijlstra
2010-12-15 17:31           ` H. Peter Anvin
2010-12-15 17:32           ` Christoph Lameter
2010-12-18 15:32   ` Tejun Heo
2010-12-14 16:28 ` [cpuops cmpxchg V2 4/5] vmstat: User per cpu atomics to avoid interrupt disable / enable Christoph Lameter
2010-12-15 16:45   ` Tejun Heo
2010-12-15 17:01     ` Christoph Lameter
2010-12-14 16:28 ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics Christoph Lameter
2010-12-14 16:35   ` Mathieu Desnoyers
2010-12-14 16:44   ` Eric Dumazet
2010-12-14 16:55     ` Christoph Lameter
2010-12-14 17:00       ` H. Peter Anvin
2010-12-14 17:19         ` Christoph Lameter
2010-12-14 17:22           ` H. Peter Anvin
2010-12-14 17:29             ` Tejun Heo
2010-12-14 17:35               ` Christoph Lameter
2010-12-15  1:06               ` H. Peter Anvin
2010-12-15 16:29                 ` Tejun Heo
2010-12-15 16:35                   ` H. Peter Anvin
2010-12-15 16:39                     ` Tejun Heo
2010-12-16 16:14                       ` Tejun Heo
2010-12-16 18:13                         ` x86: Use this_cpu_has for thermal_interrupt Christoph Lameter
2010-12-18 15:35                           ` Tejun Heo
2010-12-21  0:56                             ` H. Peter Anvin
2010-12-30 11:29                               ` Tejun Heo
2010-12-30 18:19                                 ` H. Peter Anvin
2010-12-31 12:43                                   ` Tejun Heo
2010-12-16 18:14                         ` x86: udelay: Use this_cpu_read to avoid address calculation Christoph Lameter
2010-12-16 18:15                         ` gameport: use this_cpu_read instead of lookup Christoph Lameter
2010-12-18 15:34                           ` Tejun Heo
2010-12-16 18:16                         ` acpi throttling: Use this_cpu_has and simplify code Christoph Lameter
2010-12-18 15:50                           ` Tejun Heo
2010-12-21  1:52                             ` ykzhao
2010-12-21 22:43                             ` Christoph Lameter
2010-12-21  4:28                           ` Len Brown
2010-12-16 18:19                         ` [cpuops cmpxchg V2 5/5] cpuops: Use cmpxchg for xchg to avoid lock semantics H. Peter Anvin
2010-12-16 18:55                           ` Tejun Heo
2010-12-16 20:42                         ` H. Peter Anvin
2010-12-15 16:47   ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).