* [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops
@ 2010-11-23 23:51 Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
                   ` (9 more replies)
  0 siblings, 10 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

A patchset that adds more this_cpu operations, in particular RMW operations
that can be used in various places to avoid address calculations and
memory accesses.

It also adds this_cpu_cmpxchg_double(), which is a locally atomic version of
cmpxchg, and uses it to demonstrate how a lockless, preemptless fastpath for
memory allocation could work. It works well enough that I am writing this
email with that fastpath scheme in place.
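
For illustration only (nothing below is part of the patchset; the variable
and function names are invented), per cpu statistics code could use the
operations roughly like this:

	static DEFINE_PER_CPU(unsigned long, nr_events);

	static void count_event(void)
	{
		/* one RMW instruction on x86, no per cpu address calculation */
		this_cpu_inc(nr_events);
	}

	static unsigned long count_event_and_report(void)
	{
		/* added by this series: returns the value after the increment */
		return this_cpu_inc_return(nr_events);
	}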



* [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options.
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-24  7:07   ` Pekka Enberg
  2010-11-26 15:43   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations Christoph Lameter
                   ` (8 subsequent siblings)
  9 siblings, 2 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: percpu_fixes --]
[-- Type: text/plain, Size: 4149 bytes --]

The this_cpu_* operations can be used to optimize __percpu_counter_add a bit.
This avoids some address arithmetic and saves 12 bytes of code.

Before:


00000000000001d3 <__percpu_counter_add>:
 1d3:	55                   	push   %rbp
 1d4:	48 89 e5             	mov    %rsp,%rbp
 1d7:	41 55                	push   %r13
 1d9:	41 54                	push   %r12
 1db:	53                   	push   %rbx
 1dc:	48 89 fb             	mov    %rdi,%rbx
 1df:	48 83 ec 08          	sub    $0x8,%rsp
 1e3:	4c 8b 67 30          	mov    0x30(%rdi),%r12
 1e7:	65 4c 03 24 25 00 00 	add    %gs:0x0,%r12
 1ee:	00 00 
 1f0:	4d 63 2c 24          	movslq (%r12),%r13
 1f4:	48 63 c2             	movslq %edx,%rax
 1f7:	49 01 f5             	add    %rsi,%r13
 1fa:	49 39 c5             	cmp    %rax,%r13
 1fd:	7d 0a                	jge    209 <__percpu_counter_add+0x36>
 1ff:	f7 da                	neg    %edx
 201:	48 63 d2             	movslq %edx,%rdx
 204:	49 39 d5             	cmp    %rdx,%r13
 207:	7f 1e                	jg     227 <__percpu_counter_add+0x54>
 209:	48 89 df             	mov    %rbx,%rdi
 20c:	e8 00 00 00 00       	callq  211 <__percpu_counter_add+0x3e>
 211:	4c 01 6b 18          	add    %r13,0x18(%rbx)
 215:	48 89 df             	mov    %rbx,%rdi
 218:	41 c7 04 24 00 00 00 	movl   $0x0,(%r12)
 21f:	00 
 220:	e8 00 00 00 00       	callq  225 <__percpu_counter_add+0x52>
 225:	eb 04                	jmp    22b <__percpu_counter_add+0x58>
 227:	45 89 2c 24          	mov    %r13d,(%r12)
 22b:	5b                   	pop    %rbx
 22c:	5b                   	pop    %rbx
 22d:	41 5c                	pop    %r12
 22f:	41 5d                	pop    %r13
 231:	c9                   	leaveq 
 232:	c3                   	retq   


After:

00000000000001d3 <__percpu_counter_add>:
 1d3:	55                   	push   %rbp
 1d4:	48 63 ca             	movslq %edx,%rcx
 1d7:	48 89 e5             	mov    %rsp,%rbp
 1da:	41 54                	push   %r12
 1dc:	53                   	push   %rbx
 1dd:	48 89 fb             	mov    %rdi,%rbx
 1e0:	48 8b 47 30          	mov    0x30(%rdi),%rax
 1e4:	65 44 8b 20          	mov    %gs:(%rax),%r12d
 1e8:	4d 63 e4             	movslq %r12d,%r12
 1eb:	49 01 f4             	add    %rsi,%r12
 1ee:	49 39 cc             	cmp    %rcx,%r12
 1f1:	7d 0a                	jge    1fd <__percpu_counter_add+0x2a>
 1f3:	f7 da                	neg    %edx
 1f5:	48 63 d2             	movslq %edx,%rdx
 1f8:	49 39 d4             	cmp    %rdx,%r12
 1fb:	7f 21                	jg     21e <__percpu_counter_add+0x4b>
 1fd:	48 89 df             	mov    %rbx,%rdi
 200:	e8 00 00 00 00       	callq  205 <__percpu_counter_add+0x32>
 205:	4c 01 63 18          	add    %r12,0x18(%rbx)
 209:	48 8b 43 30          	mov    0x30(%rbx),%rax
 20d:	48 89 df             	mov    %rbx,%rdi
 210:	65 c7 00 00 00 00 00 	movl   $0x0,%gs:(%rax)
 217:	e8 00 00 00 00       	callq  21c <__percpu_counter_add+0x49>
 21c:	eb 04                	jmp    222 <__percpu_counter_add+0x4f>
 21e:	65 44 89 20          	mov    %r12d,%gs:(%rax)
 222:	5b                   	pop    %rbx
 223:	41 5c                	pop    %r12
 225:	c9                   	leaveq 
 226:	c3                   	retq   

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 lib/percpu_counter.c |    8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

Index: linux-2.6/lib/percpu_counter.c
===================================================================
--- linux-2.6.orig/lib/percpu_counter.c	2010-11-03 12:25:37.000000000 -0500
+++ linux-2.6/lib/percpu_counter.c	2010-11-03 12:27:27.000000000 -0500
@@ -72,18 +72,16 @@ EXPORT_SYMBOL(percpu_counter_set);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
 {
 	s64 count;
-	s32 *pcount;
 
 	preempt_disable();
-	pcount = this_cpu_ptr(fbc->counters);
-	count = *pcount + amount;
+	count = __this_cpu_read(*fbc->counters) + amount;
 	if (count >= batch || count <= -batch) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
-		*pcount = 0;
+		__this_cpu_write(*fbc->counters, 0);
 		spin_unlock(&fbc->lock);
 	} else {
-		*pcount = count;
+		__this_cpu_write(*fbc->counters, count);
 	}
 	preempt_enable();
 }



* [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-26 16:25   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return Christoph Lameter
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: vmstat_this_cpu --]
[-- Type: text/plain, Size: 2986 bytes --]

this_cpu operations can be used to slightly optimize these functions. The
changes avoid some address calculations and replace them with accesses
through the percpu segment register.

With this_cpu_inc_return and this_cpu_dec_return it would be possible to
optimize inc_zone_page_state and dec_zone_page_state even further.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/vmstat.c |   56 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-11-09 11:18:22.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-11-09 11:19:06.000000000 -0600
@@ -167,18 +167,20 @@ static void refresh_zone_stat_thresholds
 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 				int delta)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
-	s8 *p = pcp->vm_stat_diff + item;
+	struct per_cpu_pageset * __percpu pcp = zone->pageset;
+	s8 * __percpu p = pcp->vm_stat_diff + item;
 	long x;
+	long t;
+
+	x = delta + __this_cpu_read(*p);
 
-	x = delta + *p;
+	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+	if (unlikely(x > t || x < -t)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
-	*p = x;
+	__this_cpu_write(*p, x);
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
@@ -221,16 +223,19 @@ EXPORT_SYMBOL(mod_zone_page_state);
  */
 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
-
-	(*p)++;
+	struct per_cpu_pageset * __percpu pcp = zone->pageset;
+	s8 * __percpu p = pcp->vm_stat_diff + item;
+	int v, t;
+
+	__this_cpu_inc(*p);
+
+	v = __this_cpu_read(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v > t)) {
+		int overstep = t / 2;
 
-	if (unlikely(*p > pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
-
-		zone_page_state_add(*p + overstep, zone, item);
-		*p = -overstep;
+		zone_page_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
 	}
 }
 
@@ -242,16 +247,19 @@ EXPORT_SYMBOL(__inc_zone_page_state);
 
 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-	s8 *p = pcp->vm_stat_diff + item;
-
-	(*p)--;
-
-	if (unlikely(*p < - pcp->stat_threshold)) {
-		int overstep = pcp->stat_threshold / 2;
+	struct per_cpu_pageset * __percpu pcp = zone->pageset;
+	s8 * __percpu p = pcp->vm_stat_diff + item;
+	int v, t;
+
+	__this_cpu_dec(*p);
+
+	v = __this_cpu_read(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v < - t)) {
+		int overstep = t / 2;
 
-		zone_page_state_add(*p - overstep, zone, item);
-		*p = overstep;
+		zone_page_state_add(v - overstep, zone, item);
+		__this_cpu_write(*p, overstep);
 	}
 }
 



* [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-26 16:31   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 04/10] x86: Support " Christoph Lameter
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_add_dec_return --]
[-- Type: text/plain, Size: 3673 bytes --]

Introduce generic support for this_cpu_add_return() and friends.

The fallback implements these operations in terms of the existing
__this_cpu operations.
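
As a usage sketch (the variable and function names are invented for
illustration), the new primitive returns the value after the addition:

	static DEFINE_PER_CPU(int, hypothetical_refcnt);

	static int grab_reference(void)
	{
		/* add 1 to this cpu's instance and return the new value */
		return this_cpu_add_return(hypothetical_refcnt, 1);
	}

The generic fallback simply brackets __this_cpu_add() and __this_cpu_read()
with preempt_disable()/preempt_enable(); an architecture can provide a single
instruction instead (see the x86 xadd based implementation in the next patch).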

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |   70 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-11-23 17:29:46.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-11-23 17:31:14.000000000 -0600
@@ -240,6 +240,20 @@ extern void __bad_size_call_parameter(vo
 	pscr_ret__;							\
 })
 
+#define __pcpu_size_call_return2(stem, variable, ...)			\
+({	typeof(variable) pscr_ret__;					\
+	__verify_pcpu_ptr(&(variable));					\
+	switch(sizeof(variable)) {					\
+	case 1: pscr_ret__ = stem##1(variable, __VA_ARGS__);break;	\
+	case 2: pscr_ret__ = stem##2(variable, __VA_ARGS__);break;	\
+	case 4: pscr_ret__ = stem##4(variable, __VA_ARGS__);break;	\
+	case 8: pscr_ret__ = stem##8(variable, __VA_ARGS__);break;	\
+	default:							\
+		__bad_size_call_parameter();break;			\
+	}								\
+	pscr_ret__;							\
+})
+
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -529,6 +543,62 @@ do {									\
 # define __this_cpu_xor(pcp, val)	__pcpu_size_call(__this_cpu_xor_, (pcp), (val))
 #endif
 
+#define _this_cpu_generic_add_return(pcp, val)				\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	__this_cpu_add((pcp), val);					\
+	ret__ = __this_cpu_read((pcp));					\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_add_return
+# ifndef this_cpu_add_return_1
+#  define this_cpu_add_return_1(pcp, val)	_this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef this_cpu_add_return_2
+#  define this_cpu_add_return_2(pcp, val)	_this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef this_cpu_add_return_4
+#  define this_cpu_add_return_4(pcp, val)	_this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef this_cpu_add_return_8
+#  define this_cpu_add_return_8(pcp, val)	_this_cpu_generic_add_return(pcp, val)
+# endif
+# define this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(this_cpu_add_return_, (pcp), val)
+#endif
+
+#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(val))
+#define this_cpu_inc_return(pcp)	this_cpu_add_return(pcp, 1)
+#define this_cpu_dec_return(pcp)	this_cpu_add_return(pcp, -1)
+
+#define __this_cpu_generic_add_return(pcp, val)				\
+({	typeof(pcp) ret__;						\
+	__this_cpu_add((pcp), val);					\
+	ret__ = __this_cpu_read((pcp));					\
+	ret__;								\
+})
+
+#ifndef __this_cpu_add_return
+# ifndef __this_cpu_add_return_1
+#  define __this_cpu_add_return_1(pcp, val)	__this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef __this_cpu_add_return_2
+#  define __this_cpu_add_return_2(pcp, val)	__this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef __this_cpu_add_return_4
+#  define __this_cpu_add_return_4(pcp, val)	__this_cpu_generic_add_return(pcp, val)
+# endif
+# ifndef __this_cpu_add_return_8
+#  define __this_cpu_add_return_8(pcp, val)	__this_cpu_generic_add_return(pcp, val)
+# endif
+# define __this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(__this_cpu_add_return_, (pcp), val)
+#endif
+
+#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(val))
+#define __this_cpu_inc_return(pcp)	__this_cpu_add_return(pcp, 1)
+#define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
+
 /*
  * IRQ safe versions of the per cpu RMW operations. Note that these operations
  * are *not* safe against modification of the same variable from another



* [thiscpuops upgrade 04/10] x86: Support for this_cpu_add,sub,dec,inc_return
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (2 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-26 16:33   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter Christoph Lameter
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_add_x86 --]
[-- Type: text/plain, Size: 2865 bytes --]

Supply an implementation for x86 in order to generate more efficient code.
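
As a worked example of what the xadd based variant returns (hypothetical
values, assuming a per cpu int currently holding 10 on this cpu):
this_cpu_add_return(x, 5) executes xadd, which stores 15 into the per cpu
location and leaves the old value 10 in the register; the macro then yields
pfo_ret__ + (val) = 10 + 5 = 15, i.e. the value after the addition, matching
the generic fallback.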

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |   50 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-11-23 16:35:19.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-11-23 16:36:05.000000000 -0600
@@ -177,6 +177,45 @@ do {									\
 	}								\
 } while (0)
 
+
+/*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val)					\
+({									\
+	typedef typeof(var) pao_T__;					\
+	typeof(var) pfo_ret__ = val;					\
+	if (0) {							\
+		pao_T__ pao_tmp__;					\
+		pao_tmp__ = (val);					\
+		(void)pao_tmp__;					\
+	}								\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xaddb %0, "__percpu_arg(1)				\
+			    : "+q" (pfo_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xaddw %0, "__percpu_arg(1)				\
+			    : "+r" (pfo_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xaddl %0, "__percpu_arg(1)				\
+			    : "+r"(pfo_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xaddq %0, "__percpu_arg(1)				\
+			    : "+re" (pfo_ret__),  "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	pfo_ret__ + (val);						\
+})
+
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -300,6 +339,14 @@ do {									\
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
 
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op((pcp), val)
+#define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op((pcp), val)
+#define __this_cpu_add_return_4(pcp, val)	percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_1(pcp, val)		percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_2(pcp, val)		percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_4(pcp, val)		percpu_add_return_op((pcp), val)
+#endif
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -324,6 +371,9 @@ do {									\
 #define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
 
+#define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_8(pcp, val)	percpu_add_return_op((pcp), val)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */



* [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (3 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 04/10] x86: Support " Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-26 16:35   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 06/10] vmstat: Use this_cpu_inc_return for vm statistics Christoph Lameter
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_add_nmi --]
[-- Type: text/plain, Size: 859 bytes --]

this_cpu_inc_return() saves us a memory access there.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/kernel/apic/nmi.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

Index: linux-2.6/arch/x86/kernel/apic/nmi.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/apic/nmi.c	2010-11-23 16:35:19.000000000 -0600
+++ linux-2.6/arch/x86/kernel/apic/nmi.c	2010-11-23 16:38:29.000000000 -0600
@@ -432,8 +432,7 @@ nmi_watchdog_tick(struct pt_regs *regs,
 		 * Ayiee, looks like this CPU is stuck ...
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
-		__this_cpu_inc(alert_counter);
-		if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
+		if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz)
 			/*
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */



* [thiscpuops upgrade 06/10] vmstat: Use this_cpu_inc_return for vm statistics
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (4 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 07/10] highmem: Use this_cpu_xx_return() operations Christoph Lameter
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_add_vmstat --]
[-- Type: text/plain, Size: 1041 bytes --]

this_cpu_inc_return() saves us a memory access there. Code
size does not change.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 mm/vmstat.c |    8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2010-11-23 16:35:19.000000000 -0600
+++ linux-2.6/mm/vmstat.c	2010-11-23 16:45:24.000000000 -0600
@@ -227,9 +227,7 @@ void __inc_zone_state(struct zone *zone,
 	s8 * __percpu p = pcp->vm_stat_diff + item;
 	int v, t;
 
-	__this_cpu_inc(*p);
-
-	v = __this_cpu_read(*p);
+	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v > t)) {
 		int overstep = t / 2;
@@ -251,9 +249,7 @@ void __dec_zone_state(struct zone *zone,
 	s8 * __percpu p = pcp->vm_stat_diff + item;
 	int v, t;
 
-	__this_cpu_dec(*p);
-
-	v = __this_cpu_read(*p);
+	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v < - t)) {
 		int overstep = t / 2;



* [thiscpuops upgrade 07/10] highmem: Use this_cpu_xx_return() operations
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (5 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 06/10] vmstat: Use this_cpu_inc_return for vm statistics Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: highmem --]
[-- Type: text/plain, Size: 1304 bytes --]

Use this_cpu operations to optimize access primitives for highmem.

The main effect is the avoidance of address calculations through the
use of a segment prefix.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/highmem.h |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/highmem.h
===================================================================
--- linux-2.6.orig/include/linux/highmem.h	2010-11-22 14:43:40.000000000 -0600
+++ linux-2.6/include/linux/highmem.h	2010-11-22 14:45:02.000000000 -0600
@@ -81,7 +81,8 @@ DECLARE_PER_CPU(int, __kmap_atomic_idx);
 
 static inline int kmap_atomic_idx_push(void)
 {
-	int idx = __get_cpu_var(__kmap_atomic_idx)++;
+	int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
+
 #ifdef CONFIG_DEBUG_HIGHMEM
 	WARN_ON_ONCE(in_irq() && !irqs_disabled());
 	BUG_ON(idx > KM_TYPE_NR);
@@ -91,12 +92,12 @@ static inline int kmap_atomic_idx_push(v
 
 static inline int kmap_atomic_idx(void)
 {
-	return __get_cpu_var(__kmap_atomic_idx) - 1;
+	return __this_cpu_read(__kmap_atomic_idx) - 1;
 }
 
 static inline int kmap_atomic_idx_pop(void)
 {
-	int idx = --__get_cpu_var(__kmap_atomic_idx);
+	int idx = __this_cpu_dec_return(__kmap_atomic_idx);
 #ifdef CONFIG_DEBUG_HIGHMEM
 	BUG_ON(idx < 0);
 #endif



* [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (6 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 07/10] highmem: Use this_cpu_xx_return() operations Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-26 16:51   ` Tejun Heo
  2010-11-23 23:51 ` [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  9 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_cmpxchg --]
[-- Type: text/plain, Size: 8267 bytes --]

Provide generic implementations of this_cpu_cmpxchg() and
this_cpu_cmpxchg_double(); arch code can supply the (locally atomic)
instructions where the hardware allows it.
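
As a usage sketch of the new interfaces (the structure, variable and function
names are invented for illustration and not part of the patch):

	struct pair {
		void *ptr;		/* first word */
		unsigned long seq;	/* adjacent second word of equal size */
	};
	static struct pair __percpu *pp;	/* from alloc_percpu(struct pair) */

	static int publish(void *old_ptr, unsigned long old_seq, void *new_ptr)
	{
		/*
		 * Replace both words, but only if neither changed since
		 * old_ptr/old_seq were read on this cpu. Returns a truth
		 * value. Note that a percpu pointer to the first member is
		 * passed, unlike the scalar lvalue taken by the other
		 * this_cpu operations.
		 */
		return this_cpu_cmpxchg_double(&pp->ptr, old_ptr, old_seq,
					       new_ptr, old_seq + 1);
	}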

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/percpu.h |  179 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2010-11-23 16:49:02.000000000 -0600
+++ linux-2.6/include/linux/percpu.h	2010-11-23 16:50:45.000000000 -0600
@@ -254,6 +254,21 @@ extern void __bad_size_call_parameter(vo
 	pscr_ret__;							\
 })
 
+/* Special handling for cmpxchg_double */
+#define __pcpu_size_call_return_int(stem, pcp, ...)			\
+({	int pscr_ret__;							\
+	__verify_pcpu_ptr(pcp);						\
+	switch(sizeof(*pcp)) {						\
+	case 1: pscr_ret__ = stem##1(pcp, __VA_ARGS__);break;		\
+	case 2: pscr_ret__ = stem##2(pcp, __VA_ARGS__);break;		\
+	case 4: pscr_ret__ = stem##4(pcp, __VA_ARGS__);break;		\
+	case 8: pscr_ret__ = stem##8(pcp, __VA_ARGS__);break;		\
+	default:							\
+		__bad_size_call_parameter();break;			\
+	}								\
+	pscr_ret__;							\
+})
+
 #define __pcpu_size_call(stem, variable, ...)				\
 do {									\
 	__verify_pcpu_ptr(&(variable));					\
@@ -599,6 +614,134 @@ do {									\
 #define __this_cpu_inc_return(pcp)	__this_cpu_add_return(pcp, 1)
 #define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
 
+#define _this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({	typeof(pcp) ret__;						\
+	preempt_disable();						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg
+# ifndef this_cpu_cmpxchg_1
+#  define this_cpu_cmpxchg_1(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_2
+#  define this_cpu_cmpxchg_2(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_4
+#  define this_cpu_cmpxchg_4(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef this_cpu_cmpxchg_8
+#  define this_cpu_cmpxchg_8(pcp, oval, nval)	_this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+#define __this_cpu_generic_cmpxchg(pcp, oval, nval)			\
+({	typeof(pcp) ret__;						\
+	ret__ = __this_cpu_read(pcp);					\
+	if (ret__ == (oval))						\
+		__this_cpu_write(pcp, nval);				\
+	ret__;								\
+})
+
+#ifndef __this_cpu_cmpxchg
+# ifndef __this_cpu_cmpxchg_1
+#  define __this_cpu_cmpxchg_1(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_2
+#  define __this_cpu_cmpxchg_2(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_4
+#  define __this_cpu_cmpxchg_4(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# ifndef __this_cpu_cmpxchg_8
+#  define __this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_generic_cmpxchg(pcp, oval, nval)
+# endif
+# define __this_cpu_cmpxchg(pcp, oval, nval)	__pcpu_size_call_return2(\
+				__this_cpu_cmpxchg_, (pcp), oval, nval)
+#endif
+
+/*
+ * cmpxchg_double replaces two adjacent scalars at once. The first parameter
+ * passed is a percpu pointer, not a scalar like the other this_cpu
+ * operations. This is so because the function operates on two scalars
+ * (must be of same size). A truth value is returned to indicate success or
+ * failure (since a double register result is difficult to handle).
+ * There is very limited hardware support for these operations. So only certain
+ * sizes may work.
+ */
+#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({	typeof(oval2) * __percpu pcp2 = (typeof(oval2) *)((pcp) + 1);	\
+	int __ret = 0;							\
+	if (__this_cpu_read(*pcp) == (oval1) &&				\
+			 __this_cpu_read(*pcp2)  == (oval2)) {		\
+		__this_cpu_write(*pcp, (nval1));			\
+		__this_cpu_write(*pcp2, (nval2));			\
+		__ret = 1;						\
+	}								\
+	(__ret);							\
+})
+
+#ifndef __this_cpu_cmpxchg_double
+# ifndef __this_cpu_cmpxchg_double_1
+#  define __this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_2
+#  define __this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_4
+#  define __this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef __this_cpu_cmpxchg_double_8
+#  define __this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	__this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define __this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(__this_cpu_cmpxchg_double_, (pcp),	\
+					 oval1, oval2, nval1, nval2)
+#endif
+
+#define _this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({	int ret__;							\
+	preempt_disable();						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	preempt_enable();						\
+	ret__;								\
+})
+
+#ifndef this_cpu_cmpxchg_double
+# ifndef this_cpu_cmpxchg_double_1
+#  define this_cpu_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_2
+#  define this_cpu_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_4
+#  define this_cpu_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef this_cpu_cmpxchg_double_8
+#  define this_cpu_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	_this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define this_cpu_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(this_cpu_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
+
+
+
 #define _this_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	preempt_disable();						\
@@ -610,7 +753,7 @@ do {									\
  * IRQ safe versions of the per cpu RMW operations. Note that these operations
  * are *not* safe against modification of the same variable from another
  * processors (which one gets when using regular atomic operations)
- . They are guaranteed to be atomic vs. local interrupts and
+ * They are guaranteed to be atomic vs. local interrupts and
  * preemption only.
  */
 #define irqsafe_cpu_generic_to_op(pcp, val, op)				\
@@ -697,4 +840,38 @@ do {									\
 # define irqsafe_cpu_xor(pcp, val) __pcpu_size_call(irqsafe_cpu_xor_, (val))
 #endif
 
+#define irqsafe_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+({	int ret__;							\
+	unsigned long flags;						\
+	local_irq_save(flags);						\
+	ret__ = __this_cpu_generic_cmpxchg_double(pcp,			\
+			oval1, oval2, nval1, nval2);			\
+	local_irq_restore(flags);					\
+	ret__;								\
+})
+
+#ifndef irqsafe_cmpxchg_double
+# ifndef irqsafe_cmpxchg_double_1
+#  define irqsafe_cmpxchg_double_1(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cmpxchg_double_2
+#  define irqsafe_cmpxchg_double_2(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cmpxchg_double_4
+#  define irqsafe_cmpxchg_double_4(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# ifndef irqsafe_cmpxchg_double_8
+#  define irqsafe_cmpxchg_double_8(pcp, oval1, oval2, nval1, nval2)	\
+	irqsafe_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)
+# endif
+# define irqsafe_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
+	__pcpu_size_call_return_int(irqsafe_cmpxchg_double_, (pcp),	\
+		oval1, oval2, nval1, nval2)
+#endif
+
+
+
 #endif /* __LINUX_PERCPU_H */



* [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (7 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-24  0:41   ` Eric Dumazet
  2010-11-24  0:44   ` Mathieu Desnoyers
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  9 siblings, 2 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: this_cpu_cmpxchg_x86 --]
[-- Type: text/plain, Size: 5059 bytes --]

Provide support as far as the hardware capabilities of the x86 cpus
allow.

Signed-off-by: Christoph Lameter <cl@linux.com>

---
 arch/x86/include/asm/percpu.h |  100 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h	2010-11-23 16:50:58.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h	2010-11-23 16:56:48.000000000 -0600
@@ -216,6 +216,41 @@ do {									\
 	pfo_ret__ + (val);						\
 })
 
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) __ret;						\
+	typeof(var) __old = (oval);					\
+	typeof(var) __new = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "q" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (__ret), "+m" (var)			\
+			    : "r" (__new), "0" (__old)			\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	__ret;								\
+})
+
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -346,7 +381,52 @@ do {									\
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op((pcp), val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op((pcp), val)
 #define this_cpu_add_return_4(pcp, val)		percpu_add_return_op((pcp), val)
+
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+
+#define percpu_cmpxchg8b_double(pcp, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	asm("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
+		    : "=a"(__ret), "=m" (*pcp)				\
+		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define irqsafe_cmpxchg_double_4(pcp, o1, o2, n1, n2)	percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+
+#ifndef CONFIG_X86_64
+
+/* We can support an 8 byte cmpxchg with a special instruction on 32 bit */
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)				\
+({									\
+	typeof(pcp) __ret;						\
+	typeof(pcp) __old = (oval);					\
+	typeof(pcp) __new = (nval);					\
+	asm("cmpxchg8b "__percpu_arg(1)					\
+	    : "=A" (__ret), "+m" (pcp)					\
+	    : "b" ((u32)__new), "c" ((u32)(__new >> 32)), "0" (__old)	\
+	    : "memory");						\
+	__ret;								\
+})
+
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+#define irqsafe_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+
+#endif
 #endif
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -374,6 +454,26 @@ do {									\
 #define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op((pcp), val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op((pcp), val)
 
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op((pcp), oval, nval)
+
+#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2)				\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	asm("cmpxchg16b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
+		    : "=a"(__ret), "=m" (*pcp)				\
+		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define irqsafe_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */



* [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
                   ` (8 preceding siblings ...)
  2010-11-23 23:51 ` [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
@ 2010-11-23 23:51 ` Christoph Lameter
  2010-11-24  0:22   ` Eric Dumazet
                     ` (3 more replies)
  9 siblings, 4 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-23 23:51 UTC (permalink / raw)
  To: akpm
  Cc: Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: slub_generation --]
[-- Type: text/plain, Size: 8592 bytes --]

Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.

A transaction id is a "long" integer composed of an event number and the
cpu number. The event number is incremented for every change to the per cpu
state. This allows the cmpxchg instruction to verify that nothing interfered
with the update, that we are updating the percpu structure of the processor
where we picked up the information, and that we are still running on that
processor when we make the update.
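
As a worked example of the encoding (CONFIG_NR_CPUS = 256 is used here only to
keep the arithmetic readable): cpu 3 starts at tid = init_tid(3) = 3 and
advances 3 -> 259 -> 515 -> ... via next_tid(); tid_to_cpu(259) = 259 % 256 = 3
and tid_to_event(259) = 259 / 256 = 1. A tid picked up on another cpu, or made
stale by an intervening operation on the same cpu, therefore no longer matches
the current value and the cmpxchg fails.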

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adapt the fastpath for realtime kernels, since it is
lockless and does not require that the same per cpu area be used over the
whole critical section. It is only important that the per cpu area is current
at the beginning of the critical section and at the end.

So there is not even a need to disable preemption, which will let allocations
scale well in an RT environment.

[Beware: There have been previous attempts at lockless fastpaths that
did not succeed. We hope to have learned from these experiences but
review certainly is necessary.]

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Christoph Lameter <cl@linux.com>

---
 include/linux/slub_def.h |    3 -
 mm/slub.c                |  128 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 113 insertions(+), 18 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2010-11-23 16:31:06.000000000 -0600
+++ linux-2.6/include/linux/slub_def.h	2010-11-23 16:53:34.000000000 -0600
@@ -36,7 +36,8 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	void **freelist;	/* Pointer to first free per cpu object */
+	void **freelist;		/* Pointer to next available object */
+	unsigned long tid;	/* Globally unique transaction id */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
 #ifdef CONFIG_SLUB_STATS
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-11-23 16:31:06.000000000 -0600
+++ linux-2.6/mm/slub.c	2010-11-23 16:55:49.000000000 -0600
@@ -1486,6 +1486,31 @@ static void unfreeze_slab(struct kmem_ca
 }
 
 /*
+ * Calculate the next globally unique transaction id for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + CONFIG_NR_CPUS;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % CONFIG_NR_CPUS;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / CONFIG_NR_CPUS;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+/*
  * Remove the cpu slab
  */
 static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -1509,6 +1534,7 @@ static void deactivate_slab(struct kmem_
 		/* Retrieve object from cpu_freelist */
 		object = c->freelist;
 		c->freelist = get_freepointer(s, c->freelist);
+		c->tid = next_tid(c->tid);
 
 		/* And put onto the regular freelist */
 		set_freepointer(s, object, page->freelist);
@@ -1646,10 +1672,15 @@ slab_out_of_memory(struct kmem_cache *s,
  * a call to the page allocator and the setup of a new slab.
  */
 static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr)
 {
 	void **object;
 	struct page *new;
+	unsigned long flags;
+	struct kmem_cache_cpu *c;
+
+	local_irq_save(flags);
+	c = this_cpu_ptr(s->cpu_slab);
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
@@ -1675,7 +1706,9 @@ load_freelist:
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
 unlock_out:
+	c->tid = next_tid(c->tid);
 	slab_unlock(c->page);
+	local_irq_restore(flags);
 	stat(s, ALLOC_SLOWPATH);
 	return object;
 
@@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
 {
 	void **object;
 	struct kmem_cache_cpu *c;
-	unsigned long flags;
+	unsigned long tid;
 
 	if (slab_pre_alloc_hook(s, gfpflags))
 		return NULL;
 
-	local_irq_save(flags);
+redo:
+	/*
+	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+	 * enabled. We may switch back and forth between cpus while
+	 * reading from one cpu area. That does not matter as long
+	 * as we end up on the original cpu again when doing the cmpxchg.
+	 */
 	c = __this_cpu_ptr(s->cpu_slab);
+
+	/*
+	 * The transaction ids are globally unique per cpu and per operation on
+	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
+	 * occurs on the right processor and that there was no operation on the
+	 * linked list in between.
+	 */
+	tid = c->tid;
+	barrier();
+
 	object = c->freelist;
-	if (unlikely(!object || !node_match(c, node)))
+	if (unlikely(!object || !node_match(c, c->node)))
 
-		object = __slab_alloc(s, gfpflags, node, addr, c);
+		object = __slab_alloc(s, gfpflags, c->node, addr);
 
 	else {
-		c->freelist = get_freepointer(s, object);
+		/*
+		 * The cmpxchg will only match if there was no additional
+		 * operation and if we are on the right processor.
+		 */
+		if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
+				get_freepointer(s, object), next_tid(tid)))) {
+#ifdef CONFIG_DEBUG_VM
+			unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+			printk(KERN_INFO "slab_alloc %s: redo cpu %u->[%u] event=[%ld] %ld->%ld\n",
+				s->name, tid_to_cpu(tid), tid_to_cpu(actual_tid),
+				tid_to_event(actual_tid), tid_to_event(tid),
+				tid_to_event(next_tid(tid)));
+#endif
+			goto redo;
+		}
 		stat(s, ALLOC_FASTPATH);
 	}
-	local_irq_restore(flags);
 
 	if (unlikely(gfpflags & __GFP_ZERO) && object)
 		memset(object, 0, s->objsize);
@@ -1817,8 +1880,10 @@ static void __slab_free(struct kmem_cach
 {
 	void *prior;
 	void **object = (void *)x;
+	unsigned long flags;
 
 	stat(s, FREE_SLOWPATH);
+	local_irq_save(flags);
 	slab_lock(page);
 
 	if (kmem_cache_debug(s))
@@ -1849,6 +1914,7 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+	local_irq_restore(flags);
 	return;
 
 slab_empty:
@@ -1860,6 +1926,7 @@ slab_empty:
 		stat(s, FREE_REMOVE_PARTIAL);
 	}
 	slab_unlock(page);
+	local_irq_restore(flags);
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 	return;
@@ -1886,23 +1953,38 @@ static __always_inline void slab_free(st
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
-	unsigned long flags;
+	unsigned long tid;
 
 	slab_free_hook(s, x);
 
-	local_irq_save(flags);
-	c = __this_cpu_ptr(s->cpu_slab);
-
 	slab_free_hook_irq(s, x);
 
-	if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
+redo:
+	c = this_cpu_ptr(s->cpu_slab);
+	tid = c->tid;
+	barrier();
+
+	if (likely(page == c->page) &&
+			c->node != NUMA_NO_NODE) {
+
 		set_freepointer(s, object, c->freelist);
-		c->freelist = object;
+
+		if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist,
+				c->freelist, tid,
+				object, next_tid(tid)))) {
+#ifdef CONFIG_DEBUG_VM
+			unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+			printk(KERN_INFO "slab_free %s: redo cpu %d->[%d] event=[%ld] %ld->%ld\n", s->name,
+				tid_to_cpu(tid), tid_to_cpu(actual_tid),
+				tid_to_event(actual_tid), tid_to_event(tid),
+				tid_to_event(next_tid(tid)));
+#endif
+			goto redo;
+		}
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, x, addr);
-
-	local_irq_restore(flags);
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2102,12 +2184,24 @@ init_kmem_cache_node(struct kmem_cache_n
 
 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 {
+	int cpu;
+
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
-	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+	/*
+	 * Must align to double word boundary for the long cmpxchg instructions
+	 * to work.
+	 */
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+
+	if (!s->cpu_slab)
+		return 0;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
 
-	return s->cpu_slab != NULL;
+	return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;



* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
@ 2010-11-24  0:22   ` Eric Dumazet
  2010-11-24  3:13     ` Christoph Lameter
  2010-11-24  1:02   ` Mathieu Desnoyers
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 51+ messages in thread
From: Eric Dumazet @ 2010-11-24  0:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Mathieu Desnoyers, Tejun Heo

On Tuesday, 23 November 2010 at 17:51 -0600, Christoph Lameter wrote:
> plain text document attachment (slub_generation)
> Use the this_cpu_cmpxchg_double functionality to implement a lockless
> allocation algorith.
> 
> Each of the per cpu pointers is paired with a transaction id that ensures
> that updates of the per cpu information can only occur in sequence on
> a certain cpu.
> 
> A transaction id is a "long" integer that is comprised of an event number
> and the cpu number. The event number is incremented for every change to the
> per cpu state. This means that the cmpxchg instruction can verify for an
> update that nothing interfered and that we are updating the percpu structure
> for the processor where we picked up the information and that we are also
> currently on that processor when we update the information.
> 
> This results in a significant decrease of the overhead in the fastpaths. It
> also makes it easy to adopt the fast path for realtime kernels since this
> is lockless and does not require that the use of the current per cpu area
> over the critical section. It is only important that the per cpu area is
> current at the beginning of the critical section and at that end.
> 
> So there is no need even to disable preemption which will make the allocations
> scale well in a RT environment.
> 
> [Beware: There have been previous attempts at lockless fastpaths that
> did not succeed. We hope to have learned from these experiences but
> review certainly is necessary.]
> 
> Cc: Ingo Molnar <mingo@elte.hu>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---

>  
>  /*
> + * Calculate the next globally unique transaction id for disambiguation
> + * during cmpxchg. The transactions start with the cpu number and are then
> + * incremented by CONFIG_NR_CPUS.
> + */
> +static inline unsigned long next_tid(unsigned long tid)
> +{
> +	return tid + CONFIG_NR_CPUS;
> +}


Hmm, this only works for power of two NR_CPUS, or else one cpu 'tid'
could wrap on another cpu tid.

I suggest using 4096  (or roundup_pow_of_two(CONFIG_NR_CPUS))
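
To illustrate the concern with small numbers (8-bit arithmetic instead of a
full long, purely for readability): with CONFIG_NR_CPUS = 3, cpu 1's tids run
1, 4, 7, ..., 253; the next step is 253 + 3 = 256, which truncates to 0, i.e.
init_tid(0), and tid_to_cpu(0) = 0, so cpu 1's tid has wrapped onto cpu 0's.
Since 2^8 mod 3 != 0 the residue class shifts at every wraparound. With a
power-of-two step such as 4096 the word size is a multiple of the step, so the
cpu number encoded in the tid survives overflow.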





* Re: [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-11-23 23:51 ` [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
@ 2010-11-24  0:41   ` Eric Dumazet
  2010-11-24  3:11     ` Christoph Lameter
  2010-11-24  0:44   ` Mathieu Desnoyers
  1 sibling, 1 reply; 51+ messages in thread
From: Eric Dumazet @ 2010-11-24  0:41 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers, Tejun Heo

On Tuesday, 23 November 2010 at 17:51 -0600, Christoph Lameter wrote:
> plain text document attachment (this_cpu_cmpxchg_x86)
> Provide support as far as the hardware capabilities of the x86 cpus
> allow.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

> +
> +#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2)				\
> +({									\
> +	char __ret;							\
> +	typeof(o1) __o1 = o1;						\
> +	typeof(o1) __n1 = n1;						\
> +	typeof(o2) __o2 = o2;						\
> +	typeof(o2) __n2 = n2;						\
> +	asm("cmpxchg16b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
> +		    : "=a"(__ret), "=m" (*pcp)				\
> +		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
> +	__ret;								\
> +})

I thought some x86_64 cpus don't have cmpxchg16b?

You have to check the cx16 feature flag or implement a fallback?






* Re: [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-11-23 23:51 ` [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
  2010-11-24  0:41   ` Eric Dumazet
@ 2010-11-24  0:44   ` Mathieu Desnoyers
  1 sibling, 0 replies; 51+ messages in thread
From: Mathieu Desnoyers @ 2010-11-24  0:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Tejun Heo

* Christoph Lameter (cl@linux.com) wrote:
> Provide support as far as the hardware capabilities of the x86 cpus
> allow.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 

[...]
> +#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2)				\
> +({									\
> +	char __ret;							\
> +	typeof(o1) __o1 = o1;						\
> +	typeof(o1) __n1 = n1;						\
> +	typeof(o2) __o2 = o2;						\
> +	typeof(o2) __n2 = n2;						\
> +	asm("cmpxchg16b "__percpu_arg(1)"\n\tsetz %0\n\t"		\
> +		    : "=a"(__ret), "=m" (*pcp)				\
> +		    :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
> +	__ret;								\
> +})

Can we add a check to ensure that the target memory location is 16-byte aligned?
This is a documented limitation of cmpxchg16b.
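
A minimal sketch of such a check (the exact form and placement are only a
suggestion; VM_BUG_ON is used here merely as an example):

	/* cmpxchg16b faults unless the memory operand is 16-byte aligned */
	VM_BUG_ON((unsigned long)(pcp) & 15);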

Thanks,

Mathieu

> +
> +#define __this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
> +#define this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
> +#define irqsafe_cmpxchg_double_8(pcp, o1, o2, n1, n2)	percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
> +
>  #endif
>  
>  /* This is not atomic against other CPUs -- CPU preemption needs to be off */
> 

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com


* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  2010-11-24  0:22   ` Eric Dumazet
@ 2010-11-24  1:02   ` Mathieu Desnoyers
  2010-11-24  1:05     ` Mathieu Desnoyers
  2010-11-24  7:16   ` Pekka Enberg
  2010-11-24  8:15   ` Peter Zijlstra
  3 siblings, 1 reply; 51+ messages in thread
From: Mathieu Desnoyers @ 2010-11-24  1:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Tejun Heo

* Christoph Lameter (cl@linux.com) wrote:

[...]

> @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
>  {
>  	void **object;
>  	struct kmem_cache_cpu *c;
> -	unsigned long flags;
> +	unsigned long tid;
>  
>  	if (slab_pre_alloc_hook(s, gfpflags))
>  		return NULL;
>  
> -	local_irq_save(flags);
> +redo:
> +	/*
> +	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> +	 * enabled. We may switch back and forth between cpus while
> +	 * reading from one cpu area. That does not matter as long
> +	 * as we end up on the original cpu again when doing the cmpxchg.
> +	 */
>  	c = __this_cpu_ptr(s->cpu_slab);
> +
> +	/*
> +	 * The transaction ids are globally unique per cpu and per operation on
> +	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
> +	 * occurs on the right processor and that there was no operation on the
> +	 * linked list in between.
> +	 */

There seems to be some voodoo magic I don't understand here. I'm curious to see
what happens if we have:

CPU A                                                  CPU B
slab_alloc()
  c = __this_cpu_ptr(s->cpu_slab);
  tid = c->tid
  thread migrated to CPU B

slab_alloc()
  c = __this_cpu_ptr(s->cpu_slab);
  tid = c->tid
  ...                                                  ...
  irqsafe_cmpxchg_double
    - expect tid, on CPU A, success
                                                       migrate back to CPU A
  irqsafe_cmpxchg_double
    - expect (same) tid, on CPU A, success

So either there is a crucially important point I am missing, or the transaction
ID does not seem to be truly unique due to migration.

Thanks,

Mathieu


> +	tid = c->tid;
> +	barrier();
> +
>  	object = c->freelist;
> -	if (unlikely(!object || !node_match(c, node)))
> +	if (unlikely(!object || !node_match(c, c->node)))
>  
> -		object = __slab_alloc(s, gfpflags, node, addr, c);
> +		object = __slab_alloc(s, gfpflags, c->node, addr);
>  
>  	else {
> -		c->freelist = get_freepointer(s, object);
> +		/*
> +		 * The cmpxchg will only match if there was no additional
> +		 * operation and if we are on the right processor.
> +		 */
> +		if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
> +				get_freepointer(s, object), next_tid(tid)))) {


-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  1:02   ` Mathieu Desnoyers
@ 2010-11-24  1:05     ` Mathieu Desnoyers
  2010-11-24  3:09       ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Mathieu Desnoyers @ 2010-11-24  1:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Tejun Heo

* Mathieu Desnoyers (mathieu.desnoyers@efficios.com) wrote:
> * Christoph Lameter (cl@linux.com) wrote:
> 
> [...]
> 
> > @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
> >  {
> >  	void **object;
> >  	struct kmem_cache_cpu *c;
> > -	unsigned long flags;
> > +	unsigned long tid;
> >  
> >  	if (slab_pre_alloc_hook(s, gfpflags))
> >  		return NULL;
> >  
> > -	local_irq_save(flags);
> > +redo:
> > +	/*
> > +	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> > +	 * enabled. We may switch back and forth between cpus while
> > +	 * reading from one cpu area. That does not matter as long
> > +	 * as we end up on the original cpu again when doing the cmpxchg.
> > +	 */
> >  	c = __this_cpu_ptr(s->cpu_slab);
> > +
> > +	/*
> > +	 * The transaction ids are globally unique per cpu and per operation on
> > +	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
> > +	 * occurs on the right processor and that there was no operation on the
> > +	 * linked list in between.
> > +	 */
> 
> There seems to be some voodoo magic I don't understand here. I'm curious to see
> what happens if we have:
> 
> CPU A                                                  CPU B
> slab_alloc()
>   c = __this_cpu_ptr(s->cpu_slab);
>   tid = c->tid
>   thread migrated to CPU B
> 
> slab_alloc()
>   c = __this_cpu_ptr(s->cpu_slab);
>   tid = c->tid
>   ...                                                  ...
>   irqsafe_cmpxchg_double
>     - expect tid, on CPU A, success
>                                                        migrate back to CPU A
>   irqsafe_cmpxchg_double
>     - expect (same) tid, on CPU A, success

Ah! I knew I was missing something: the second cmpxchg will fail because it
expects "tid", but the value is now the "next_tid". So effectively, many
instances of the same transaction can run concurrently, but only one will
succeed.

Sorry for the noise.

Thanks,

Mathieu


> 
> So either there is a crucially important point I am missing, or the transaction
> ID does not seem to be truly unique due to migration.
> 
> Thanks,
> 
> Mathieu
> 
> 
> > +	tid = c->tid;
> > +	barrier();
> > +
> >  	object = c->freelist;
> > -	if (unlikely(!object || !node_match(c, node)))
> > +	if (unlikely(!object || !node_match(c, c->node)))
> >  
> > -		object = __slab_alloc(s, gfpflags, node, addr, c);
> > +		object = __slab_alloc(s, gfpflags, c->node, addr);
> >  
> >  	else {
> > -		c->freelist = get_freepointer(s, object);
> > +		/*
> > +		 * The cmpxchg will only match if there was no additional
> > +		 * operation and if we are on the right processor.
> > +		 */
> > +		if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
> > +				get_freepointer(s, object), next_tid(tid)))) {
> 
> 
> -- 
> Mathieu Desnoyers
> Operating System Efficiency R&D Consultant
> EfficiOS Inc.
> http://www.efficios.com

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  1:05     ` Mathieu Desnoyers
@ 2010-11-24  3:09       ` Christoph Lameter
  0 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24  3:09 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Tejun Heo

On Tue, 23 Nov 2010, Mathieu Desnoyers wrote:

> Ah! I knew I was missing something: the second cmpxchg will fail because it
> expects "tid", but the value is now the "next_tid". So effectively, many
> instances of the same transaction can run concurrently, but only one will
> succeed.

Right.

> Sorry for the noise.

No, it's good to hear that you were not able to find a hole at first glance.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-11-24  0:41   ` Eric Dumazet
@ 2010-11-24  3:11     ` Christoph Lameter
  2010-11-24  7:05       ` Pekka Enberg
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24  3:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Eric Dumazet wrote:

> I thought some x86_64 cpus dont have cmpxchg16b ?

Right, I vaguely remember a flag there.

> You have to check cx16 feature flag or implement a fallback ?

There is already a compile time fallback. Uhh.. How do I do this at
runtime?


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  0:22   ` Eric Dumazet
@ 2010-11-24  3:13     ` Christoph Lameter
  2010-11-24  4:37       ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24  3:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Eric Dumazet wrote:

> Hmm, this only works for power of two NR_CPUS, or else one cpu 'tid'
> could wrap on another cpu tid.

Hmm... That's essentially like using a bitfield then.

> I suggest using 4096  (or roundup_pow_of_two(CONFIG_NR_CPUS))

There could be machines with 64k cpus soon. So I guess roundup_pow_of_two() it
will have to be.



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  3:13     ` Christoph Lameter
@ 2010-11-24  4:37       ` Christoph Lameter
  0 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24  4:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Mathieu Desnoyers, Tejun Heo

Use power of two increment in all cases.

Signed-off-by: Christoph Lameter <cl@linux.com>

Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2010-11-23 22:22:03.596998064 -0600
+++ linux-2.6/mm/slub.c	2010-11-23 21:53:09.422591631 -0600
@@ -1490,19 +1490,21 @@
  * during cmpxchg. The transactions start with the cpu number and are then
  * incremented by CONFIG_NR_CPUS.
  */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+
 static inline unsigned long next_tid(unsigned long tid)
 {
-	return tid + CONFIG_NR_CPUS;
+	return tid + TID_STEP;
 }

 static inline unsigned int tid_to_cpu(unsigned long tid)
 {
-	return tid % CONFIG_NR_CPUS;
+	return tid % TID_STEP;
 }

 static inline unsigned long tid_to_event(unsigned long tid)
 {
-	return tid / CONFIG_NR_CPUS;
+	return tid / TID_STEP;
 }

 static inline unsigned int init_tid(int cpu)


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations
  2010-11-24  3:11     ` Christoph Lameter
@ 2010-11-24  7:05       ` Pekka Enberg
  0 siblings, 0 replies; 51+ messages in thread
From: Pekka Enberg @ 2010-11-24  7:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, akpm, Pekka Enberg, linux-kernel,
	Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Eric Dumazet wrote:
>> You have to check cx16 feature flag or implement a fallback ?

On Wed, Nov 24, 2010 at 5:11 AM, Christoph Lameter <cl@linux.com> wrote:
> There is already a compile time fallback. Uhh.. How do I do this at
> runtime?

I think the CPU alternatives API is meant for just that. Grep for
"alternative" in arch/x86.

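The simplest form of that is probably a plain feature-flag test; a rough sketch
(the helper name is hypothetical, only the feature-test facilities are real):

	/* Use the cmpxchg16b-based fast path only when the CPU has cx16. */
	static bool cache_has_cmpxchg16b(void)
	{
		return boot_cpu_has(X86_FEATURE_CX16);
	}

The alternatives API would go one step further and patch the instruction
sequence at boot instead of testing the flag on every call.
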
^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options.
  2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
@ 2010-11-24  7:07   ` Pekka Enberg
  2010-11-26 15:43   ` Tejun Heo
  1 sibling, 0 replies; 51+ messages in thread
From: Pekka Enberg @ 2010-11-24  7:07 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Wed, Nov 24, 2010 at 1:51 AM, Christoph Lameter <cl@linux.com> wrote:
> The this_cpu_* options can be used to optimize __percpu_counter_add a bit. Avoids
> some address arithmetic and saves 12 bytes.

[snip]

> Signed-off-by: Christoph Lameter <cl@linux.com>

Reviewed-by: Pekka Enberg <penberg@kernel.org>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
  2010-11-24  0:22   ` Eric Dumazet
  2010-11-24  1:02   ` Mathieu Desnoyers
@ 2010-11-24  7:16   ` Pekka Enberg
  2010-11-24 16:17     ` Christoph Lameter
  2010-11-24  8:15   ` Peter Zijlstra
  3 siblings, 1 reply; 51+ messages in thread
From: Pekka Enberg @ 2010-11-24  7:16 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, Nov 24, 2010 at 1:51 AM, Christoph Lameter <cl@linux.com> wrote:
> @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
>  {
>        void **object;
>        struct kmem_cache_cpu *c;
> -       unsigned long flags;
> +       unsigned long tid;
>
>        if (slab_pre_alloc_hook(s, gfpflags))
>                return NULL;
>
> -       local_irq_save(flags);
> +redo:
> +       /*
> +        * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> +        * enabled. We may switch back and forth between cpus while
> +        * reading from one cpu area. That does not matter as long
> +        * as we end up on the original cpu again when doing the cmpxchg.
> +        */
>        c = __this_cpu_ptr(s->cpu_slab);
> +
> +       /*
> +        * The transaction ids are globally unique per cpu and per operation on
> +        * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
> +        * occurs on the right processor and that there was no operation on the
> +        * linked list in between.
> +        */
> +       tid = c->tid;
> +       barrier();

You're using a compiler barrier after every load from c->tid. Why?

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
                     ` (2 preceding siblings ...)
  2010-11-24  7:16   ` Pekka Enberg
@ 2010-11-24  8:15   ` Peter Zijlstra
  2010-11-24 16:14     ` Christoph Lameter
  3 siblings, 1 reply; 51+ messages in thread
From: Peter Zijlstra @ 2010-11-24  8:15 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Tue, 2010-11-23 at 17:51 -0600, Christoph Lameter wrote:
> So there is no need even to disable preemption which will make the allocations
> scale well in a RT environment. 

The RT thing isn't particularly about scaling per se, it's mostly about
working at all.

This thing still relies on disabling IRQs in the slow path, which means
its still going to be a lot of work to make it work on -rt.

It also heavily relies on bit-spinlocks, which again is going to need
changes for -rt.

But yes, the lockless fast path is nice, I'll try and get around to
reading it.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  8:15   ` Peter Zijlstra
@ 2010-11-24 16:14     ` Christoph Lameter
  2010-11-24 17:26       ` Peter Zijlstra
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 16:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: akpm, Pekka Enberg, Ingo Molnar, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Peter Zijlstra wrote:

> This thing still relies on disabling IRQs in the slow path, which means
> its still going to be a lot of work to make it work on -rt.

The disabling of irqs is because slab operations are used from interrupt
context. If we can avoid slab operations from interrupt contexts then we
can drop the interrupt disable in the slab allocators.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24  7:16   ` Pekka Enberg
@ 2010-11-24 16:17     ` Christoph Lameter
  2010-11-24 16:37       ` Pekka Enberg
  2010-11-24 19:37       ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 16:17 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1085 bytes --]

On Wed, 24 Nov 2010, Pekka Enberg wrote:

> > +       /*
> > +        * The transaction ids are globally unique per cpu and per operation on
> > +        * a per cpu queue. Thus they guarantee that the cmpxchg_double
> > +        * occurs on the right processor and that there was no operation on the
> > +        * linked list in between.
> > +        */
> > +       tid = c->tid;
> > +       barrier();
>
> You're using a compiler barrier after every load from c->tid. Why?

To make sure that the compiler does not do something like loading the tid
later. The tid must be obtained before the rest of the information from
the per cpu slab data is retrieved in order to ensure that we have a
consistent set of data to operate on.

The critical section begins with the retrieval of the tid and it ends with
the replacement of the tid with the newly generated one. This means that
all state data for the alloc and free operation needs to be retrieved in
that critical section. The change must be saved with the final
cmpxchg_double of the critical section.
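
Spelled out against the fast path above (the code is as in the patch, the
comments are just annotation):

	tid = c->tid;		/* transaction id is sampled first */
	barrier();		/* compiler may not sink that load ... */
	object = c->freelist;	/* ... below the reads of the per cpu state */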

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:17     ` Christoph Lameter
@ 2010-11-24 16:37       ` Pekka Enberg
  2010-11-24 16:45         ` Christoph Lameter
  2010-11-24 19:37       ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 51+ messages in thread
From: Pekka Enberg @ 2010-11-24 16:37 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, Nov 24, 2010 at 6:17 PM, Christoph Lameter <cl@linux.com> wrote:
> On Wed, 24 Nov 2010, Pekka Enberg wrote:
>
>> > +       /*
>> > +        * The transaction ids are globally unique per cpu and per operation on
>> > +        * a per cpu queue. Thus they guarantee that the cmpxchg_double
>> > +        * occurs on the right processor and that there was no operation on the
>> > +        * linked list in between.
>> > +        */
>> > +       tid = c->tid;
>> > +       barrier();
>>
>> You're using a compiler barrier after every load from c->tid. Why?
>
> To make sure that the compiler does not do something like loading the tid
> later. The tid must be obtained before the rest of the information from
> the per cpu slab data is retrieved in order to ensure that we have a
> consistent set of data to operate on.
>
> The critical section begins with the retrieval of the tid and it ends with
> the replacement of the tid with the newly generated one. This means that
> all state data for the alloc and free operation needs to be retrieved in
> that critical section. The change must be saved with the final
> cmpxchg_double of the critical section.

Right and we don't need a *memory barrier* here because we're
accessing a per-CPU variable which means operations appear in-order.

                        Pekka

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:37       ` Pekka Enberg
@ 2010-11-24 16:45         ` Christoph Lameter
  2010-11-24 16:47           ` Pekka Enberg
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 16:45 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Pekka Enberg wrote:

> > The critical section begins with the retrieval of the tid and it ends with
> > the replacement of the tid with the newly generated one. This means that
> > all state data for the alloc and free operation needs to be retrieved in
> > that critical section. The change must be saved with the final
> > cmpxchg_double of the critical section.
>
> Right and we don't need a *memory barrier* here because we're
> accessing a per-CPU variable which means operations appear in-order.

The compiler is still free to rearrange the tid fetch. A possible
optimization that the compiler may do is to move the tid fetch into the
next if statement since that is the only block in which the tid variable
is actually used.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:45         ` Christoph Lameter
@ 2010-11-24 16:47           ` Pekka Enberg
  2010-11-24 16:55             ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Pekka Enberg @ 2010-11-24 16:47 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Pekka Enberg wrote:
>> > The critical section begins with the retrieval of the tid and it ends with
>> > the replacement of the tid with the newly generated one. This means that
>> > all state data for the alloc and free operation needs to be retrieved in
>> > that critical section. The change must be saved with the final
>> > cmpxchg_double of the critical section.
>>
>> Right and we don't need a *memory barrier* here because we're
>> accessing a per-CPU variable which means operations appear in-order.

On Wed, Nov 24, 2010 at 6:45 PM, Christoph Lameter <cl@linux.com> wrote:
> The compiler is still free to rearrange the tid fetch. A possible
> optimization that the compiler may do is to move the tid fetch into the
> next if statement since that is the only block in which the tid variable
> is actually used.

Yes, which is why we need a *compiler barrier* but not a *memory barrier*.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:47           ` Pekka Enberg
@ 2010-11-24 16:55             ` Christoph Lameter
  0 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 16:55 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra, linux-kernel,
	Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Pekka Enberg wrote:

> On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >> > The critical section begins with the retrieval of the tid and it ends with
> >> > the replacement of the tid with the newly generated one. This means that
> >> > all state data for the alloc and free operation needs to be retrieved in
> >> > that critical section. The change must be saved with the final
> >> > cmpxchg_double of the critical section.
> >>
> >> Right and we don't need a *memory barrier* here because we're
> >> accessing a per-CPU variable which means operations appear in-order.
>
> On Wed, Nov 24, 2010 at 6:45 PM, Christoph Lameter <cl@linux.com> wrote:
> > The compiler is still free to rearrange the tid fetch. A possible
> > optimization that the compiler may do is to move the tid fetch into the
> > next if statement since that is the only block in which the tid variable
> > is actually used.
>
> Yes, which is why we need a *compiler barrier* but not a *memory barrier*.

Exactly. That is the reason there is a compiler barrier there. A memory
barrier would be smp_mb() or so.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:14     ` Christoph Lameter
@ 2010-11-24 17:26       ` Peter Zijlstra
  2010-11-24 18:08         ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Peter Zijlstra @ 2010-11-24 17:26 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, Ingo Molnar, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Wed, 2010-11-24 at 10:14 -0600, Christoph Lameter wrote:
> On Wed, 24 Nov 2010, Peter Zijlstra wrote:
> 
> > This thing still relies on disabling IRQs in the slow path, which means
> > its still going to be a lot of work to make it work on -rt.
> 
> The disabling of irqs is because slab operations are used from interrupt
> context. If we can avoid slab operations from interrupt contexts then we
> can drop the interrupt disable in the slab allocators.

That's not so much the point; there are per-cpu assumptions due to that.
Not everything is under a proper lock, see for example this bit:

        new = new_slab(s, gfpflags, node);

        if (gfpflags & __GFP_WAIT)
                local_irq_disable();

        if (new) {
                c = __this_cpu_ptr(s->cpu_slab);
                stat(s, ALLOC_SLAB);
                if (c->page)
                        flush_slab(s, c);
                slab_lock(new);
                __SetPageSlubFrozen(new);
                c->page = new;
                goto load_freelist;
        }

There we have the __this_cpu_ptr, c->page deref and flush_slab()->stat()
call all before we take a lock.



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 17:26       ` Peter Zijlstra
@ 2010-11-24 18:08         ` Christoph Lameter
  0 siblings, 0 replies; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 18:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: akpm, Pekka Enberg, Ingo Molnar, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Peter Zijlstra wrote:

> On Wed, 2010-11-24 at 10:14 -0600, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Peter Zijlstra wrote:
> >
> > > This thing still relies on disabling IRQs in the slow path, which means
> > > its still going to be a lot of work to make it work on -rt.
> >
> > The disabling of irqs is because slab operations are used from interrupt
> > context. If we can avoid slab operations from interrupt contexts then we
> > can drop the interrupt disable in the slab allocators.
>
> That's not so much the point; there are per-cpu assumptions due to that.

Sure, the exclusive access to the per cpu area is exploited during
irq-off sections. That would have to change.

> Not everything is under a proper lock, see for example this bit:
>
>         new = new_slab(s, gfpflags, node);
>
>         if (gfpflags & __GFP_WAIT)
>                 local_irq_disable();
>
>         if (new) {
>                 c = __this_cpu_ptr(s->cpu_slab);
>                 stat(s, ALLOC_SLAB);
>                 if (c->page)
>                         flush_slab(s, c);
>                 slab_lock(new);
>                 __SetPageSlubFrozen(new);
>                 c->page = new;
>                 goto load_freelist;
>         }
>
> There we have the __this_cpu_ptr, c->page deref and flush_slab()->stat()
> call all before we take a lock.

That is all per cpu data and would therefore have to get different treatment
if we wanted to drop the processing in interrupt-off mode.

Disabling preempt may be initially sufficient there.
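
A rough illustration of that first step for the hunk quoted above, ignoring the
surrounding irq bookkeeping (this is not a posted patch):

-	if (gfpflags & __GFP_WAIT)
-		local_irq_disable();
+	if (gfpflags & __GFP_WAIT)
+		preempt_disable();	/* enough to keep exclusive use of
+					 * the per cpu data on this path */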




^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 16:17     ` Christoph Lameter
  2010-11-24 16:37       ` Pekka Enberg
@ 2010-11-24 19:37       ` Jeremy Fitzhardinge
  2010-11-24 19:53         ` Christoph Lameter
  2010-11-24 19:56         ` Mathieu Desnoyers
  1 sibling, 2 replies; 51+ messages in thread
From: Jeremy Fitzhardinge @ 2010-11-24 19:37 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra,
	linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> On Wed, 24 Nov 2010, Pekka Enberg wrote:
>
>>> +       /*
>>> +        * The transaction ids are globally unique per cpu and per operation on
>>> +        * a per cpu queue. Thus they guarantee that the cmpxchg_double
>>> +        * occurs on the right processor and that there was no operation on the
>>> +        * linked list in between.
>>> +        */
>>> +       tid = c->tid;
>>> +       barrier();
>> You're using a compiler barrier after every load from c->tid. Why?
> To make sure that the compiler does not do something like loading the tid
> later. The tid must be obtained before the rest of the information from
> the per cpu slab data is retrieved in order to ensure that we have a
> consistent set of data to operate on.

Isn't that best expressed with ACCESS_ONCE()?

    J

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 19:37       ` Jeremy Fitzhardinge
@ 2010-11-24 19:53         ` Christoph Lameter
  2010-11-24 20:01           ` Jeremy Fitzhardinge
  2010-11-24 19:56         ` Mathieu Desnoyers
  1 sibling, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-24 19:53 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Pekka Enberg, akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra,
	linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On Wed, 24 Nov 2010, Jeremy Fitzhardinge wrote:

> On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >
> >>> +       /*
> >>> +        * The transaction ids are globally unique per cpu and per operation on
> >>> +        * a per cpu queue. Thus they guarantee that the cmpxchg_double
> >>> +        * occurs on the right processor and that there was no operation on the
> >>> +        * linked list in between.
> >>> +        */
> >>> +       tid = c->tid;
> >>> +       barrier();
> >> You're using a compiler barrier after every load from c->tid. Why?
> > To make sure that the compiler does not do something like loading the tid
> > later. The tid must be obtained before the rest of the information from
> > the per cpu slab data is retrieved in order to ensure that we have a
> > consistent set of data to operate on.
>
> Isn't that best expressed with ACCESS_ONCE()?

Reading the comments, it seems ACCESS_ONCE does not prevent reordering if
used just once. ACCESS_ONCE() uses volatile? Uggh.



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 19:37       ` Jeremy Fitzhardinge
  2010-11-24 19:53         ` Christoph Lameter
@ 2010-11-24 19:56         ` Mathieu Desnoyers
  1 sibling, 0 replies; 51+ messages in thread
From: Mathieu Desnoyers @ 2010-11-24 19:56 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Christoph Lameter, Pekka Enberg, akpm, Pekka Enberg, Ingo Molnar,
	Peter Zijlstra, linux-kernel, Eric Dumazet, Tejun Heo

* Jeremy Fitzhardinge (jeremy@goop.org) wrote:
> On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >
> >>> +       /*
> >>> +        * a per cpu queue. Thus they guarantee that the cmpxchg_double
> >>> +        * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
> >>> +        * occurs on the right processor and that there was no operation on the
> >>> +        * linked list in between.
> >>> +        */
> >>> +       tid = c->tid;
> >>> +       barrier();
> >> You're using a compiler barrier after every load from c->tid. Why?
> > To make sure that the compiler does not do something like loading the tid
> > later. The tid must be obtained before the rest of the information from
> > the per cpu slab data is retrieved in order to ensure that we have a
> > consistent set of data to operate on.
> 
> Isn't that best expressed with ACCESS_ONCE()?

ACCESS_ONCE() use of volatile only ensures that volatile accesses are not
reordered wrt each other. It does not ensure anything about other unrelated
memory accesses (which we want to make sure the compiler won't move before the
c->tid read). The compiler barrier() is really what seems to be needed here.
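
Side by side, using the same fields as the patch (illustrative only):

	tid = ACCESS_ONCE(c->tid);	/* only the volatile access itself is
					 * ordered; the plain freelist load
					 * below may still be hoisted above
					 * it by the compiler */
	object = c->freelist;

	tid = c->tid;
	barrier();			/* no memory access after this point
					 * may be moved above it by the
					 * compiler */
	object = c->freelist;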

Thanks,

Mathieu

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub
  2010-11-24 19:53         ` Christoph Lameter
@ 2010-11-24 20:01           ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 51+ messages in thread
From: Jeremy Fitzhardinge @ 2010-11-24 20:01 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, akpm, Pekka Enberg, Ingo Molnar, Peter Zijlstra,
	linux-kernel, Eric Dumazet, Mathieu Desnoyers, Tejun Heo

On 11/24/2010 11:53 AM, Christoph Lameter wrote:
> Reading the comments, it seems ACCESS_ONCE does not prevent reordering if
> used just once. ACCESS_ONCE() uses volatile? Uggh.

Hm, good point.  I'll need to review my uses of it.

    J

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options.
  2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
  2010-11-24  7:07   ` Pekka Enberg
@ 2010-11-26 15:43   ` Tejun Heo
  1 sibling, 0 replies; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 15:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> The this_cpu_* options can be used to optimize __percpu_counter_add a bit. Avoids
> some address arithmetic and saves 12 bytes.
...
> Signed-off-by: Christoph Lameter <cl@linux.com>

Reviewed-by: Tejun Heo <tj@kernel.org>

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations
  2010-11-23 23:51 ` [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations Christoph Lameter
@ 2010-11-26 16:25   ` Tejun Heo
  0 siblings, 0 replies; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:25 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet,
	Mathieu Desnoyers, Namhyung Kim

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> this cpu operations can be used to slightly optimize the function. The
> changes will avoid some address calculations and replace them with the
> use of the percpu segment register.
> 
> If one would have this_cpu_inc_return and this_cpu_dec_return then it
> would be possible to optimize inc_zone_page_state and dec_zone_page_state even
> more.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

Looks correct to me but it would be nice to run this through sparse.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return
  2010-11-23 23:51 ` [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return Christoph Lameter
@ 2010-11-26 16:31   ` Tejun Heo
  2010-11-26 16:37     ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> Introduce generic support for this_cpu_add_return etc.
> 
> The fallback is to realize these operations with __this_cpu_ops.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  include/linux/percpu.h |   70 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 70 insertions(+)
> 
> Index: linux-2.6/include/linux/percpu.h
> ===================================================================
> --- linux-2.6.orig/include/linux/percpu.h	2010-11-23 17:29:46.000000000 -0600
> +++ linux-2.6/include/linux/percpu.h	2010-11-23 17:31:14.000000000 -0600
> @@ -240,6 +240,20 @@ extern void __bad_size_call_parameter(vo
>  	pscr_ret__;							\
>  })
>  
> +#define __pcpu_size_call_return2(stem, variable, ...)			\
> +({	typeof(variable) pscr_ret__;					\
> +	__verify_pcpu_ptr(&(variable));					\
> +	switch(sizeof(variable)) {					\
> +	case 1: pscr_ret__ = stem##1(variable, __VA_ARGS__);break;	\
> +	case 2: pscr_ret__ = stem##2(variable, __VA_ARGS__);break;	\
> +	case 4: pscr_ret__ = stem##4(variable, __VA_ARGS__);break;	\
> +	case 8: pscr_ret__ = stem##8(variable, __VA_ARGS__);break;	\
> +	default:							\
> +		__bad_size_call_parameter();break;			\
> +	}								\
> +	pscr_ret__;							\
> +})

If we're gonna do __VA_ARGS__, can't we just replace
__pcpu_size_call_return()?  Other than that, looks good to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 04/10] x86: Support for this_cpu_add,sub,dec,inc_return
  2010-11-23 23:51 ` [thiscpuops upgrade 04/10] x86: Support " Christoph Lameter
@ 2010-11-26 16:33   ` Tejun Heo
  0 siblings, 0 replies; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:33 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> Supply an implementation for x86 in order to generate more efficient code.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>

Reviewed-by: Tejun Heo <tj@kernel.org>

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter
  2010-11-23 23:51 ` [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter Christoph Lameter
@ 2010-11-26 16:35   ` Tejun Heo
  2010-11-26 17:02     ` Christoph Lameter
  0 siblings, 1 reply; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:35 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> this_cpu_inc_return() saves us a memory access there.
> 
> Signed-off-by: Christoph Lameter <cl@linux.com>
> 
> ---
>  arch/x86/kernel/apic/nmi.c |    3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> Index: linux-2.6/arch/x86/kernel/apic/nmi.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/apic/nmi.c	2010-11-23 16:35:19.000000000 -0600
> +++ linux-2.6/arch/x86/kernel/apic/nmi.c	2010-11-23 16:38:29.000000000 -0600
> @@ -432,8 +432,7 @@ nmi_watchdog_tick(struct pt_regs *regs,
>  		 * Ayiee, looks like this CPU is stuck ...
>  		 * wait a few IRQs (5 seconds) before doing the oops ...
>  		 */
> -		__this_cpu_inc(alert_counter);
> -		if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
> +		if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz)

Hmmm... one worry I have is that xadd, being not a very popular
operation, might be slower than add and read.  Using it for atomicity
would probably be beneficial in most cases but have you checked this
actually is cheaper?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return
  2010-11-26 16:31   ` Tejun Heo
@ 2010-11-26 16:37     ` Christoph Lameter
  2010-11-26 16:39       ` Tejun Heo
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-26 16:37 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On Fri, 26 Nov 2010, Tejun Heo wrote:

> If we're gonna do __VA_ARGS__, can't we just replace
> __pcpu_size_call_return()?  Other than that, looks good to me.

A single parameter would require an empty vararg list. All single-parameter
invocations would have to specify an empty additional parameter.
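
Concretely (hypothetical call sites, just to show the expansion problem):

	/* the two-parameter form expands cleanly: */
	__pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
		/* -> this_cpu_add_return_4(pcp, val) for a 4-byte pcp */

	/* folding the single-parameter form into it would leave an empty
	 * trailing argument behind: */
	__pcpu_size_call_return2(this_cpu_read_, pcp)
		/* -> this_cpu_read_4(pcp, ) */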


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return
  2010-11-26 16:37     ` Christoph Lameter
@ 2010-11-26 16:39       ` Tejun Heo
  0 siblings, 0 replies; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:39 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/26/2010 05:37 PM, Christoph Lameter wrote:
> On Fri, 26 Nov 2010, Tejun Heo wrote:
> 
>> If we're gonna do __VA_ARGS__, can't we just replace
>> __pcpu_size_call_return()?  Other than that, looks good to me.
> 
> A single parameter would require an empty vararg list. All single-parameter
> invocations would have to specify an empty additional parameter.

Argh... okay.

Reviewed-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-23 23:51 ` [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
@ 2010-11-26 16:51   ` Tejun Heo
  2010-11-26 16:56     ` Eric Dumazet
  0 siblings, 1 reply; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> +/*
> + * cmpxchg_double replaces two adjacent scalars at once. The first parameter
> + * passed is a percpu pointer, not a scalar like the other this_cpu
> + * operations. This is so because the function operates on two scalars
> + * (must be of same size). A truth value is returned to indicate success or
> + * failure (since a double register result is difficult to handle).
> + * There is very limited hardware support for these operations. So only certain
> + * sizes may work.
> + */
> +#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\

Ah... this is scary.  If it proves to be useful enough, sure, but
otherwise I'd like to avoid it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-26 16:51   ` Tejun Heo
@ 2010-11-26 16:56     ` Eric Dumazet
  2010-11-26 16:58       ` Tejun Heo
  0 siblings, 1 reply; 51+ messages in thread
From: Eric Dumazet @ 2010-11-26 16:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On Friday 26 November 2010 at 17:51 +0100, Tejun Heo wrote:
> On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> > +/*
> > + * cmpxchg_double replaces two adjacent scalars at once. The first parameter
> > + * passed is a percpu pointer, not a scalar like the other this_cpu
> > + * operations. This is so because the function operates on two scalars
> > + * (must be of same size). A truth value is returned to indicate success or
> > + * failure (since a double register result is difficult to handle).
> > + * There is very limited hardware support for these operations. So only certain
> > + * sizes may work.
> > + */
> > +#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
> 
> Ah... this is scary.  If it proves to be useful enough, sure, but
> otherwise I'd like to avoid it.
> 

This is mandatory for several lockless algos; don't be afraid, it's a
single instruction ;)




^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-26 16:56     ` Eric Dumazet
@ 2010-11-26 16:58       ` Tejun Heo
  2010-11-26 17:01         ` Eric Dumazet
  0 siblings, 1 reply; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 16:58 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On 11/26/2010 05:56 PM, Eric Dumazet wrote:
> On Friday 26 November 2010 at 17:51 +0100, Tejun Heo wrote:
>> On 11/24/2010 12:51 AM, Christoph Lameter wrote:
>>> +/*
>>> + * cmpxchg_double replaces two adjacent scalars at once. The first parameter
>>> + * passed is a percpu pointer, not a scalar like the other this_cpu
>>> + * operations. This is so because the function operates on two scalars
>>> + * (must be of same size). A truth value is returned to indicate success or
>>> + * failure (since a double register result is difficult to handle).
>>> + * There is very limited hardware support for these operations. So only certain
>>> + * sizes may work.
>>> + */
>>> +#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
>>
>> Ah... this is scary.  If it proves to be useful enough, sure, but
>> otherwise I'd like to avoid it.
> 
> This is mandatory for several lockless algos; don't be afraid, it's a
> single instruction ;)

Thanks for holding my hands. :-)

What I'm afraid of is generic code switching to use it in very hot
paths when a lot of archs can't actually do it leading to performance
regressions.  Is this something which is available on most archs?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-26 16:58       ` Tejun Heo
@ 2010-11-26 17:01         ` Eric Dumazet
  2010-11-26 17:07           ` Tejun Heo
  0 siblings, 1 reply; 51+ messages in thread
From: Eric Dumazet @ 2010-11-26 17:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On Friday 26 November 2010 at 17:58 +0100, Tejun Heo wrote:
> On 11/26/2010 05:56 PM, Eric Dumazet wrote:
> > On Friday 26 November 2010 at 17:51 +0100, Tejun Heo wrote:
> >> On 11/24/2010 12:51 AM, Christoph Lameter wrote:
> >>> +/*
> >>> + * cmpxchg_double replaces two adjacent scalars at once. The first parameter
> >>> + * passed is a percpu pointer, not a scalar like the other this_cpu
> >>> + * operations. This is so because the function operates on two scalars
> >>> + * (must be of same size). A truth value is returned to indicate success or
> >>> + * failure (since a double register result is difficult to handle).
> >>> + * There is very limited hardware support for these operations. So only certain
> >>> + * sizes may work.
> >>> + */
> >>> +#define __this_cpu_generic_cmpxchg_double(pcp, oval1, oval2, nval1, nval2)	\
> >>
> >> Ah... this is scary.  If it proves to be useful enough, sure, but
> >> otherwise I'd like to avoid it.
> > 
> > This is mandatory for several lockless algos; don't be afraid, it's a
> > single instruction ;)
> 
> Thanks for holding my hands. :-)
> 
> What I'm afraid of is generic code switching to use it in very hot
> paths when a lot of archs can't actually do it leading to performance
> regressions.  Is this something which is available on most archs?
> 

I would say cmpxchg() has the same problem: some arches don't have a
native instruction, yet we use it in some paths.




^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter
  2010-11-26 16:35   ` Tejun Heo
@ 2010-11-26 17:02     ` Christoph Lameter
  2010-11-26 17:05       ` Tejun Heo
  0 siblings, 1 reply; 51+ messages in thread
From: Christoph Lameter @ 2010-11-26 17:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On Fri, 26 Nov 2010, Tejun Heo wrote:

> > -		__this_cpu_inc(alert_counter);
> > -		if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
> > +		if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz)
>
> Hmmm... one worry I have is that xadd, being not a very popular
> operation, might be slower than add and read.  Using it for atomicity
> would probably be beneficial in most cases but have you checked this
> actually is cheaper?

XADD takes 3 uops. INC and MOV take 1 uop each. So there is one additional uop.

However, a memory fetch from L1 takes a minimum of 4 cycles. Doing that twice
already ends up with at least 8 cycles.
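
Roughly, the two code shapes being compared (a sketch only; the exact
instruction selection is up to the compiler):

	# __this_cpu_inc(); __this_cpu_read():	two accesses to the counter
		incl %gs:alert_counter
		movl %gs:alert_counter, %eax

	# __this_cpu_inc_return():		one xadd, a single access
		movl $1, %eax
		xaddl %eax, %gs:alert_counter	# %eax gets the old value
		incl %eax			# post-increment result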





^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter
  2010-11-26 17:02     ` Christoph Lameter
@ 2010-11-26 17:05       ` Tejun Heo
  0 siblings, 0 replies; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 17:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Pekka Enberg, linux-kernel, Eric Dumazet, Mathieu Desnoyers

On 11/26/2010 06:02 PM, Christoph Lameter wrote:
> On Fri, 26 Nov 2010, Tejun Heo wrote:
> 
>>> -		__this_cpu_inc(alert_counter);
>>> -		if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
>>> +		if (__this_cpu_inc_return(alert_counter) == 5 * nmi_hz)
>>
>> Hmmm... one worry I have is that xadd, being not a very popular
>> operation, might be slower than add and read.  Using it for atomicity
>> would probably be beneficial in most cases but have you checked this
>> actually is cheaper?
> 
> XADD takes 3 uops. INC and MOV take 1 uop each. So there is one additional uop.
> 
> However, a memory fetch from L1 takes a minimum of 4 cycles. Doing that twice
> already ends up with at least 8 cycles.

Thanks for the explanation.  It might be beneficial to note the
performance characteristics on top of the x86 implementation?
Anyway, for this and the following simple conversion patches:

Reviewed-by: Tejun Heo <tj@kernel.org>

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-26 17:01         ` Eric Dumazet
@ 2010-11-26 17:07           ` Tejun Heo
  2010-11-26 17:16             ` Eric Dumazet
  0 siblings, 1 reply; 51+ messages in thread
From: Tejun Heo @ 2010-11-26 17:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On 11/26/2010 06:01 PM, Eric Dumazet wrote:
>> What I'm afraid of is generic code switching to use it in very hot
>> paths when a lot of archs can't actually do it leading to performance
>> regressions.  Is this something which is available on most archs?
>>
> 
> I would say cmpxchg() has the same problem, some arches dont have a
> native instruction, yet we use it in some paths.

Yeah, well, there's a difference between some archs not having it and only
x86_64 having it.  cmpxchg was already rather well received when it
was added even though it wasn't available on all archs.  I'm not
against it, but I think we should use some caution here and think about
the impact of the unoptimized cases, which can be pretty common (preemption
toggling isn't too expensive, but irq toggling can be quite expensive).

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support
  2010-11-26 17:07           ` Tejun Heo
@ 2010-11-26 17:16             ` Eric Dumazet
  0 siblings, 0 replies; 51+ messages in thread
From: Eric Dumazet @ 2010-11-26 17:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Lameter, akpm, Pekka Enberg, linux-kernel, Mathieu Desnoyers

On Friday 26 November 2010 at 18:07 +0100, Tejun Heo wrote:
> On 11/26/2010 06:01 PM, Eric Dumazet wrote:
> >> What I'm afraid of is generic code switching to use it in very hot
> >> paths when a lot of archs can't actually do it leading to performance
> >> regressions.  Is this something which is available on most archs?
> >>
> > 
> > I would say cmpxchg() has the same problem, some arches dont have a
> > native instruction, yet we use it in some paths.
> 
> Yeah, well, there's a difference between some archs not having it and only
> x86_64 having it.  cmpxchg was already rather well received when it
> was added even though it wasn't available on all archs.  I'm not
> against it, but I think we should use some caution here and think about
> the impact of the unoptimized cases, which can be pretty common (preemption
> toggling isn't too expensive, but irq toggling can be quite expensive).
> 


x86_32 has it too

x86_64 uses cmpxchg16b, while x86_32 uses cmpxchg8b


And I guess it will be used in contexts where we had to disable irqs, so it
will be faster on x86, and the same cost on other arches.
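
For the non-x86 case the generic fallback boils down to roughly this shape
(the names are illustrative, not the exact macro from the patch): with irqs
off, this cpu's two adjacent words can be compared and replaced without any
special instruction.

	unsigned long flags;
	int ret = 0;

	local_irq_save(flags);
	if (*p1 == o1 && *p2 == o2) {	/* p1/p2 point into this cpu's data */
		*p1 = n1;
		*p2 = n2;
		ret = 1;
	}
	local_irq_restore(flags);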




^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2010-11-26 17:16 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
2010-11-23 23:51 [thiscpuops upgrade 00/10] Upgrade of this_cpu_ops Christoph Lameter
2010-11-23 23:51 ` [thiscpuops upgrade 01/10] percpucounter: Optimize __percpu_counter_add a bit through the use of this_cpu() options Christoph Lameter
2010-11-24  7:07   ` Pekka Enberg
2010-11-26 15:43   ` Tejun Heo
2010-11-23 23:51 ` [thiscpuops upgrade 02/10] vmstat: Optimize zone counter modifications through the use of this cpu operations Christoph Lameter
2010-11-26 16:25   ` Tejun Heo
2010-11-23 23:51 ` [thiscpuops upgrade 03/10] percpu: Generic support for this_cpu_add,sub,dec,inc_return Christoph Lameter
2010-11-26 16:31   ` Tejun Heo
2010-11-26 16:37     ` Christoph Lameter
2010-11-26 16:39       ` Tejun Heo
2010-11-23 23:51 ` [thiscpuops upgrade 04/10] x86: Support " Christoph Lameter
2010-11-26 16:33   ` Tejun Heo
2010-11-23 23:51 ` [thiscpuops upgrade 05/10] x86: Use this_cpu_inc_return for nmi counter Christoph Lameter
2010-11-26 16:35   ` Tejun Heo
2010-11-26 17:02     ` Christoph Lameter
2010-11-26 17:05       ` Tejun Heo
2010-11-23 23:51 ` [thiscpuops upgrade 06/10] vmstat: Use this_cpu_inc_return for vm statistics Christoph Lameter
2010-11-23 23:51 ` [thiscpuops upgrade 07/10] highmem: Use this_cpu_xx_return() operations Christoph Lameter
2010-11-23 23:51 ` [thiscpuops upgrade 08/10] percpu: generic this_cpu_cmpxchg() and this_cpu_cmpxchg_double support Christoph Lameter
2010-11-26 16:51   ` Tejun Heo
2010-11-26 16:56     ` Eric Dumazet
2010-11-26 16:58       ` Tejun Heo
2010-11-26 17:01         ` Eric Dumazet
2010-11-26 17:07           ` Tejun Heo
2010-11-26 17:16             ` Eric Dumazet
2010-11-23 23:51 ` [thiscpuops upgrade 09/10] x86: this_cpu_cmpxchg and this_cpu_cmpxchg_double operations Christoph Lameter
2010-11-24  0:41   ` Eric Dumazet
2010-11-24  3:11     ` Christoph Lameter
2010-11-24  7:05       ` Pekka Enberg
2010-11-24  0:44   ` Mathieu Desnoyers
2010-11-23 23:51 ` [thiscpuops upgrade 10/10] Lockless (and preemptless) fastpaths for slub Christoph Lameter
2010-11-24  0:22   ` Eric Dumazet
2010-11-24  3:13     ` Christoph Lameter
2010-11-24  4:37       ` Christoph Lameter
2010-11-24  1:02   ` Mathieu Desnoyers
2010-11-24  1:05     ` Mathieu Desnoyers
2010-11-24  3:09       ` Christoph Lameter
2010-11-24  7:16   ` Pekka Enberg
2010-11-24 16:17     ` Christoph Lameter
2010-11-24 16:37       ` Pekka Enberg
2010-11-24 16:45         ` Christoph Lameter
2010-11-24 16:47           ` Pekka Enberg
2010-11-24 16:55             ` Christoph Lameter
2010-11-24 19:37       ` Jeremy Fitzhardinge
2010-11-24 19:53         ` Christoph Lameter
2010-11-24 20:01           ` Jeremy Fitzhardinge
2010-11-24 19:56         ` Mathieu Desnoyers
2010-11-24  8:15   ` Peter Zijlstra
2010-11-24 16:14     ` Christoph Lameter
2010-11-24 17:26       ` Peter Zijlstra
2010-11-24 18:08         ` Christoph Lameter
