All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21  8:51 Nicholas Piggin
  2016-09-21 10:25   ` kbuild test robot
                   ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-21  8:51 UTC (permalink / raw)
  To: Tejun Heo, Christoph Lameter
  Cc: Nicholas Piggin, linux-kernel, linux-arch, linuxppc-dev

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it saves 2 loads:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/percpu.h | 54 +++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..3fe18fe 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,18 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +149,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp);				\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +159,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +176,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21  8:51 [PATCH] percpu: improve generic percpu modify-return implementation Nicholas Piggin
  2016-09-21 10:25   ` kbuild test robot
@ 2016-09-21 10:25   ` kbuild test robot
  2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:57   ` Nicholas Piggin
  3 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:25 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5055 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config)
compiler: bfin-uclinux-gcc (GCC) 6.2.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=blackfin 

All error/warnings (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/blackfin/include/generated/asm/percpu.h:1:0,
                    from include/linux/percpu.h:12,
                    from include/linux/percpu-rwsem.h:6,
                    from include/linux/fs.h:30,
                    from mm/vmstat.c:12:
>> include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10251 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:25   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:25 UTC (permalink / raw)
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5055 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config)
compiler: bfin-uclinux-gcc (GCC) 6.2.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=blackfin 

All error/warnings (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/blackfin/include/generated/asm/percpu.h:1:0,
                    from include/linux/percpu.h:12,
                    from include/linux/percpu-rwsem.h:6,
                    from include/linux/fs.h:30,
                    from mm/vmstat.c:12:
>> include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10251 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:25   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:25 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, linux-kernel,
	linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 5055 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: blackfin-BF561-EZKIT-SMP_defconfig (attached as .config)
compiler: bfin-uclinux-gcc (GCC) 6.2.0
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=blackfin 

All error/warnings (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/blackfin/include/generated/asm/percpu.h:1:0,
                    from include/linux/percpu.h:12,
                    from include/linux/percpu-rwsem.h:6,
                    from include/linux/fs.h:30,
                    from mm/vmstat.c:12:
>> include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
>> include/asm-generic/percpu.h:382:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_1(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:323:24: note: in expansion of macro 'this_cpu_xchg_1'
     case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
                           ^~~~
>> include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
>> mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 10251 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21  8:51 [PATCH] percpu: improve generic percpu modify-return implementation Nicholas Piggin
  2016-09-21 10:25   ` kbuild test robot
@ 2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:57   ` Nicholas Piggin
  3 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 2873 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: tile-tilegx_defconfig (attached as .config)
compiler: tilegx-linux-gcc (GCC) 4.6.2
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=tile 

All errors (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
   mm/vmstat.c:476:1: note: each undeclared identifier is reported only once for each function it appears in
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 15729 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21  8:51 [PATCH] percpu: improve generic percpu modify-return implementation Nicholas Piggin
  2016-09-21 10:25   ` kbuild test robot
@ 2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:57   ` Nicholas Piggin
  3 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 7000 bytes --]

Hi Nicholas,

[auto build test WARNING on asm-generic/master]
[also build test WARNING on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: i386-randconfig-s0-201638 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from arch/x86/kernel/nmi.c:13:
   arch/x86/kernel/nmi.c: In function 'do_nmi':
   include/asm-generic/percpu.h:138:17: warning: unused variable '__p' [-Wunused-variable]
     typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));   \
                    ^
   include/asm-generic/percpu.h:378:41: note: in expansion of macro 'this_cpu_generic_add_return'
    #define this_cpu_add_return_8(pcp, val) this_cpu_generic_add_return(pcp, val)
                                            ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_add_return_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:499:39: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
                                          ^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:511:34: note: in expansion of macro 'this_cpu_add_return'
    #define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1)
                                     ^~~~~~~~~~~~~~~~~~~
>> arch/x86/kernel/nmi.c:544:6: note: in expansion of macro 'this_cpu_dec_return'
     if (this_cpu_dec_return(nmi_state))
         ^~~~~~~~~~~~~~~~~~~
--
   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from include/linux/wait.h:8,
                    from include/linux/fs.h:5,
                    from mm/vmstat.c:12:
   include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~

vim +/this_cpu_dec_return +544 arch/x86/kernel/nmi.c

1d48922c Don Zickus       2011-09-30  528  	inc_irq_stat(__nmi_count);
1d48922c Don Zickus       2011-09-30  529  
1d48922c Don Zickus       2011-09-30  530  	if (!ignore_nmis)
1d48922c Don Zickus       2011-09-30  531  		default_do_nmi(regs);
1d48922c Don Zickus       2011-09-30  532  
1d48922c Don Zickus       2011-09-30  533  	nmi_exit();
228bdaa9 Steven Rostedt   2011-12-09  534  
9d050416 Andy Lutomirski  2015-07-15  535  #ifdef CONFIG_X86_64
9d050416 Andy Lutomirski  2015-07-15  536  	if (unlikely(this_cpu_read(update_debug_stack))) {
9d050416 Andy Lutomirski  2015-07-15  537  		debug_stack_reset();
9d050416 Andy Lutomirski  2015-07-15  538  		this_cpu_write(update_debug_stack, 0);
9d050416 Andy Lutomirski  2015-07-15  539  	}
9d050416 Andy Lutomirski  2015-07-15  540  #endif
9d050416 Andy Lutomirski  2015-07-15  541  
9d050416 Andy Lutomirski  2015-07-15  542  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
9d050416 Andy Lutomirski  2015-07-15  543  		write_cr2(this_cpu_read(nmi_cr2));
9d050416 Andy Lutomirski  2015-07-15 @544  	if (this_cpu_dec_return(nmi_state))
9d050416 Andy Lutomirski  2015-07-15  545  		goto nmi_restart;
1d48922c Don Zickus       2011-09-30  546  }
9326638c Masami Hiramatsu 2014-04-17  547  NOKPROBE_SYMBOL(do_nmi);
1d48922c Don Zickus       2011-09-30  548  
1d48922c Don Zickus       2011-09-30  549  void stop_nmi(void)
1d48922c Don Zickus       2011-09-30  550  {
1d48922c Don Zickus       2011-09-30  551  	ignore_nmis++;
1d48922c Don Zickus       2011-09-30  552  }

:::::: The code at line 544 was first introduced by commit
:::::: 9d05041679904b12c12421cbcf9cb5f4860a8d7b x86/nmi: Enable nested do_nmi() handling for 64-bit kernels

:::::: TO: Andy Lutomirski <luto@kernel.org>
:::::: CC: Ingo Molnar <mingo@kernel.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28216 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:30   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 2873 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: tile-tilegx_defconfig (attached as .config)
compiler: tilegx-linux-gcc (GCC) 4.6.2
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=tile 

All errors (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
   mm/vmstat.c:476:1: note: each undeclared identifier is reported only once for each function it appears in
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 15729 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:30   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, Nicholas Piggin,
	linux-kernel, linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 7000 bytes --]

Hi Nicholas,

[auto build test WARNING on asm-generic/master]
[also build test WARNING on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: i386-randconfig-s0-201638 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from arch/x86/kernel/nmi.c:13:
   arch/x86/kernel/nmi.c: In function 'do_nmi':
   include/asm-generic/percpu.h:138:17: warning: unused variable '__p' [-Wunused-variable]
     typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));   \
                    ^
   include/asm-generic/percpu.h:378:41: note: in expansion of macro 'this_cpu_generic_add_return'
    #define this_cpu_add_return_8(pcp, val) this_cpu_generic_add_return(pcp, val)
                                            ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_add_return_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:499:39: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
                                          ^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:511:34: note: in expansion of macro 'this_cpu_add_return'
    #define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1)
                                     ^~~~~~~~~~~~~~~~~~~
>> arch/x86/kernel/nmi.c:544:6: note: in expansion of macro 'this_cpu_dec_return'
     if (this_cpu_dec_return(nmi_state))
         ^~~~~~~~~~~~~~~~~~~
--
   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from include/linux/wait.h:8,
                    from include/linux/fs.h:5,
                    from mm/vmstat.c:12:
   include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~

vim +/this_cpu_dec_return +544 arch/x86/kernel/nmi.c

1d48922c Don Zickus       2011-09-30  528  	inc_irq_stat(__nmi_count);
1d48922c Don Zickus       2011-09-30  529  
1d48922c Don Zickus       2011-09-30  530  	if (!ignore_nmis)
1d48922c Don Zickus       2011-09-30  531  		default_do_nmi(regs);
1d48922c Don Zickus       2011-09-30  532  
1d48922c Don Zickus       2011-09-30  533  	nmi_exit();
228bdaa9 Steven Rostedt   2011-12-09  534  
9d050416 Andy Lutomirski  2015-07-15  535  #ifdef CONFIG_X86_64
9d050416 Andy Lutomirski  2015-07-15  536  	if (unlikely(this_cpu_read(update_debug_stack))) {
9d050416 Andy Lutomirski  2015-07-15  537  		debug_stack_reset();
9d050416 Andy Lutomirski  2015-07-15  538  		this_cpu_write(update_debug_stack, 0);
9d050416 Andy Lutomirski  2015-07-15  539  	}
9d050416 Andy Lutomirski  2015-07-15  540  #endif
9d050416 Andy Lutomirski  2015-07-15  541  
9d050416 Andy Lutomirski  2015-07-15  542  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
9d050416 Andy Lutomirski  2015-07-15  543  		write_cr2(this_cpu_read(nmi_cr2));
9d050416 Andy Lutomirski  2015-07-15 @544  	if (this_cpu_dec_return(nmi_state))
9d050416 Andy Lutomirski  2015-07-15  545  		goto nmi_restart;
1d48922c Don Zickus       2011-09-30  546  }
9326638c Masami Hiramatsu 2014-04-17  547  NOKPROBE_SYMBOL(do_nmi);
1d48922c Don Zickus       2011-09-30  548  
1d48922c Don Zickus       2011-09-30  549  void stop_nmi(void)
1d48922c Don Zickus       2011-09-30  550  {
1d48922c Don Zickus       2011-09-30  551  	ignore_nmis++;
1d48922c Don Zickus       2011-09-30  552  }

:::::: The code at line 544 was first introduced by commit
:::::: 9d05041679904b12c12421cbcf9cb5f4860a8d7b x86/nmi: Enable nested do_nmi() handling for 64-bit kernels

:::::: TO: Andy Lutomirski <luto@kernel.org>
:::::: CC: Ingo Molnar <mingo@kernel.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28216 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:30   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, linux-kernel,
	linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 2873 bytes --]

Hi Nicholas,

[auto build test ERROR on asm-generic/master]
[also build test ERROR on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: tile-tilegx_defconfig (attached as .config)
compiler: tilegx-linux-gcc (GCC) 4.6.2
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=tile 

All errors (new ones prefixed by >>):

   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
>> mm/vmstat.c:476:1: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
   mm/vmstat.c:476:1: note: each undeclared identifier is reported only once for each function it appears in
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given

vim +/raw_cpu_generic_xchg +476 mm/vmstat.c

ee99c71c KOSAKI Motohiro   2009-03-31  470  	for_each_populated_zone(zone) {
fbc2edb0 Christoph Lameter 2013-09-11  471  		struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a Christoph Lameter 2006-06-30  472  
fbc2edb0 Christoph Lameter 2013-09-11  473  		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
a7f75e25 Christoph Lameter 2008-02-04  474  			int v;
a7f75e25 Christoph Lameter 2008-02-04  475  
fbc2edb0 Christoph Lameter 2013-09-11 @476  			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
fbc2edb0 Christoph Lameter 2013-09-11  477  			if (v) {
fbc2edb0 Christoph Lameter 2013-09-11  478  
a7f75e25 Christoph Lameter 2008-02-04  479  				atomic_long_add(v, &zone->vm_stat[i]);

:::::: The code at line 476 was first introduced by commit
:::::: fbc2edb05354480a88aa39db8a6acb5782fa1a1b vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats

:::::: TO: Christoph Lameter <cl@linux.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 15729 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:30   ` kbuild test robot
  0 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2016-09-21 10:30 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: kbuild-all, Tejun Heo, Christoph Lameter, linux-kernel,
	linux-arch, linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 7000 bytes --]

Hi Nicholas,

[auto build test WARNING on asm-generic/master]
[also build test WARNING on v4.8-rc7 next-20160920]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/percpu-improve-generic-percpu-modify-return-implementation/20160921-170016
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master
config: i386-randconfig-s0-201638 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from arch/x86/kernel/nmi.c:13:
   arch/x86/kernel/nmi.c: In function 'do_nmi':
   include/asm-generic/percpu.h:138:17: warning: unused variable '__p' [-Wunused-variable]
     typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));   \
                    ^
   include/asm-generic/percpu.h:378:41: note: in expansion of macro 'this_cpu_generic_add_return'
    #define this_cpu_add_return_8(pcp, val) this_cpu_generic_add_return(pcp, val)
                                            ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_add_return_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:499:39: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
                                          ^~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/percpu-defs.h:511:34: note: in expansion of macro 'this_cpu_add_return'
    #define this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1)
                                     ^~~~~~~~~~~~~~~~~~~
>> arch/x86/kernel/nmi.c:544:6: note: in expansion of macro 'this_cpu_dec_return'
     if (this_cpu_dec_return(nmi_state))
         ^~~~~~~~~~~~~~~~~~~
--
   mm/vmstat.c: In function 'refresh_cpu_vm_stats':
   mm/vmstat.c:476:1: error: macro "raw_cpu_generic_xchg" requires 2 arguments, but only 1 given
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
    ^  ~~~~~~~~~~~~
   In file included from arch/x86/include/asm/percpu.h:551:0,
                    from arch/x86/include/asm/preempt.h:5,
                    from include/linux/preempt.h:59,
                    from include/linux/spinlock.h:50,
                    from include/linux/wait.h:8,
                    from include/linux/fs.h:5,
                    from mm/vmstat.c:12:
   include/asm-generic/percpu.h:152:10: error: 'raw_cpu_generic_xchg' undeclared (first use in this function)
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~
   include/asm-generic/percpu.h:152:10: note: each undeclared identifier is reported only once for each function it appears in
     __ret = raw_cpu_generic_xchg(pcp);    \
             ^
   include/asm-generic/percpu.h:391:36: note: in expansion of macro 'this_cpu_generic_xchg'
    #define this_cpu_xchg_8(pcp, nval) this_cpu_generic_xchg(pcp, nval)
                                       ^~~~~~~~~~~~~~~~~~~~~
>> include/linux/percpu-defs.h:326:24: note: in expansion of macro 'this_cpu_xchg_8'
     case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
                           ^~~~
   include/linux/percpu-defs.h:500:34: note: in expansion of macro '__pcpu_size_call_return2'
    #define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
                                     ^~~~~~~~~~~~~~~~~~~~~~~~
   mm/vmstat.c:476:8: note: in expansion of macro 'this_cpu_xchg'
       v = this_cpu_xchg(p->vm_stat_diff[i], 0);
           ^~~~~~~~~~~~~

vim +/this_cpu_dec_return +544 arch/x86/kernel/nmi.c

1d48922c Don Zickus       2011-09-30  528  	inc_irq_stat(__nmi_count);
1d48922c Don Zickus       2011-09-30  529  
1d48922c Don Zickus       2011-09-30  530  	if (!ignore_nmis)
1d48922c Don Zickus       2011-09-30  531  		default_do_nmi(regs);
1d48922c Don Zickus       2011-09-30  532  
1d48922c Don Zickus       2011-09-30  533  	nmi_exit();
228bdaa9 Steven Rostedt   2011-12-09  534  
9d050416 Andy Lutomirski  2015-07-15  535  #ifdef CONFIG_X86_64
9d050416 Andy Lutomirski  2015-07-15  536  	if (unlikely(this_cpu_read(update_debug_stack))) {
9d050416 Andy Lutomirski  2015-07-15  537  		debug_stack_reset();
9d050416 Andy Lutomirski  2015-07-15  538  		this_cpu_write(update_debug_stack, 0);
9d050416 Andy Lutomirski  2015-07-15  539  	}
9d050416 Andy Lutomirski  2015-07-15  540  #endif
9d050416 Andy Lutomirski  2015-07-15  541  
9d050416 Andy Lutomirski  2015-07-15  542  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
9d050416 Andy Lutomirski  2015-07-15  543  		write_cr2(this_cpu_read(nmi_cr2));
9d050416 Andy Lutomirski  2015-07-15 @544  	if (this_cpu_dec_return(nmi_state))
9d050416 Andy Lutomirski  2015-07-15  545  		goto nmi_restart;
1d48922c Don Zickus       2011-09-30  546  }
9326638c Masami Hiramatsu 2014-04-17  547  NOKPROBE_SYMBOL(do_nmi);
1d48922c Don Zickus       2011-09-30  548  
1d48922c Don Zickus       2011-09-30  549  void stop_nmi(void)
1d48922c Don Zickus       2011-09-30  550  {
1d48922c Don Zickus       2011-09-30  551  	ignore_nmis++;
1d48922c Don Zickus       2011-09-30  552  }

:::::: The code at line 544 was first introduced by commit
:::::: 9d05041679904b12c12421cbcf9cb5f4860a8d7b x86/nmi: Enable nested do_nmi() handling for 64-bit kernels

:::::: TO: Andy Lutomirski <luto@kernel.org>
:::::: CC: Ingo Molnar <mingo@kernel.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28216 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21  8:51 [PATCH] percpu: improve generic percpu modify-return implementation Nicholas Piggin
  2016-09-21 10:25   ` kbuild test robot
@ 2016-09-21 10:57   ` Nicholas Piggin
  2016-09-21 10:30   ` kbuild test robot
  2016-09-21 10:57   ` Nicholas Piggin
  3 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-21 10:57 UTC (permalink / raw)
  To: Tejun Heo, Christoph Lameter; +Cc: linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 18:51:37 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> Some architectures require an additional load to find the address of
> percpu pointers. In some implemenatations, the C aliasing rules do not
> allow the result of that load to be kept over the store that modifies
> the percpu variable, which causes additional loads.

Sorry I picked up an old patch here. This one should be better.

>From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementations

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:57   ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-21 10:57 UTC (permalink / raw)
  To: Tejun Heo, Christoph Lameter; +Cc: linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 18:51:37 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> Some architectures require an additional load to find the address of
> percpu pointers. In some implemenatations, the C aliasing rules do not
> allow the result of that load to be kept over the store that modifies
> the percpu variable, which causes additional loads.

Sorry I picked up an old patch here. This one should be better.

From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementations

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 10:57   ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-21 10:57 UTC (permalink / raw)
  To: Tejun Heo, Christoph Lameter; +Cc: linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 18:51:37 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> Some architectures require an additional load to find the address of
> percpu pointers. In some implemenatations, the C aliasing rules do not
> allow the result of that load to be kept over the store that modifies
> the percpu variable, which causes additional loads.

Sorry I picked up an old patch here. This one should be better.

=46rom d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementatio=
ns

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++---------------=
----
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
=20
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
=20
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
+									\
+	*__p +=3D val;							\
+	*__p;								\
 })
=20
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret =3D raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret =3D *__p;							\
+	*__p =3D nval;							\
 	__ret;								\
 })
=20
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret =3D raw_cpu_read(pcp);					\
+	__ret =3D *__p;							\
 	if (__ret =3D=3D (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p =3D nval;						\
 	__ret;								\
 })
=20
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nv=
al2) \
 ({									\
+	typeof(&(pcp1)) __p1 =3D raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 =3D raw_cpu_ptr(&(pcp2));			\
 	int __ret =3D 0;							\
-	if (raw_cpu_read(pcp1) =3D=3D (oval1) &&				\
-			 raw_cpu_read(pcp2)  =3D=3D (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 =3D=3D (oval1) && *__p2  =3D=3D (oval2)) {			\
+		*__p1 =3D nval1;						\
+		*__p2 =3D nval2;						\
 		__ret =3D 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret =3D *this_cpu_ptr(&(pcp));					\
+	__ret =3D raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
=20
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret =3D raw_cpu_read(pcp);					\
+	__ret =3D raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret =3D raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret =3D raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret =3D raw_cpu_read(pcp);					\
-	if (__ret =3D=3D (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret =3D raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
=20
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
=20
 #ifndef raw_cpu_write_1
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21 10:57   ` Nicholas Piggin
  (?)
  (?)
@ 2016-09-21 14:23   ` Tejun Heo
  2016-09-21 20:16       ` Christoph Lameter
  2016-09-22  4:35       ` Nicholas Piggin
  -1 siblings, 2 replies; 24+ messages in thread
From: Tejun Heo @ 2016-09-21 14:23 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

Hello, Nick.

How have you been? :)

On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote:
> On Wed, 21 Sep 2016 18:51:37 +1000
> Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> 
> Sorry I picked up an old patch here. This one should be better.
> 
> From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> From: Nicholas Piggin <npiggin@gmail.com>
> Date: Wed, 21 Sep 2016 18:23:43 +1000
> Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> 
> Some architectures require an additional load to find the address of
> percpu pointers. In some implemenatations, the C aliasing rules do not
> allow the result of that load to be kept over the store that modifies
> the percpu variable, which causes additional loads.
> 
> Work around this by finding the pointer first, then operating on that.
> 
> It's also possible to mark things as restrict and those kind of games,
> but that can require larger and arch specific changes.
> 
> On powerpc, __this_cpu_inc_return compiles to:
> 
>         ld 10,48(13)
>         ldx 9,3,10
>         addi 9,9,1
>         stdx 9,3,10
>         ld 9,48(13)
>         ldx 3,9,3
> 
> With this patch it compiles to:
> 
>         ld 10,48(13)
>         ldx 9,3,10
>         addi 9,9,1
>         stdx 9,3,10
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Patch looks good to me but seems QP encoded.  Can you please resend?

Thanks and it's great to see you again!

-- 
tejun

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21 14:23   ` Tejun Heo
@ 2016-09-21 20:16       ` Christoph Lameter
  2016-09-22  4:35       ` Nicholas Piggin
  1 sibling, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2016-09-21 20:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nicholas Piggin, linux-kernel, linux-arch, linux-mm, linuxppc-dev

On Wed, 21 Sep 2016, Tejun Heo wrote:

> Hello, Nick.
>
> How have you been? :)
>

He is baack. Are we getting SL!B? ;-)

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-21 20:16       ` Christoph Lameter
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Lameter @ 2016-09-21 20:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nicholas Piggin, linux-kernel, linux-arch, linux-mm, linuxppc-dev

On Wed, 21 Sep 2016, Tejun Heo wrote:

> Hello, Nick.
>
> How have you been? :)
>

He is baack. Are we getting SL!B? ;-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21 14:23   ` Tejun Heo
  2016-09-21 20:16       ` Christoph Lameter
  2016-09-22  4:35       ` Nicholas Piggin
@ 2016-09-22  4:35       ` Nicholas Piggin
  1 sibling, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 10:23:43 -0400
Tejun Heo <tj@kernel.org> wrote:

> Hello, Nick.
> 
> How have you been? :)

Hey Tejun,

Well thank you, how about you?
 
> On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote:
> > On Wed, 21 Sep 2016 18:51:37 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >   
> > > Some architectures require an additional load to find the address of
> > > percpu pointers. In some implemenatations, the C aliasing rules do not
> > > allow the result of that load to be kept over the store that modifies
> > > the percpu variable, which causes additional loads.  
> > 
> > Sorry I picked up an old patch here. This one should be better.
> > 
> > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> > From: Nicholas Piggin <npiggin@gmail.com>
> > Date: Wed, 21 Sep 2016 18:23:43 +1000
> > Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> > 
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> > 
> > Work around this by finding the pointer first, then operating on that.
> > 
> > It's also possible to mark things as restrict and those kind of games,
> > but that can require larger and arch specific changes.
> > 
> > On powerpc, __this_cpu_inc_return compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> >         ld 9,48(13)
> >         ldx 3,9,3
> > 
> > With this patch it compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> > 
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>  
> 
> Patch looks good to me but seems QP encoded.  Can you please resend?
> 
> Thanks and it's great to see you again!
> 

Trying a new mail client, sorry. It *seems* to be working now, how's
this?

>From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementations

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

To: Tejun Heo <tj@kernel.org>
To: Christoph Lameter <cl@linux.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-22  4:35       ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: linux-arch, Christoph Lameter, linuxppc-dev, linux-kernel

On Wed, 21 Sep 2016 10:23:43 -0400
Tejun Heo <tj@kernel.org> wrote:

> Hello, Nick.
> 
> How have you been? :)

Hey Tejun,

Well thank you, how about you?
 
> On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote:
> > On Wed, 21 Sep 2016 18:51:37 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >   
> > > Some architectures require an additional load to find the address of
> > > percpu pointers. In some implemenatations, the C aliasing rules do not
> > > allow the result of that load to be kept over the store that modifies
> > > the percpu variable, which causes additional loads.  
> > 
> > Sorry I picked up an old patch here. This one should be better.
> > 
> > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> > From: Nicholas Piggin <npiggin@gmail.com>
> > Date: Wed, 21 Sep 2016 18:23:43 +1000
> > Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> > 
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> > 
> > Work around this by finding the pointer first, then operating on that.
> > 
> > It's also possible to mark things as restrict and those kind of games,
> > but that can require larger and arch specific changes.
> > 
> > On powerpc, __this_cpu_inc_return compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> >         ld 9,48(13)
> >         ldx 3,9,3
> > 
> > With this patch it compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> > 
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>  
> 
> Patch looks good to me but seems QP encoded.  Can you please resend?
> 
> Thanks and it's great to see you again!
> 

Trying a new mail client, sorry. It *seems* to be working now, how's
this?

From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementations

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

To: Tejun Heo <tj@kernel.org>
To: Christoph Lameter <cl@linux.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-22  4:35       ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 10:23:43 -0400
Tejun Heo <tj@kernel.org> wrote:

> Hello, Nick.
> 
> How have you been? :)

Hey Tejun,

Well thank you, how about you?
 
> On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote:
> > On Wed, 21 Sep 2016 18:51:37 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >   
> > > Some architectures require an additional load to find the address of
> > > percpu pointers. In some implemenatations, the C aliasing rules do not
> > > allow the result of that load to be kept over the store that modifies
> > > the percpu variable, which causes additional loads.  
> > 
> > Sorry I picked up an old patch here. This one should be better.
> > 
> > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> > From: Nicholas Piggin <npiggin@gmail.com>
> > Date: Wed, 21 Sep 2016 18:23:43 +1000
> > Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> > 
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> > 
> > Work around this by finding the pointer first, then operating on that.
> > 
> > It's also possible to mark things as restrict and those kind of games,
> > but that can require larger and arch specific changes.
> > 
> > On powerpc, __this_cpu_inc_return compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> >         ld 9,48(13)
> >         ldx 3,9,3
> > 
> > With this patch it compiles to:
> > 
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> > 
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>  
> 
> Patch looks good to me but seems QP encoded.  Can you please resend?
> 
> Thanks and it's great to see you again!
> 

Trying a new mail client, sorry. It *seems* to be working now, how's
this?

From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementations

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

To: Tejun Heo <tj@kernel.org>
To: Christoph Lameter <cl@linux.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
 
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
 
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
+									\
+	*__p += val;							\
+	*__p;								\
 })
 
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = *__p;							\
+	*__p = nval;							\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = *__p;							\
 	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p = nval;						\
 	__ret;								\
 })
 
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
 ({									\
+	typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2));			\
 	int __ret = 0;							\
-	if (raw_cpu_read(pcp1) == (oval1) &&				\
-			 raw_cpu_read(pcp2)  == (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 == (oval1) && *__p2  == (oval2)) {			\
+		*__p1 = nval1;						\
+		*__p2 = nval2;						\
 		__ret = 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret = *this_cpu_ptr(&(pcp));					\
+	__ret = raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
 
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret = raw_cpu_read(pcp);					\
+	__ret = raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret = raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret = raw_cpu_read(pcp);					\
-	if (__ret == (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
 
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
 
 #ifndef raw_cpu_write_1
-- 
2.9.3


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-22  4:35       ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:35 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

On Wed, 21 Sep 2016 10:23:43 -0400
Tejun Heo <tj@kernel.org> wrote:

> Hello, Nick.
>=20
> How have you been? :)

Hey Tejun,

Well thank you, how about you?
=20
> On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote:
> > On Wed, 21 Sep 2016 18:51:37 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >  =20
> > > Some architectures require an additional load to find the address of
> > > percpu pointers. In some implemenatations, the C aliasing rules do not
> > > allow the result of that load to be kept over the store that modifies
> > > the percpu variable, which causes additional loads. =20
> >=20
> > Sorry I picked up an old patch here. This one should be better.
> >=20
> > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> > From: Nicholas Piggin <npiggin@gmail.com>
> > Date: Wed, 21 Sep 2016 18:23:43 +1000
> > Subject: [PATCH] percpu: improve generic percpu modify-return implement=
ations
> >=20
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> >=20
> > Work around this by finding the pointer first, then operating on that.
> >=20
> > It's also possible to mark things as restrict and those kind of games,
> > but that can require larger and arch specific changes.
> >=20
> > On powerpc, __this_cpu_inc_return compiles to:
> >=20
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> >         ld 9,48(13)
> >         ldx 3,9,3
> >=20
> > With this patch it compiles to:
> >=20
> >         ld 10,48(13)
> >         ldx 9,3,10
> >         addi 9,9,1
> >         stdx 9,3,10
> >=20
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com> =20
>=20
> Patch looks good to me but seems QP encoded.  Can you please resend?
>=20
> Thanks and it's great to see you again!
>=20

Trying a new mail client, sorry. It *seems* to be working now, how's
this?

=46rom d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 21 Sep 2016 18:23:43 +1000
Subject: [PATCH] percpu: improve generic percpu modify-return implementatio=
ns

Some architectures require an additional load to find the address of
percpu pointers. In some implemenatations, the C aliasing rules do not
allow the result of that load to be kept over the store that modifies
the percpu variable, which causes additional loads.

Work around this by finding the pointer first, then operating on that.

It's also possible to mark things as restrict and those kind of games,
but that can require larger and arch specific changes.

On powerpc, __this_cpu_inc_return compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10
        ld 9,48(13)
        ldx 3,9,3

With this patch it compiles to:

        ld 10,48(13)
        ldx 9,3,10
        addi 9,9,1
        stdx 9,3,10

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

To: Tejun Heo <tj@kernel.org>
To: Christoph Lameter <cl@linux.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-arch@vger.kernel.org
---
 include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++---------------=
----
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 4d9f233..40e8870 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void);
 #define PER_CPU_DEF_ATTRIBUTES
 #endif
=20
+#define raw_cpu_generic_read(pcp)					\
+({									\
+	*raw_cpu_ptr(&(pcp));						\
+})
+
 #define raw_cpu_generic_to_op(pcp, val, op)				\
 do {									\
 	*raw_cpu_ptr(&(pcp)) op val;					\
@@ -72,34 +77,39 @@ do {									\
=20
 #define raw_cpu_generic_add_return(pcp, val)				\
 ({									\
-	raw_cpu_add(pcp, val);						\
-	raw_cpu_read(pcp);						\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
+									\
+	*__p +=3D val;							\
+	*__p;								\
 })
=20
 #define raw_cpu_generic_xchg(pcp, nval)					\
 ({									\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret =3D raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret =3D *__p;							\
+	*__p =3D nval;							\
 	__ret;								\
 })
=20
 #define raw_cpu_generic_cmpxchg(pcp, oval, nval)			\
 ({									\
+	typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp));			\
 	typeof(pcp) __ret;						\
-	__ret =3D raw_cpu_read(pcp);					\
+	__ret =3D *__p;							\
 	if (__ret =3D=3D (oval))						\
-		raw_cpu_write(pcp, nval);				\
+		*__p =3D nval;						\
 	__ret;								\
 })
=20
 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nv=
al2) \
 ({									\
+	typeof(&(pcp1)) __p1 =3D raw_cpu_ptr(&(pcp1));			\
+	typeof(&(pcp2)) __p2 =3D raw_cpu_ptr(&(pcp2));			\
 	int __ret =3D 0;							\
-	if (raw_cpu_read(pcp1) =3D=3D (oval1) &&				\
-			 raw_cpu_read(pcp2)  =3D=3D (oval2)) {		\
-		raw_cpu_write(pcp1, nval1);				\
-		raw_cpu_write(pcp2, nval2);				\
+	if (*__p1 =3D=3D (oval1) && *__p2  =3D=3D (oval2)) {			\
+		*__p1 =3D nval1;						\
+		*__p2 =3D nval2;						\
 		__ret =3D 1;						\
 	}								\
 	(__ret);							\
@@ -109,7 +119,7 @@ do {									\
 ({									\
 	typeof(pcp) __ret;						\
 	preempt_disable();						\
-	__ret =3D *this_cpu_ptr(&(pcp));					\
+	__ret =3D raw_cpu_generic_read(pcp);				\
 	preempt_enable();						\
 	__ret;								\
 })
@@ -118,17 +128,17 @@ do {									\
 do {									\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	*raw_cpu_ptr(&(pcp)) op val;					\
+	raw_cpu_generic_to_op(pcp, val, op);				\
 	raw_local_irq_restore(__flags);					\
 } while (0)
=20
+
 #define this_cpu_generic_add_return(pcp, val)				\
 ({									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	raw_cpu_add(pcp, val);						\
-	__ret =3D raw_cpu_read(pcp);					\
+	__ret =3D raw_cpu_generic_add_return(pcp, val);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -138,8 +148,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret =3D raw_cpu_read(pcp);					\
-	raw_cpu_write(pcp, nval);					\
+	__ret =3D raw_cpu_generic_xchg(pcp, nval);			\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -149,9 +158,7 @@ do {									\
 	typeof(pcp) __ret;						\
 	unsigned long __flags;						\
 	raw_local_irq_save(__flags);					\
-	__ret =3D raw_cpu_read(pcp);					\
-	if (__ret =3D=3D (oval))						\
-		raw_cpu_write(pcp, nval);				\
+	__ret =3D raw_cpu_generic_cmpxchg(pcp, oval, nval);		\
 	raw_local_irq_restore(__flags);					\
 	__ret;								\
 })
@@ -168,16 +175,16 @@ do {									\
 })
=20
 #ifndef raw_cpu_read_1
-#define raw_cpu_read_1(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_1(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_2
-#define raw_cpu_read_2(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_2(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_4
-#define raw_cpu_read_4(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_4(pcp)		raw_cpu_generic_read(pcp)
 #endif
 #ifndef raw_cpu_read_8
-#define raw_cpu_read_8(pcp)		(*raw_cpu_ptr(&(pcp)))
+#define raw_cpu_read_8(pcp)		raw_cpu_generic_read(pcp)
 #endif
=20
 #ifndef raw_cpu_write_1
--=20
2.9.3

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-21 20:16       ` Christoph Lameter
@ 2016-09-22  4:42         ` Nicholas Piggin
  -1 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, linux-kernel, linux-arch, linux-mm, linuxppc-dev

On Wed, 21 Sep 2016 15:16:25 -0500 (CDT)
Christoph Lameter <cl@linux.com> wrote:

> On Wed, 21 Sep 2016, Tejun Heo wrote:
> 
> > Hello, Nick.
> >
> > How have you been? :)
> >  
> 
> He is baack. Are we getting SL!B? ;-)
> 

Hey Christoph. Sure, why not.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
@ 2016-09-22  4:42         ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-22  4:42 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Tejun Heo, linux-kernel, linux-arch, linux-mm, linuxppc-dev

On Wed, 21 Sep 2016 15:16:25 -0500 (CDT)
Christoph Lameter <cl@linux.com> wrote:

> On Wed, 21 Sep 2016, Tejun Heo wrote:
> 
> > Hello, Nick.
> >
> > How have you been? :)
> >  
> 
> He is baack. Are we getting SL!B? ;-)
> 

Hey Christoph. Sure, why not.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-22  4:35       ` Nicholas Piggin
                         ` (2 preceding siblings ...)
  (?)
@ 2016-09-22 16:07       ` Tejun Heo
  2016-09-23  7:33         ` Nicholas Piggin
  -1 siblings, 1 reply; 24+ messages in thread
From: Tejun Heo @ 2016-09-22 16:07 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

Hello,

On Thu, Sep 22, 2016 at 02:35:00PM +1000, Nicholas Piggin wrote:
> Well thank you, how about you?

Heh, can't complain.  Hope to see you around sometime.  It's been
forever.

> Trying a new mail client, sorry. It *seems* to be working now, how's
> this?

Hmm... Still encoded.

> From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> From: Nicholas Piggin <npiggin@gmail.com>
> Date: Wed, 21 Sep 2016 18:23:43 +1000
> Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> 
> Some architectures require an additional load to find the address of
> percpu pointers. In some implemenatations, the C aliasing rules do not
> allow the result of that load to be kept over the store that modifies
> the percpu variable, which causes additional loads.
> 
> Work around this by finding the pointer first, then operating on that.
> 
> It's also possible to mark things as restrict and those kind of games,
> but that can require larger and arch specific changes.

QP-decoded and applied to percpu/for-4.9.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] percpu: improve generic percpu modify-return implementation
  2016-09-22 16:07       ` Tejun Heo
@ 2016-09-23  7:33         ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2016-09-23  7:33 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Christoph Lameter, linux-kernel, linux-arch, linuxppc-dev

On Thu, 22 Sep 2016 12:07:49 -0400
Tejun Heo <tj@kernel.org> wrote:

> Hello,
> 
> On Thu, Sep 22, 2016 at 02:35:00PM +1000, Nicholas Piggin wrote:
> > Well thank you, how about you?  
> 
> Heh, can't complain.  Hope to see you around sometime.  It's been
> forever.

Yeah, it has been. Hopefully I'll see you around.


> > Trying a new mail client, sorry. It *seems* to be working now, how's
> > this?  
> 
> Hmm... Still encoded.

It looks to be a helpful surprise feature of claws. Any line starting with
the word "From" make it go q-p due to some mail servers treating that
differently. Sigh.

> 
> > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001
> > From: Nicholas Piggin <npiggin@gmail.com>
> > Date: Wed, 21 Sep 2016 18:23:43 +1000
> > Subject: [PATCH] percpu: improve generic percpu modify-return implementations
> > 
> > Some architectures require an additional load to find the address of
> > percpu pointers. In some implemenatations, the C aliasing rules do not
> > allow the result of that load to be kept over the store that modifies
> > the percpu variable, which causes additional loads.
> > 
> > Work around this by finding the pointer first, then operating on that.
> > 
> > It's also possible to mark things as restrict and those kind of games,
> > but that can require larger and arch specific changes.  
> 
> QP-decoded and applied to percpu/for-4.9.
> 
> Thanks.
> 

Thanks!

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2016-09-23  7:34 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-21  8:51 [PATCH] percpu: improve generic percpu modify-return implementation Nicholas Piggin
2016-09-21 10:25 ` kbuild test robot
2016-09-21 10:25   ` kbuild test robot
2016-09-21 10:25   ` kbuild test robot
2016-09-21 10:30 ` kbuild test robot
2016-09-21 10:30   ` kbuild test robot
2016-09-21 10:30   ` kbuild test robot
2016-09-21 10:30 ` kbuild test robot
2016-09-21 10:30   ` kbuild test robot
2016-09-21 10:30   ` kbuild test robot
2016-09-21 10:57 ` Nicholas Piggin
2016-09-21 10:57   ` Nicholas Piggin
2016-09-21 10:57   ` Nicholas Piggin
2016-09-21 14:23   ` Tejun Heo
2016-09-21 20:16     ` Christoph Lameter
2016-09-21 20:16       ` Christoph Lameter
2016-09-22  4:42       ` Nicholas Piggin
2016-09-22  4:42         ` Nicholas Piggin
2016-09-22  4:35     ` Nicholas Piggin
2016-09-22  4:35       ` Nicholas Piggin
2016-09-22  4:35       ` Nicholas Piggin
2016-09-22  4:35       ` Nicholas Piggin
2016-09-22 16:07       ` Tejun Heo
2016-09-23  7:33         ` Nicholas Piggin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.