From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753930AbcIVEt5 (ORCPT ); Thu, 22 Sep 2016 00:49:57 -0400 Received: from mail-pf0-f195.google.com ([209.85.192.195]:34741 "EHLO mail-pf0-f195.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751072AbcIVEtx (ORCPT ); Thu, 22 Sep 2016 00:49:53 -0400 Date: Thu, 22 Sep 2016 14:35:00 +1000 From: Nicholas Piggin To: Tejun Heo Cc: Christoph Lameter , linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org Subject: Re: [PATCH] percpu: improve generic percpu modify-return implementation Message-ID: <20160922143500.21809b98@roar.ozlabs.ibm.com> In-Reply-To: <20160921142343.GA10734@htj.duckdns.org> References: <20160921085137.862-1-npiggin@gmail.com> <20160921205711.4e804777@roar.ozlabs.ibm.com> <20160921142343.GA10734@htj.duckdns.org> Organization: IBM X-Mailer: Claws Mail 3.14.0 (GTK+ 2.24.31; x86_64-pc-linux-gnu) MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Transfer-Encoding: 8bit X-MIME-Autoconverted: from quoted-printable to 8bit by mail.home.local id u8M4o1PC010977 On Wed, 21 Sep 2016 10:23:43 -0400 Tejun Heo wrote: > Hello, Nick. > > How have you been? :) Hey Tejun, Well thank you, how about you? > On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote: > > On Wed, 21 Sep 2016 18:51:37 +1000 > > Nicholas Piggin wrote: > > > > > Some architectures require an additional load to find the address of > > > percpu pointers. In some implemenatations, the C aliasing rules do not > > > allow the result of that load to be kept over the store that modifies > > > the percpu variable, which causes additional loads. > > > > Sorry I picked up an old patch here. This one should be better. 
> > > > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 > > From: Nicholas Piggin > > Date: Wed, 21 Sep 2016 18:23:43 +1000 > > Subject: [PATCH] percpu: improve generic percpu modify-return implementations > > > > Some architectures require an additional load to find the address of > > percpu pointers. In some implementations, the C aliasing rules do not > > allow the result of that load to be kept over the store that modifies > > the percpu variable, which causes additional loads. > > > > Work around this by finding the pointer first, then operating on that. > > > > It's also possible to mark things as restrict and those kind of games, > > but that can require larger and arch specific changes. > > > > On powerpc, __this_cpu_inc_return compiles to: > > > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > > ld 9,48(13) > > ldx 3,9,3 > > > > With this patch it compiles to: > > > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > > > > Signed-off-by: Nicholas Piggin > > Patch looks good to me but seems QP encoded. Can you please resend? > > Thanks and it's great to see you again! > Trying a new mail client, sorry. It *seems* to be working now, how's this? >>From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Sep 2016 18:23:43 +1000 Subject: [PATCH] percpu: improve generic percpu modify-return implementations Some architectures require an additional load to find the address of percpu pointers. In some implementations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and those kind of games, but that can require larger and arch specific changes. 
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233..40e8870 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ + \ + *__p += val; \ + *__p; \ }) #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = *__p; \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p = raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret = raw_cpu_read(pcp); \ + __ret = *__p; \ if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p = nval; \ __ret; \ }) #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \ ({ \ + typeof(&(pcp1)) __p1 = raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 = raw_cpu_ptr(&(pcp2)); \ int __ret = 0; \ - if (raw_cpu_read(pcp1) == (oval1) && \ - raw_cpu_read(pcp2) == (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 == (oval1) && *__p2 == (oval2)) { \ + *__p1 = nval1; \ + *__p2 = nval2; 
\ __ret = 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret = *this_cpu_ptr(&(pcp)); \ + __ret = raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret = raw_cpu_read(pcp); \ + __ret = raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret = raw_cpu_read(pcp); \ - if (__ret == (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_write_1 -- 2.9.3 From mboxrd@z Thu Jan 1 00:00:00 1970 From: Nicholas Piggin Subject: Re: [PATCH] percpu: improve generic percpu modify-return 
implementation Date: Thu, 22 Sep 2016 14:35:00 +1000 Message-ID: <20160922143500.21809b98@roar.ozlabs.ibm.com> References: <20160921085137.862-1-npiggin@gmail.com> <20160921205711.4e804777@roar.ozlabs.ibm.com> <20160921142343.GA10734@htj.duckdns.org> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Return-path: In-Reply-To: <20160921142343.GA10734@htj.duckdns.org> List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: linuxppc-dev-bounces+glppe-linuxppc-embedded-2=m.gmane.org@lists.ozlabs.org Sender: "Linuxppc-dev" To: Tejun Heo Cc: linux-arch@vger.kernel.org, Christoph Lameter , linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org List-Id: linux-arch.vger.kernel.org On Wed, 21 Sep 2016 10:23:43 -0400 Tejun Heo wrote: > Hello, Nick. >=20 > How have you been? :) Hey Tejun, Well thank you, how about you? =20 > On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote: > > On Wed, 21 Sep 2016 18:51:37 +1000 > > Nicholas Piggin wrote: > > =20 > > > Some architectures require an additional load to find the address of > > > percpu pointers. In some implemenatations, the C aliasing rules do not > > > allow the result of that load to be kept over the store that modifies > > > the percpu variable, which causes additional loads. =20 > >=20 > > Sorry I picked up an old patch here. This one should be better. > >=20 > > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 > > From: Nicholas Piggin > > Date: Wed, 21 Sep 2016 18:23:43 +1000 > > Subject: [PATCH] percpu: improve generic percpu modify-return implement= ations > >=20 > > Some architectures require an additional load to find the address of > > percpu pointers. In some implemenatations, the C aliasing rules do not > > allow the result of that load to be kept over the store that modifies > > the percpu variable, which causes additional loads. 
> >=20 > > Work around this by finding the pointer first, then operating on that. > >=20 > > It's also possible to mark things as restrict and those kind of games, > > but that can require larger and arch specific changes. > >=20 > > On powerpc, __this_cpu_inc_return compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > > ld 9,48(13) > > ldx 3,9,3 > >=20 > > With this patch it compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > >=20 > > Signed-off-by: Nicholas Piggin =20 >=20 > Patch looks good to me but seems QP encoded. Can you please resend? >=20 > Thanks and it's great to see you again! >=20 Trying a new mail client, sorry. It *seems* to be working now, how's this? =46rom d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Sep 2016 18:23:43 +1000 Subject: [PATCH] percpu: improve generic percpu modify-return implementatio= ns Some architectures require an additional load to find the address of percpu pointers. In some implementations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and those kind of games, but that can require larger and arch specific changes. 
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++---------------= ---- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233..40e8870 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif =20 +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ =20 #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ + \ + *__p +=3D val; \ + *__p; \ }) =20 #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D *__p; \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D *__p; \ if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nv= al2) \ ({ \ + typeof(&(pcp1)) __p1 =3D raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 =3D raw_cpu_ptr(&(pcp2)); \ int __ret =3D 0; \ - if (raw_cpu_read(pcp1) =3D=3D (oval1) && \ - raw_cpu_read(pcp2) =3D=3D (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 =3D=3D 
(oval1) && *__p2 =3D=3D (oval2)) { \ + *__p1 =3D nval1; \ + *__p2 =3D nval2; \ __ret =3D 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret =3D *this_cpu_ptr(&(pcp)); \ + __ret =3D raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) =20 + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) =20 #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif =20 #ifndef raw_cpu_write_1 --=20 2.9.3 From mboxrd@z Thu 
Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf0-f195.google.com ([209.85.192.195]:34741 "EHLO mail-pf0-f195.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751072AbcIVEtx (ORCPT ); Thu, 22 Sep 2016 00:49:53 -0400 Date: Thu, 22 Sep 2016 14:35:00 +1000 From: Nicholas Piggin Subject: Re: [PATCH] percpu: improve generic percpu modify-return implementation Message-ID: <20160922143500.21809b98@roar.ozlabs.ibm.com> In-Reply-To: <20160921142343.GA10734@htj.duckdns.org> References: <20160921085137.862-1-npiggin@gmail.com> <20160921205711.4e804777@roar.ozlabs.ibm.com> <20160921142343.GA10734@htj.duckdns.org> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: quoted-printable Sender: linux-arch-owner@vger.kernel.org List-ID: To: Tejun Heo Cc: Christoph Lameter , linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org Message-ID: <20160922043500.R--5yN2x8Jt_F6ZJsSH4EDFECIMeh-tSZaqlIOzcDHM@z> On Wed, 21 Sep 2016 10:23:43 -0400 Tejun Heo wrote: > Hello, Nick. >=20 > How have you been? :) Hey Tejun, Well thank you, how about you? =20 > On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote: > > On Wed, 21 Sep 2016 18:51:37 +1000 > > Nicholas Piggin wrote: > > =20 > > > Some architectures require an additional load to find the address of > > > percpu pointers. In some implemenatations, the C aliasing rules do not > > > allow the result of that load to be kept over the store that modifies > > > the percpu variable, which causes additional loads. =20 > >=20 > > Sorry I picked up an old patch here. This one should be better. > >=20 > > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 > > From: Nicholas Piggin > > Date: Wed, 21 Sep 2016 18:23:43 +1000 > > Subject: [PATCH] percpu: improve generic percpu modify-return implement= ations > >=20 > > Some architectures require an additional load to find the address of > > percpu pointers. 
In some implementations, the C aliasing rules do not > > allow the result of that load to be kept over the store that modifies > > the percpu variable, which causes additional loads. > >=20 > > Work around this by finding the pointer first, then operating on that. > >=20 > > It's also possible to mark things as restrict and those kind of games, > > but that can require larger and arch specific changes. > >=20 > > On powerpc, __this_cpu_inc_return compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > > ld 9,48(13) > > ldx 3,9,3 > >=20 > > With this patch it compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > >=20 > > Signed-off-by: Nicholas Piggin =20 >=20 > Patch looks good to me but seems QP encoded. Can you please resend? >=20 > Thanks and it's great to see you again! >=20 Trying a new mail client, sorry. It *seems* to be working now, how's this? =46rom d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Sep 2016 18:23:43 +1000 Subject: [PATCH] percpu: improve generic percpu modify-return implementatio= ns Some architectures require an additional load to find the address of percpu pointers. In some implementations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and those kind of games, but that can require larger and arch specific changes. 
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++---------------= ---- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233..40e8870 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif =20 +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ =20 #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ + \ + *__p +=3D val; \ + *__p; \ }) =20 #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D *__p; \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D *__p; \ if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nv= al2) \ ({ \ + typeof(&(pcp1)) __p1 =3D raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 =3D raw_cpu_ptr(&(pcp2)); \ int __ret =3D 0; \ - if (raw_cpu_read(pcp1) =3D=3D (oval1) && \ - raw_cpu_read(pcp2) =3D=3D (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 =3D=3D 
(oval1) && *__p2 =3D=3D (oval2)) { \ + *__p1 =3D nval1; \ + *__p2 =3D nval2; \ __ret =3D 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret =3D *this_cpu_ptr(&(pcp)); \ + __ret =3D raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) =20 + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) =20 #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif =20 #ifndef raw_cpu_write_1 --=20 2.9.3 From mboxrd@z Thu 
Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf0-x243.google.com (mail-pf0-x243.google.com [IPv6:2607:f8b0:400e:c00::243]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by lists.ozlabs.org (Postfix) with ESMTPS id 3sfkDz0SYjzDsgg for ; Thu, 22 Sep 2016 14:35:15 +1000 (AEST) Received: by mail-pf0-x243.google.com with SMTP id 6so3225961pfl.2 for ; Wed, 21 Sep 2016 21:35:14 -0700 (PDT) Date: Thu, 22 Sep 2016 14:35:00 +1000 From: Nicholas Piggin To: Tejun Heo Cc: Christoph Lameter , linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org Subject: Re: [PATCH] percpu: improve generic percpu modify-return implementation Message-ID: <20160922143500.21809b98@roar.ozlabs.ibm.com> In-Reply-To: <20160921142343.GA10734@htj.duckdns.org> References: <20160921085137.862-1-npiggin@gmail.com> <20160921205711.4e804777@roar.ozlabs.ibm.com> <20160921142343.GA10734@htj.duckdns.org> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII List-Id: Linux on PowerPC Developers Mail List List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , On Wed, 21 Sep 2016 10:23:43 -0400 Tejun Heo wrote: > Hello, Nick. >=20 > How have you been? :) Hey Tejun, Well thank you, how about you? =20 > On Wed, Sep 21, 2016 at 08:57:11PM +1000, Nicholas Piggin wrote: > > On Wed, 21 Sep 2016 18:51:37 +1000 > > Nicholas Piggin wrote: > > =20 > > > Some architectures require an additional load to find the address of > > > percpu pointers. In some implemenatations, the C aliasing rules do not > > > allow the result of that load to be kept over the store that modifies > > > the percpu variable, which causes additional loads. =20 > >=20 > > Sorry I picked up an old patch here. This one should be better. 
> >=20 > > From d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 > > From: Nicholas Piggin > > Date: Wed, 21 Sep 2016 18:23:43 +1000 > > Subject: [PATCH] percpu: improve generic percpu modify-return implement= ations > >=20 > > Some architectures require an additional load to find the address of > > percpu pointers. In some implementations, the C aliasing rules do not > > allow the result of that load to be kept over the store that modifies > > the percpu variable, which causes additional loads. > >=20 > > Work around this by finding the pointer first, then operating on that. > >=20 > > It's also possible to mark things as restrict and those kind of games, > > but that can require larger and arch specific changes. > >=20 > > On powerpc, __this_cpu_inc_return compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > > ld 9,48(13) > > ldx 3,9,3 > >=20 > > With this patch it compiles to: > >=20 > > ld 10,48(13) > > ldx 9,3,10 > > addi 9,9,1 > > stdx 9,3,10 > >=20 > > Signed-off-by: Nicholas Piggin =20 >=20 > Patch looks good to me but seems QP encoded. Can you please resend? >=20 > Thanks and it's great to see you again! >=20 Trying a new mail client, sorry. It *seems* to be working now, how's this? =46rom d0cb9052d6f4c31d24f999b7b0cecb34681eee9b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Sep 2016 18:23:43 +1000 Subject: [PATCH] percpu: improve generic percpu modify-return implementatio= ns Some architectures require an additional load to find the address of percpu pointers. In some implementations, the C aliasing rules do not allow the result of that load to be kept over the store that modifies the percpu variable, which causes additional loads. Work around this by finding the pointer first, then operating on that. It's also possible to mark things as restrict and those kind of games, but that can require larger and arch specific changes. 
On powerpc, __this_cpu_inc_return compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 ld 9,48(13) ldx 3,9,3 With this patch it compiles to: ld 10,48(13) ldx 9,3,10 addi 9,9,1 stdx 9,3,10 Signed-off-by: Nicholas Piggin To: Tejun Heo To: Christoph Lameter Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org --- include/asm-generic/percpu.h | 53 +++++++++++++++++++++++++---------------= ---- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 4d9f233..40e8870 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -65,6 +65,11 @@ extern void setup_per_cpu_areas(void); #define PER_CPU_DEF_ATTRIBUTES #endif =20 +#define raw_cpu_generic_read(pcp) \ +({ \ + *raw_cpu_ptr(&(pcp)); \ +}) + #define raw_cpu_generic_to_op(pcp, val, op) \ do { \ *raw_cpu_ptr(&(pcp)) op val; \ @@ -72,34 +77,39 @@ do { \ =20 #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - raw_cpu_add(pcp, val); \ - raw_cpu_read(pcp); \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ + \ + *__p +=3D val; \ + *__p; \ }) =20 #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D *__p; \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ + typeof(&(pcp)) __p =3D raw_cpu_ptr(&(pcp)); \ typeof(pcp) __ret; \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D *__p; \ if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + *__p =3D nval; \ __ret; \ }) =20 #define raw_cpu_generic_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nv= al2) \ ({ \ + typeof(&(pcp1)) __p1 =3D raw_cpu_ptr(&(pcp1)); \ + typeof(&(pcp2)) __p2 =3D raw_cpu_ptr(&(pcp2)); \ int __ret =3D 0; \ - if (raw_cpu_read(pcp1) =3D=3D (oval1) && \ - raw_cpu_read(pcp2) =3D=3D (oval2)) { \ - raw_cpu_write(pcp1, nval1); \ - raw_cpu_write(pcp2, nval2); \ + if (*__p1 =3D=3D 
(oval1) && *__p2 =3D=3D (oval2)) { \ + *__p1 =3D nval1; \ + *__p2 =3D nval2; \ __ret =3D 1; \ } \ (__ret); \ @@ -109,7 +119,7 @@ do { \ ({ \ typeof(pcp) __ret; \ preempt_disable(); \ - __ret =3D *this_cpu_ptr(&(pcp)); \ + __ret =3D raw_cpu_generic_read(pcp); \ preempt_enable(); \ __ret; \ }) @@ -118,17 +128,17 @@ do { \ do { \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - *raw_cpu_ptr(&(pcp)) op val; \ + raw_cpu_generic_to_op(pcp, val, op); \ raw_local_irq_restore(__flags); \ } while (0) =20 + #define this_cpu_generic_add_return(pcp, val) \ ({ \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - raw_cpu_add(pcp, val); \ - __ret =3D raw_cpu_read(pcp); \ + __ret =3D raw_cpu_generic_add_return(pcp, val); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -138,8 +148,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_xchg(pcp, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -149,9 +158,7 @@ do { \ typeof(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ - __ret =3D raw_cpu_read(pcp); \ - if (__ret =3D=3D (oval)) \ - raw_cpu_write(pcp, nval); \ + __ret =3D raw_cpu_generic_cmpxchg(pcp, oval, nval); \ raw_local_irq_restore(__flags); \ __ret; \ }) @@ -168,16 +175,16 @@ do { \ }) =20 #ifndef raw_cpu_read_1 -#define raw_cpu_read_1(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_1(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_2 -#define raw_cpu_read_2(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_2(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_4 -#define raw_cpu_read_4(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_4(pcp) raw_cpu_generic_read(pcp) #endif #ifndef raw_cpu_read_8 -#define raw_cpu_read_8(pcp) (*raw_cpu_ptr(&(pcp))) +#define raw_cpu_read_8(pcp) raw_cpu_generic_read(pcp) #endif =20 #ifndef raw_cpu_write_1 --=20 2.9.3