All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] fbdev/riva:change to use generice function to implement  reverse_order()
@ 2015-08-10 10:12 ` yalin wang
  0 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-10 10:12 UTC (permalink / raw)
  To: adaplas, plagnioj, tomi.valkeinen, linux-fbdev, open list

This change to use swab32(bitrev32()) to implement reverse_order()
function, have better performance on some platforms.

Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
---
 drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
index f1ad274..4803901 100644
--- a/drivers/video/fbdev/riva/fbdev.c
+++ b/drivers/video/fbdev/riva/fbdev.c
@@ -40,6 +40,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/backlight.h>
+#include <linux/swab.h>
 #include <linux/bitrev.h>
 #ifdef CONFIG_PMAC_BACKLIGHT
 #include <asm/machdep.h>
@@ -84,6 +85,7 @@
 #define SetBit(n)		(1<<(n))
 #define Set8Bits(value)		((value)&0xff)
 
+#define reverse_order(v) swab32(bitrev32(v))
 /* HW cursor parameters */
 #define MAX_CURS		32
 
@@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
 	return (VGA_RD08(par->riva.PVIO, 0x3cc));
 }
 
-static inline void reverse_order(u32 *l)
-{
-	u8 *a = (u8 *)l;
-	a[0] = bitrev8(a[0]);
-	a[1] = bitrev8(a[1]);
-	a[2] = bitrev8(a[2]);
-	a[3] = bitrev8(a[3]);
-}
-
 /* ------------------------------------------------------------------------- *
  *
  * cursor stuff
@@ -497,8 +490,8 @@ static void rivafb_load_cursor_image(struct riva_par *par, u8 *data8,
 
 	for (i = 0; i < h; i++) {
 		b = *data++;
-		reverse_order(&b);
-		
+		b = reverse_order(b);
+
 		for (j = 0; j < w/2; j++) {
 			tmp = 0;
 #if defined (__BIG_ENDIAN)
@@ -1545,7 +1538,7 @@ static void rivafb_imageblit(struct fb_info *info,
 		for (i = 0; i < 16; i++) {
 			tmp = *((u32 *)cdat);
 			cdat = (u8 *)((u32 *)cdat + 1);
-			reverse_order(&tmp);
+			tmp = reverse_order(tmp);
 			NV_WR32(d, i*4, tmp);
 		}
 		size -= 16;
@@ -1555,7 +1548,7 @@ static void rivafb_imageblit(struct fb_info *info,
 		for (i = 0; i < size; i++) {
 			tmp = *((u32 *) cdat);
 			cdat = (u8 *)((u32 *)cdat + 1);
-			reverse_order(&tmp);
+			tmp = reverse_order(tmp);
 			NV_WR32(d, i*4, tmp);
 		}
 	}
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [RFC] fbdev/riva:change to use generice function to implement  reverse_order()
@ 2015-08-10 10:12 ` yalin wang
  0 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-10 10:12 UTC (permalink / raw)
  To: adaplas, plagnioj, tomi.valkeinen, linux-fbdev, open list

This change to use swab32(bitrev32()) to implement reverse_order()
function, have better performance on some platforms.

Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
---
 drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
index f1ad274..4803901 100644
--- a/drivers/video/fbdev/riva/fbdev.c
+++ b/drivers/video/fbdev/riva/fbdev.c
@@ -40,6 +40,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/backlight.h>
+#include <linux/swab.h>
 #include <linux/bitrev.h>
 #ifdef CONFIG_PMAC_BACKLIGHT
 #include <asm/machdep.h>
@@ -84,6 +85,7 @@
 #define SetBit(n)		(1<<(n))
 #define Set8Bits(value)		((value)&0xff)
 
+#define reverse_order(v) swab32(bitrev32(v))
 /* HW cursor parameters */
 #define MAX_CURS		32
 
@@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
 	return (VGA_RD08(par->riva.PVIO, 0x3cc));
 }
 
-static inline void reverse_order(u32 *l)
-{
-	u8 *a = (u8 *)l;
-	a[0] = bitrev8(a[0]);
-	a[1] = bitrev8(a[1]);
-	a[2] = bitrev8(a[2]);
-	a[3] = bitrev8(a[3]);
-}
-
 /* ------------------------------------------------------------------------- *
  *
  * cursor stuff
@@ -497,8 +490,8 @@ static void rivafb_load_cursor_image(struct riva_par *par, u8 *data8,
 
 	for (i = 0; i < h; i++) {
 		b = *data++;
-		reverse_order(&b);
-		
+		b = reverse_order(b);
+
 		for (j = 0; j < w/2; j++) {
 			tmp = 0;
 #if defined (__BIG_ENDIAN)
@@ -1545,7 +1538,7 @@ static void rivafb_imageblit(struct fb_info *info,
 		for (i = 0; i < 16; i++) {
 			tmp = *((u32 *)cdat);
 			cdat = (u8 *)((u32 *)cdat + 1);
-			reverse_order(&tmp);
+			tmp = reverse_order(tmp);
 			NV_WR32(d, i*4, tmp);
 		}
 		size -= 16;
@@ -1555,7 +1548,7 @@ static void rivafb_imageblit(struct fb_info *info,
 		for (i = 0; i < size; i++) {
 			tmp = *((u32 *) cdat);
 			cdat = (u8 *)((u32 *)cdat + 1);
-			reverse_order(&tmp);
+			tmp = reverse_order(tmp);
 			NV_WR32(d, i*4, tmp);
 		}
 	}
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-10 10:12 ` yalin wang
@ 2015-08-20 11:02   ` Tomi Valkeinen
  -1 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-20 11:02 UTC (permalink / raw)
  To: yalin wang, adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 1510 bytes --]


On 10/08/15 13:12, yalin wang wrote:
> This change to use swab32(bitrev32()) to implement reverse_order()
> function, have better performance on some platforms.

Which platforms? Presuming you tested this, roughly how much better
performance? If you didn't, how do you know it's faster?

> Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
> ---
>  drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
>  1 file changed, 6 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
> index f1ad274..4803901 100644
> --- a/drivers/video/fbdev/riva/fbdev.c
> +++ b/drivers/video/fbdev/riva/fbdev.c
> @@ -40,6 +40,7 @@
>  #include <linux/init.h>
>  #include <linux/pci.h>
>  #include <linux/backlight.h>
> +#include <linux/swab.h>
>  #include <linux/bitrev.h>
>  #ifdef CONFIG_PMAC_BACKLIGHT
>  #include <asm/machdep.h>
> @@ -84,6 +85,7 @@
>  #define SetBit(n)		(1<<(n))
>  #define Set8Bits(value)		((value)&0xff)
>  
> +#define reverse_order(v) swab32(bitrev32(v))
>  /* HW cursor parameters */
>  #define MAX_CURS		32
>  
> @@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
>  	return (VGA_RD08(par->riva.PVIO, 0x3cc));
>  }
>  
> -static inline void reverse_order(u32 *l)

I would suggest to do the work in the inline function, instead of a
macro. And if you keep the function prototype the same, then the changes
to each reverse_order call site are not needed.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-20 11:02   ` Tomi Valkeinen
  0 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-20 11:02 UTC (permalink / raw)
  To: yalin wang, adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 1510 bytes --]


On 10/08/15 13:12, yalin wang wrote:
> This change to use swab32(bitrev32()) to implement reverse_order()
> function, have better performance on some platforms.

Which platforms? Presuming you tested this, roughly how much better
performance? If you didn't, how do you know it's faster?

> Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
> ---
>  drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
>  1 file changed, 6 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
> index f1ad274..4803901 100644
> --- a/drivers/video/fbdev/riva/fbdev.c
> +++ b/drivers/video/fbdev/riva/fbdev.c
> @@ -40,6 +40,7 @@
>  #include <linux/init.h>
>  #include <linux/pci.h>
>  #include <linux/backlight.h>
> +#include <linux/swab.h>
>  #include <linux/bitrev.h>
>  #ifdef CONFIG_PMAC_BACKLIGHT
>  #include <asm/machdep.h>
> @@ -84,6 +85,7 @@
>  #define SetBit(n)		(1<<(n))
>  #define Set8Bits(value)		((value)&0xff)
>  
> +#define reverse_order(v) swab32(bitrev32(v))
>  /* HW cursor parameters */
>  #define MAX_CURS		32
>  
> @@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
>  	return (VGA_RD08(par->riva.PVIO, 0x3cc));
>  }
>  
> -static inline void reverse_order(u32 *l)

I would suggest to do the work in the inline function, instead of a
macro. And if you keep the function prototype the same, then the changes
to each reverse_order call site are not needed.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-20 11:02   ` Tomi Valkeinen
@ 2015-08-20 11:30     ` yalin wang
  -1 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-20 11:30 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: adaplas, plagnioj, linux-fbdev, open list


> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
> 
> 
> On 10/08/15 13:12, yalin wang wrote:
>> This change to use swab32(bitrev32()) to implement reverse_order()
>> function, have better performance on some platforms.
> 
> Which platforms? Presuming you tested this, roughly how much better
> performance? If you didn't, how do you know it's faster?

i investigate on arm64 platforms:


for (j = dsize; j--;) {
                        tmp = data[k++];
                        tmp = reverse_order(tmp);
                        NVDmaNext(par, tmp);
 bac:   110006a4        add     w4, w21, #0x1
 bb0:   5ac000a3        rbit    w3, w5

        if (dsize) {
                NVDmaStart(info, par, RECT_EXPAND_TWO_COLOR_DATA(0), dsize);

                for (j = dsize; j--;) {
                        tmp = data[k++];
 bb4:   110006d6        add     w22, w22, #0x1
 bb8:   5ac00861        rev     w1, w3
                        tmp = reverse_order(tmp);
                        NVDmaNext(par, tmp);
 bbc:   b9041e64        str     w4, [x19,#1052]
 bc0:   8b3548c2        add     x2, x6, w21, uxtw #2
 bc4:   b9000041        str     w1, [x2]


this is the disassemble code after apply the patch,
only need:
rbit    w3, w5
rev     w1, w3
2 instruction to get the reverse_order() result,
apparently after than the origianl macro code.

> 
>> Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
>> ---
>> drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
>> 1 file changed, 6 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
>> index f1ad274..4803901 100644
>> --- a/drivers/video/fbdev/riva/fbdev.c
>> +++ b/drivers/video/fbdev/riva/fbdev.c
>> @@ -40,6 +40,7 @@
>> #include <linux/init.h>
>> #include <linux/pci.h>
>> #include <linux/backlight.h>
>> +#include <linux/swab.h>
>> #include <linux/bitrev.h>
>> #ifdef CONFIG_PMAC_BACKLIGHT
>> #include <asm/machdep.h>
>> @@ -84,6 +85,7 @@
>> #define SetBit(n)		(1<<(n))
>> #define Set8Bits(value)		((value)&0xff)
>> 
>> +#define reverse_order(v) swab32(bitrev32(v))
>> /* HW cursor parameters */
>> #define MAX_CURS		32
>> 
>> @@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
>> 	return (VGA_RD08(par->riva.PVIO, 0x3cc));
>> }
>> 
>> -static inline void reverse_order(u32 *l)
> 
> I would suggest to do the work in the inline function, instead of a
> macro. And if you keep the function prototype the same, then the changes
> to each reverse_order call site are not needed.
> 

ok, i will change to a inline function. 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-20 11:30     ` yalin wang
  0 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-20 11:30 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: adaplas, plagnioj, linux-fbdev, open list


> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
> 
> 
> On 10/08/15 13:12, yalin wang wrote:
>> This change to use swab32(bitrev32()) to implement reverse_order()
>> function, have better performance on some platforms.
> 
> Which platforms? Presuming you tested this, roughly how much better
> performance? If you didn't, how do you know it's faster?

i investigate on arm64 platforms:


for (j = dsize; j--;) {
                        tmp = data[k++];
                        tmp = reverse_order(tmp);
                        NVDmaNext(par, tmp);
 bac:   110006a4        add     w4, w21, #0x1
 bb0:   5ac000a3        rbit    w3, w5

        if (dsize) {
                NVDmaStart(info, par, RECT_EXPAND_TWO_COLOR_DATA(0), dsize);

                for (j = dsize; j--;) {
                        tmp = data[k++];
 bb4:   110006d6        add     w22, w22, #0x1
 bb8:   5ac00861        rev     w1, w3
                        tmp = reverse_order(tmp);
                        NVDmaNext(par, tmp);
 bbc:   b9041e64        str     w4, [x19,#1052]
 bc0:   8b3548c2        add     x2, x6, w21, uxtw #2
 bc4:   b9000041        str     w1, [x2]


this is the disassemble code after apply the patch,
only need:
rbit    w3, w5
rev     w1, w3
2 instruction to get the reverse_order() result,
apparently after than the origianl macro code.

> 
>> Signed-off-by: yalin wang <yalin.wang2010@gmail.com>
>> ---
>> drivers/video/fbdev/riva/fbdev.c | 19 ++++++-------------
>> 1 file changed, 6 insertions(+), 13 deletions(-)
>> 
>> diff --git a/drivers/video/fbdev/riva/fbdev.c b/drivers/video/fbdev/riva/fbdev.c
>> index f1ad274..4803901 100644
>> --- a/drivers/video/fbdev/riva/fbdev.c
>> +++ b/drivers/video/fbdev/riva/fbdev.c
>> @@ -40,6 +40,7 @@
>> #include <linux/init.h>
>> #include <linux/pci.h>
>> #include <linux/backlight.h>
>> +#include <linux/swab.h>
>> #include <linux/bitrev.h>
>> #ifdef CONFIG_PMAC_BACKLIGHT
>> #include <asm/machdep.h>
>> @@ -84,6 +85,7 @@
>> #define SetBit(n)		(1<<(n))
>> #define Set8Bits(value)		((value)&0xff)
>> 
>> +#define reverse_order(v) swab32(bitrev32(v))
>> /* HW cursor parameters */
>> #define MAX_CURS		32
>> 
>> @@ -451,15 +453,6 @@ static inline unsigned char MISCin(struct riva_par *par)
>> 	return (VGA_RD08(par->riva.PVIO, 0x3cc));
>> }
>> 
>> -static inline void reverse_order(u32 *l)
> 
> I would suggest to do the work in the inline function, instead of a
> macro. And if you keep the function prototype the same, then the changes
> to each reverse_order call site are not needed.
> 

ok, i will change to a inline function. 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-20 11:30     ` yalin wang
@ 2015-08-21  6:41       ` Tomi Valkeinen
  -1 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-21  6:41 UTC (permalink / raw)
  To: yalin wang; +Cc: adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 745 bytes --]



On 20/08/15 14:30, yalin wang wrote:
> 
>> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
>>
>>
>> On 10/08/15 13:12, yalin wang wrote:
>>> This change to use swab32(bitrev32()) to implement reverse_order()
>>> function, have better performance on some platforms.
>>
>> Which platforms? Presuming you tested this, roughly how much better
>> performance? If you didn't, how do you know it's faster?
> 
> i investigate on arm64 platforms:

Ok. So is any arm64 platform actually using these devices? If these
devices are mostly used by 32bit x86 platforms, optimizing them for
arm64 doesn't make any sense.

Possibly the patches are still good for x86 also, but that needs to be
proven.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-21  6:41       ` Tomi Valkeinen
  0 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-21  6:41 UTC (permalink / raw)
  To: yalin wang; +Cc: adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 745 bytes --]



On 20/08/15 14:30, yalin wang wrote:
> 
>> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
>>
>>
>> On 10/08/15 13:12, yalin wang wrote:
>>> This change to use swab32(bitrev32()) to implement reverse_order()
>>> function, have better performance on some platforms.
>>
>> Which platforms? Presuming you tested this, roughly how much better
>> performance? If you didn't, how do you know it's faster?
> 
> i investigate on arm64 platforms:

Ok. So is any arm64 platform actually using these devices? If these
devices are mostly used by 32bit x86 platforms, optimizing them for
arm64 doesn't make any sense.

Possibly the patches are still good for x86 also, but that needs to be
proven.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-21  6:41       ` Tomi Valkeinen
@ 2015-08-21  7:46         ` yalin wang
  -1 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-21  7:46 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: adaplas, plagnioj, linux-fbdev, open list


> On Aug 21, 2015, at 14:41, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
> 
> 
> 
> On 20/08/15 14:30, yalin wang wrote:
>> 
>>> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
>>> 
>>> 
>>> On 10/08/15 13:12, yalin wang wrote:
>>>> This change to use swab32(bitrev32()) to implement reverse_order()
>>>> function, have better performance on some platforms.
>>> 
>>> Which platforms? Presuming you tested this, roughly how much better
>>> performance? If you didn't, how do you know it's faster?
>> 
>> i investigate on arm64 platforms:
> 
> Ok. So is any arm64 platform actually using these devices? If these
> devices are mostly used by 32bit x86 platforms, optimizing them for
> arm64 doesn't make any sense.
> 
> Possibly the patches are still good for x86 also, but that needs to be
> proven.
> 
not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
i compile by test :

use the patch:
  use swab32(bitrev32()):
  2775:       0f b6 d0                movzbl %al,%edx                                                                                                                                                    
  2778:       0f b6 c4                movzbl %ah,%eax
  277b:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
  2782:       0f b6 80 00 00 00 00    movzbl 0x0(%rax),%eax
  2789:       c1 e2 08                shl    $0x8,%edx
  278c:       09 d0                   or     %edx,%eax
  278e:       0f b6 d5                movzbl %ch,%edx
  2791:       0f b6 c9                movzbl %cl,%ecx
  2794:       0f b6 89 00 00 00 00    movzbl 0x0(%rcx),%ecx
  279b:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
  27a2:       0f b7 c0                movzwl %ax,%eax
  27a5:       c1 e1 08                shl    $0x8,%ecx
  27a8:       09 ca                   or     %ecx,%edx
  27aa:       c1 e2 10                shl    $0x10,%edx
  27ad:       09 d0                   or     %edx,%eax
  27af:       45 85 ff                test   %r15d,%r15d
  27b2:       0f c8                   bswap  %eax
4 memory access instructions,



without the patch:
use
do {                            \
-       u8 *a = (u8 *)(l);      \
-       a[0] = bitrev8(a[0]);   \
-       a[1] = bitrev8(a[1]);   \
-       a[2] = bitrev8(a[2]);   \
-       a[3] = bitrev8(a[3]);   \
-} while(0)



    277b:       45 0f b6 80 00 00 00    movzbl 0x0(%r8),%r8d
    2782:       00 
    2783:       c1 ee 10                shr    $0x10,%esi
    2786:       89 f2                   mov    %esi,%edx
    2788:       0f b6 f4                movzbl %ah,%esi
    278b:       c1 e8 18                shr    $0x18,%eax
    278e:       0f b6 d2                movzbl %dl,%edx
    2791:       48 98                   cltq   
    2793:       45 85 ed                test   %r13d,%r13d
    2796:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
    279d:       0f b6 80 00 00 00 00    movzbl 0x0(%rax),%eax
    27a4:       44 88 85 54 ff ff ff    mov    %r8b,-0xac(%rbp)
    27ab:       44 0f b6 86 00 00 00    movzbl 0x0(%rsi),%r8d
    27b2:       00 
    27b3:       88 95 56 ff ff ff       mov    %dl,-0xaa(%rbp)
    27b9:       88 85 57 ff ff ff       mov    %al,-0xa9(%rbp)
    27bf:       44 88 85 55 ff ff ff    mov    %r8b,-0xab(%rbp)

6 memory access instructions, and generate more code that the patch .

because the original code use byte access 4 times , i don’t
think have better performance. :)

Thanks







^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-21  7:46         ` yalin wang
  0 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-21  7:46 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: adaplas, plagnioj, linux-fbdev, open list


> On Aug 21, 2015, at 14:41, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
> 
> 
> 
> On 20/08/15 14:30, yalin wang wrote:
>> 
>>> On Aug 20, 2015, at 19:02, Tomi Valkeinen <tomi.valkeinen@ti.com> wrote:
>>> 
>>> 
>>> On 10/08/15 13:12, yalin wang wrote:
>>>> This change to use swab32(bitrev32()) to implement reverse_order()
>>>> function, have better performance on some platforms.
>>> 
>>> Which platforms? Presuming you tested this, roughly how much better
>>> performance? If you didn't, how do you know it's faster?
>> 
>> i investigate on arm64 platforms:
> 
> Ok. So is any arm64 platform actually using these devices? If these
> devices are mostly used by 32bit x86 platforms, optimizing them for
> arm64 doesn't make any sense.
> 
> Possibly the patches are still good for x86 also, but that needs to be
> proven.
> 
not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
i compile by test :

use the patch:
  use swab32(bitrev32()):
  2775:       0f b6 d0                movzbl %al,%edx                                                                                                                                                    
  2778:       0f b6 c4                movzbl %ah,%eax
  277b:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
  2782:       0f b6 80 00 00 00 00    movzbl 0x0(%rax),%eax
  2789:       c1 e2 08                shl    $0x8,%edx
  278c:       09 d0                   or     %edx,%eax
  278e:       0f b6 d5                movzbl %ch,%edx
  2791:       0f b6 c9                movzbl %cl,%ecx
  2794:       0f b6 89 00 00 00 00    movzbl 0x0(%rcx),%ecx
  279b:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
  27a2:       0f b7 c0                movzwl %ax,%eax
  27a5:       c1 e1 08                shl    $0x8,%ecx
  27a8:       09 ca                   or     %ecx,%edx
  27aa:       c1 e2 10                shl    $0x10,%edx
  27ad:       09 d0                   or     %edx,%eax
  27af:       45 85 ff                test   %r15d,%r15d
  27b2:       0f c8                   bswap  %eax
4 memory access instructions,



without the patch:
use
do {                            \
-       u8 *a = (u8 *)(l);      \
-       a[0] = bitrev8(a[0]);   \
-       a[1] = bitrev8(a[1]);   \
-       a[2] = bitrev8(a[2]);   \
-       a[3] = bitrev8(a[3]);   \
-} while(0)



    277b:       45 0f b6 80 00 00 00    movzbl 0x0(%r8),%r8d
    2782:       00 
    2783:       c1 ee 10                shr    $0x10,%esi
    2786:       89 f2                   mov    %esi,%edx
    2788:       0f b6 f4                movzbl %ah,%esi
    278b:       c1 e8 18                shr    $0x18,%eax
    278e:       0f b6 d2                movzbl %dl,%edx
    2791:       48 98                   cltq   
    2793:       45 85 ed                test   %r13d,%r13d
    2796:       0f b6 92 00 00 00 00    movzbl 0x0(%rdx),%edx
    279d:       0f b6 80 00 00 00 00    movzbl 0x0(%rax),%eax
    27a4:       44 88 85 54 ff ff ff    mov    %r8b,-0xac(%rbp)
    27ab:       44 0f b6 86 00 00 00    movzbl 0x0(%rsi),%r8d
    27b2:       00 
    27b3:       88 95 56 ff ff ff       mov    %dl,-0xaa(%rbp)
    27b9:       88 85 57 ff ff ff       mov    %al,-0xa9(%rbp)
    27bf:       44 88 85 55 ff ff ff    mov    %r8b,-0xab(%rbp)

6 memory access instructions, and generate more code that the patch .

because the original code use byte access 4 times , i don’t
think have better performance. :)

Thanks







^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-21  7:46         ` yalin wang
@ 2015-08-21  8:01           ` Tomi Valkeinen
  -1 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-21  8:01 UTC (permalink / raw)
  To: yalin wang; +Cc: adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 726 bytes --]



On 21/08/15 10:46, yalin wang wrote:

>>> i investigate on arm64 platforms:
>>
>> Ok. So is any arm64 platform actually using these devices? If these
>> devices are mostly used by 32bit x86 platforms, optimizing them for
>> arm64 doesn't make any sense.
>>
>> Possibly the patches are still good for x86 also, but that needs to be
>> proven.
>>
> not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
> i compile by test :

For old drivers i386 may be more relevant than x86_64.

So you don't have the actual HW?

These kind of optimizations should have some real world measurements,
not just compiling, looking at the assembly and guessing whether it's
faster or not.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-21  8:01           ` Tomi Valkeinen
  0 siblings, 0 replies; 18+ messages in thread
From: Tomi Valkeinen @ 2015-08-21  8:01 UTC (permalink / raw)
  To: yalin wang; +Cc: adaplas, plagnioj, linux-fbdev, open list

[-- Attachment #1: Type: text/plain, Size: 726 bytes --]



On 21/08/15 10:46, yalin wang wrote:

>>> i investigate on arm64 platforms:
>>
>> Ok. So is any arm64 platform actually using these devices? If these
>> devices are mostly used by 32bit x86 platforms, optimizing them for
>> arm64 doesn't make any sense.
>>
>> Possibly the patches are still good for x86 also, but that needs to be
>> proven.
>>
> not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
> i compile by test :

For old drivers i386 may be more relevant than x86_64.

So you don't have the actual HW?

These kind of optimizations should have some real world measurements,
not just compiling, looking at the assembly and guessing whether it's
faster or not.

 Tomi


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-21  8:01           ` Tomi Valkeinen
@ 2015-08-22  7:53             ` Afzal Mohammed
  -1 siblings, 0 replies; 18+ messages in thread
From: Afzal Mohammed @ 2015-08-22  7:53 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: yalin wang, adaplas, plagnioj, linux-fbdev, open list

Hi,

On Fri, Aug 21, 2015 at 11:01:41AM +0300, Tomi Valkeinen wrote:

> >> Possibly the patches are still good for x86 also, but that needs to be
> >> proven.
> >>
> > not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
> > i compile by test :
> 
> For old drivers i386 may be more relevant than x86_64.

It seems asm bit reversal is supported in Kernel on arm & arm64 only,
not sure whether any other arch even provide asm bit reversal
instruction.

> These kind of optimizations should have some real world measurements,

Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
reversal as compared to doing it in C was taking 1 cycle as opposed to
~225 cycles!, of course writing optimized C could have made it fare
better, but still would reach no-way near asm bit reversal.

Regards
Afzal

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-22  7:53             ` Afzal Mohammed
  0 siblings, 0 replies; 18+ messages in thread
From: Afzal Mohammed @ 2015-08-22  7:53 UTC (permalink / raw)
  To: Tomi Valkeinen; +Cc: yalin wang, adaplas, plagnioj, linux-fbdev, open list

Hi,

On Fri, Aug 21, 2015 at 11:01:41AM +0300, Tomi Valkeinen wrote:

> >> Possibly the patches are still good for x86 also, but that needs to be
> >> proven.
> >>
> > not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
> > i compile by test :
> 
> For old drivers i386 may be more relevant than x86_64.

It seems asm bit reversal is supported in Kernel on arm & arm64 only,
not sure whether any other arch even provide asm bit reversal
instruction.

> These kind of optimizations should have some real world measurements,

Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
reversal as compared to doing it in C was taking 1 cycle as opposed to
~225 cycles!, of course writing optimized C could have made it fare
better, but still would reach no-way near asm bit reversal.

Regards
Afzal

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-22  7:53             ` Afzal Mohammed
@ 2015-08-24  8:31               ` yalin wang
  -1 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-24  8:31 UTC (permalink / raw)
  To: Afzal Mohammed; +Cc: Tomi Valkeinen, adaplas, plagnioj, linux-fbdev, open list


> On Aug 22, 2015, at 15:53, Afzal Mohammed <afzal.mohd.ma@gmail.com> wrote:
> 
> Hi,
> 
> On Fri, Aug 21, 2015 at 11:01:41AM +0300, Tomi Valkeinen wrote:
> 
>>>> Possibly the patches are still good for x86 also, but that needs to be
>>>> proven.
>>>> 
>>> not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
>>> i compile by test :
>> 
>> For old drivers i386 may be more relevant than x86_64.
> 
> It seems asm bit reversal is supported in Kernel on arm & arm64 only,
> not sure whether any other arch even provide asm bit reversal
> instruction.

i only submit the bit reverse patch for arm / arm64 arch,
i am not sure if there are some other arch also have hard ware bit reverse 
instructions, need arch maintainers to submit if their arch also have these hard
ware instructions . :)

> 
>> These kind of optimizations should have some real world measurements,
> 
> Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
> reversal as compared to doing it in C was taking 1 cycle as opposed to
> ~225 cycles!, of course writing optimized C could have made it fare
> better, but still would reach no-way near asm bit reversal.
> 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-24  8:31               ` yalin wang
  0 siblings, 0 replies; 18+ messages in thread
From: yalin wang @ 2015-08-24  8:31 UTC (permalink / raw)
  To: Afzal Mohammed; +Cc: Tomi Valkeinen, adaplas, plagnioj, linux-fbdev, open list


> On Aug 22, 2015, at 15:53, Afzal Mohammed <afzal.mohd.ma@gmail.com> wrote:
> 
> Hi,
> 
> On Fri, Aug 21, 2015 at 11:01:41AM +0300, Tomi Valkeinen wrote:
> 
>>>> Possibly the patches are still good for x86 also, but that needs to be
>>>> proven.
>>>> 
>>> not exactly, because x86_64 don’t have hardware instruction to do rbit OP,
>>> i compile by test :
>> 
>> For old drivers i386 may be more relevant than x86_64.
> 
> It seems asm bit reversal is supported in Kernel on arm & arm64 only,
> not sure whether any other arch even provide asm bit reversal
> instruction.

i only submit the bit reverse patch for arm / arm64 arch,
i am not sure if there are some other arch also have hard ware bit reverse 
instructions, need arch maintainers to submit if their arch also have these hard
ware instructions . :)

> 
>> These kind of optimizations should have some real world measurements,
> 
> Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
> reversal as compared to doing it in C was taking 1 cycle as opposed to
> ~225 cycles!, of course writing optimized C could have made it fare
> better, but still would reach no-way near asm bit reversal.
> 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
  2015-08-24  8:31               ` yalin wang
@ 2015-08-24 13:53                 ` Afzal Mohammed
  -1 siblings, 0 replies; 18+ messages in thread
From: Afzal Mohammed @ 2015-08-24 13:41 UTC (permalink / raw)
  To: yalin wang; +Cc: Tomi Valkeinen, adaplas, plagnioj, linux-fbdev, open list

Hi,

On Mon, Aug 24, 2015 at 04:31:13PM +0800, yalin wang wrote:
> 
> i only submit the bit reverse patch for arm / arm64 arch,

yes, saw later git blaming it on you :)

> > Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
> > reversal as compared to doing it in C was taking 1 cycle as opposed to
> > ~225 cycles!, of course writing optimized C could have made it fare
> > better, but still would reach no-way near asm bit reversal.

The above measurement was done not in Linux, rather on a baremetal
code, but seeing the efficient Kernel C implementation, realized that
the gain would not be that much, it would be good to know if there are
measurements for Kernel bitreversal in C & asm (on supported arch)

Regards
afzal

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] fbdev/riva:change to use generice function to implement reverse_order()
@ 2015-08-24 13:53                 ` Afzal Mohammed
  0 siblings, 0 replies; 18+ messages in thread
From: Afzal Mohammed @ 2015-08-24 13:53 UTC (permalink / raw)
  To: yalin wang; +Cc: Tomi Valkeinen, adaplas, plagnioj, linux-fbdev, open list

Hi,

On Mon, Aug 24, 2015 at 04:31:13PM +0800, yalin wang wrote:
> 
> i only submit the bit reverse patch for arm / arm64 arch,

yes, saw later git blaming it on you :)

> > Not for this case, but once measured on ARM, iirc, a 32-bit asm bit
> > reversal as compared to doing it in C was taking 1 cycle as opposed to
> > ~225 cycles!, of course writing optimized C could have made it fare
> > better, but still would reach no-way near asm bit reversal.

The above measurement was done not in Linux, rather on a baremetal
code, but seeing the efficient Kernel C implementation, realized that
the gain would not be that much, it would be good to know if there are
measurements for Kernel bitreversal in C & asm (on supported arch)

Regards
afzal

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-08-24 13:53 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-10 10:12 [RFC] fbdev/riva:change to use generice function to implement reverse_order() yalin wang
2015-08-10 10:12 ` yalin wang
2015-08-20 11:02 ` Tomi Valkeinen
2015-08-20 11:02   ` Tomi Valkeinen
2015-08-20 11:30   ` yalin wang
2015-08-20 11:30     ` yalin wang
2015-08-21  6:41     ` Tomi Valkeinen
2015-08-21  6:41       ` Tomi Valkeinen
2015-08-21  7:46       ` yalin wang
2015-08-21  7:46         ` yalin wang
2015-08-21  8:01         ` Tomi Valkeinen
2015-08-21  8:01           ` Tomi Valkeinen
2015-08-22  7:53           ` Afzal Mohammed
2015-08-22  7:53             ` Afzal Mohammed
2015-08-24  8:31             ` yalin wang
2015-08-24  8:31               ` yalin wang
2015-08-24 13:41               ` Afzal Mohammed
2015-08-24 13:53                 ` Afzal Mohammed

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.