All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xf86-video-ati: Fix build for xserver < 1.13
@ 2016-11-24 18:00 Jochen Rollwagen
       [not found] ` <58372ADA.8040009-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Jochen Rollwagen @ 2016-11-24 18:00 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

[-- Attachment #1: Type: text/plain, Size: 832 bytes --]

Subject: [PATCH] fix build for xserver < 1.13

same procedure every few patches.....
---
  src/radeon_kms.c |    6 +++++-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/radeon_kms.c b/src/radeon_kms.c
index a5943d1..ec099c9 100644
--- a/src/radeon_kms.c
+++ b/src/radeon_kms.c
@@ -360,7 +360,11 @@ static Bool 
RADEONCreateScreenResources_KMS(ScreenPtr pScre
en)
      radeon_glamor_create_screen_resources(pScreen);

      info->callback_event_type = -1;
-    if (!pScreen->isGPU && (damage_ext = CheckExtension("DAMAGE"))) {
+    if (
+#ifdef RADEON_PIXMAP_SHARING
+        !pScreen->isGPU &&
+#endif
+        (damage_ext = CheckExtension("DAMAGE"))) {
      info->callback_event_type = damage_ext->eventBase + XDamageNotify;

      if (!AddCallback(&FlushCallback, radeon_flush_callback, pScrn))
-- 
1.7.9.5


[-- Attachment #2: 0001-fix-build-for-xserver-1.13.patch --]
[-- Type: text/x-patch, Size: 1000 bytes --]

>From d3bf43ba5ca62dc2409b58cb4f79efe387bf891f Mon Sep 17 00:00:00 2001
From: Jochen Rollwagen <joro-2013-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
Date: Thu, 24 Nov 2016 18:44:01 +0100
Subject: [PATCH] fix build for xserver < 1.13

same procedure every few patches.....
---
 src/radeon_kms.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/radeon_kms.c b/src/radeon_kms.c
index a5943d1..ec099c9 100644
--- a/src/radeon_kms.c
+++ b/src/radeon_kms.c
@@ -360,7 +360,11 @@ static Bool RADEONCreateScreenResources_KMS(ScreenPtr pScreen)
 	radeon_glamor_create_screen_resources(pScreen);
 
     info->callback_event_type = -1;
-    if (!pScreen->isGPU && (damage_ext = CheckExtension("DAMAGE"))) {
+    if (
+#ifdef RADEON_PIXMAP_SHARING
+	    !pScreen->isGPU &&
+#endif    
+    	(damage_ext = CheckExtension("DAMAGE"))) {
 	info->callback_event_type = damage_ext->eventBase + XDamageNotify;
 
 	if (!AddCallback(&FlushCallback, radeon_flush_callback, pScrn))
-- 
1.7.9.5


[-- Attachment #3: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] xf86-video-ati: Fix build for xserver < 1.13
       [not found] ` <58372ADA.8040009-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
@ 2016-11-25  9:14   ` Michel Dänzer
       [not found]     ` <dd911a26-9976-6570-f5ac-5efe67e27928-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Michel Dänzer @ 2016-11-25  9:14 UTC (permalink / raw)
  To: Jochen Rollwagen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 25/11/16 03:00 AM, Jochen Rollwagen wrote:
> Subject: [PATCH] fix build for xserver < 1.13
> 
> same procedure every few patches.....

Sorry about that. I pushed your patch (with the indentation fixed up),
thanks.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH xf86-video-ati] Use finer-grained pointer types in mem copying functions
       [not found]     ` <dd911a26-9976-6570-f5ac-5efe67e27928-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2016-11-25 10:00       ` Jochen Rollwagen
       [not found]         ` <58380BA2.3060006-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Jochen Rollwagen @ 2016-11-25 10:00 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

This commit modifies some pointer definitions in functions copying 
memory corresponding to those in memcpy.
That should enable a compiler to produce better code (though i haven't 
checked whether that's the case).
---
  src/radeon.h       |    2 +-
  src/radeon_accel.c |   10 +++++-----
  src/radeon_dri2.c  |   14 +++++++-------
  3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/radeon.h b/src/radeon.h
index ad7e69c..cbc7866 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -599,7 +599,7 @@ typedef struct {
  /* radeon_accel.c */
  extern Bool RADEONAccelInit(ScreenPtr pScreen);
  extern void RADEONEngineInit(ScrnInfoPtr pScrn);
-extern void  RADEONCopySwap(uint8_t *dst, uint8_t *src, unsigned int 
size, int swap);
+extern void  RADEONCopySwap(uint8_t * __restrict dst, const uint8_t * 
__restrict src, unsigned int size, int swap);
  extern void RADEONInit3DEngine(ScrnInfoPtr pScrn);
  extern int radeon_cs_space_remaining(ScrnInfoPtr pScrn);

diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index af2fc99..8c748f2 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -128,13 +128,13 @@ int radeon_cs_space_remaining(ScrnInfoPtr pScrn)
      return (info->cs->ndw - info->cs->cdw);
  }

-void RADEONCopySwap(uint8_t *dst, uint8_t *src, unsigned int size, int 
swap)
+void RADEONCopySwap(uint8_t * __restrict dst, const uint8_t * 
__restrict src, unsigned int size, int swap)
  {
      switch(swap) {
      case RADEON_HOST_DATA_SWAP_32BIT:
          {
-           unsigned int *d = (unsigned int *)dst;
-           unsigned int *s = (unsigned int *)src;
+           unsigned int * __restrict d = (unsigned int *)dst;
+           unsigned int * __restrict s = (unsigned int *)src;
             unsigned int nwords = size >> 2;

             for (; nwords > 0; --nwords, ++d, ++s)
@@ -148,8 +148,8 @@ void RADEONCopySwap(uint8_t *dst, uint8_t *src, 
unsigned int size, int swap)
          }
      case RADEON_HOST_DATA_SWAP_16BIT:
          {
-           unsigned short *d = (unsigned short *)dst;
-           unsigned short *s = (unsigned short *)src;
+           unsigned short * __restrict d = (unsigned short *)dst;
+           unsigned short * __restrict s = (unsigned short *)src;
             unsigned int nwords = size >> 1;

             for (; nwords > 0; --nwords, ++d, ++s)
diff --git a/src/radeon_dri2.c b/src/radeon_dri2.c
index 860ff29..0ff42e0 100644
--- a/src/radeon_dri2.c
+++ b/src/radeon_dri2.c
@@ -339,14 +339,14 @@ static void
  radeon_dri2_copy_region2(ScreenPtr pScreen,
                          DrawablePtr drawable,
                          RegionPtr region,
-                        BufferPtr dest_buffer,
-                        BufferPtr src_buffer)
+                        BufferPtr __restrict dest_buffer,
+                        const BufferPtr __restrict src_buffer)
  {
-    struct dri2_buffer_priv *src_private = src_buffer->driverPrivate;
-    struct dri2_buffer_priv *dst_private = dest_buffer->driverPrivate;
+    struct dri2_buffer_priv * __restrict src_private = 
src_buffer->driverPrivate;
+    struct dri2_buffer_priv * __restrict dst_private = 
dest_buffer->driverPrivate;
      ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
-    DrawablePtr src_drawable;
-    DrawablePtr dst_drawable;
+    DrawablePtr __restrict src_drawable;
+    DrawablePtr __restrict dst_drawable;
      RegionPtr copy_clip;
      GCPtr gc;
      RADEONInfoPtr info = RADEONPTR(pScrn);
@@ -435,7 +435,7 @@ radeon_dri2_copy_region2(ScreenPtr pScreen,

  void
  radeon_dri2_copy_region(DrawablePtr pDraw, RegionPtr pRegion,
-                        DRI2BufferPtr pDstBuffer, DRI2BufferPtr pSrcBuffer)
+                        DRI2BufferPtr __restrict pDstBuffer, 
DRI2BufferPtr __restrict pSrcBuffer)
  {
      return radeon_dri2_copy_region2(pDraw->pScreen, pDraw, pRegion,
                                      pDstBuffer, pSrcBuffer);
-- 
1.7.9.5

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH xf86-video-ati] Use finer-grained pointer types in mem copying functions
       [not found]         ` <58380BA2.3060006-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
@ 2016-11-28  9:43           ` Michel Dänzer
       [not found]             ` <f3c04cd0-06b0-85c4-48d0-1bfbdd98dfdf-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Michel Dänzer @ 2016-11-28  9:43 UTC (permalink / raw)
  To: Jochen Rollwagen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 25/11/16 07:00 PM, Jochen Rollwagen wrote:
> This commit modifies some pointer definitions in functions copying
> memory corresponding to those in memcpy.
> That should enable a compiler to produce better code (though i haven't
> checked whether that's the case).

Please always check that patches actually have an effect before
submitting them. :)

With gcc, the stripped radeon_accel.o and radeon_dri2.o object files are
identical for me with and without this patch. With clang, the patch does
seem to make the code generated for RADEONCopySwap slightly smaller, but
I'm afraid I'm wary of applying this kind of micro-optimization without
justification such as benchmark numbers showing significant gains.


> @@ -128,13 +128,13 @@ int radeon_cs_space_remaining(ScrnInfoPtr pScrn)
>      return (info->cs->ndw - info->cs->cdw);
>  }
> 
> -void RADEONCopySwap(uint8_t *dst, uint8_t *src, unsigned int size, int
> swap)
> +void RADEONCopySwap(uint8_t * __restrict dst, const uint8_t *
> __restrict src, unsigned int size, int swap)
>  {
>      switch(swap) {
>      case RADEON_HOST_DATA_SWAP_32BIT:
>          {
> -           unsigned int *d = (unsigned int *)dst;
> -           unsigned int *s = (unsigned int *)src;
> +           unsigned int * __restrict d = (unsigned int *)dst;
> +           unsigned int * __restrict s = (unsigned int *)src;
>             unsigned int nwords = size >> 2;
> 
>             for (; nwords > 0; --nwords, ++d, ++s)

BTW, this hunk wouldn't apply to current master, looks like it's on top
of your previous patch removing the RADEON_HOST_DATA_SWAP_HDW case.
Please always either make sure patches you submit apply to current
master, or explicitly state dependencies on other patches.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h
       [not found]             ` <f3c04cd0-06b0-85c4-48d0-1bfbdd98dfdf-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2016-11-28 18:18               ` Jochen Rollwagen
       [not found]                 ` <583C74DA.2070701-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Jochen Rollwagen @ 2016-11-28 18:18 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

[-- Attachment #1: Type: text/plain, Size: 944 bytes --]

This commit replaces the loop for calculating log base 2 for 
non-x86-platforms in radeon.h with a clz (count leading zeroes)-based 
version to simplify the code and, well, eliminate the loop.
Note: There’s no check for val=0 case, since x86-bsr is undefined for 
that case too, that should be okay.
---
  src/radeon.h |    7 +++----
  1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/radeon.h b/src/radeon.h
index cbc7866..b1a1ce0 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -933,17 +933,16 @@ enum {
  static __inline__ int
  RADEONLog2(int val)
  {
-    int bits;
  #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
+    int bits;
+
      __asm volatile("bsrl    %1, %0"
          : "=r" (bits)
          : "c" (val)
      );
      return bits;
  #else
-    for (bits = 0; val != 0; val >>= 1, ++bits)
-        ;
-    return bits - 1;
+    return (31 - __builtin_clz(val));
  #endif
  }

-- 
1.7.9.5


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Replace-loop-with-clz-to-calculate-log-base-2-on-non.patch --]
[-- Type: text/x-patch; name="0001-Replace-loop-with-clz-to-calculate-log-base-2-on-non.patch", Size: 1257 bytes --]

>From d39f7a0d42fcc423030cd73bff8afe6d39d13e80 Mon Sep 17 00:00:00 2001
From: Jochen Rollwagen <joro-2013-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
Date: Mon, 28 Nov 2016 19:02:13 +0100
Subject: [PATCH] Replace loop with clz to calculate log base 2 on non-x86
 platforms in radeon.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit replaces the loop for calculating log base 2 for non-x86-platforms in radeon.h with a clz (count leading zeroes)-based version to simplify the code and, well, eliminate the loop.
Note: There’s no check for val=0 case, since x86-bsr is undefined for that case too, that should be okay.
---
 src/radeon.h |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/radeon.h b/src/radeon.h
index cbc7866..b1a1ce0 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -933,17 +933,16 @@ enum {
 static __inline__ int
 RADEONLog2(int val)
 {
-	int bits;
 #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
+	int bits;
+	
 	__asm volatile("bsrl	%1, %0"
 		: "=r" (bits)
 		: "c" (val)
 	);
 	return bits;
 #else
-	for (bits = 0; val != 0; val >>= 1, ++bits)
-		;
-	return bits - 1;
+	return (31 - __builtin_clz(val));
 #endif
 }
 
-- 
1.7.9.5


[-- Attachment #3: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h
       [not found]                 ` <583C74DA.2070701-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
@ 2016-11-29  7:32                   ` Michel Dänzer
       [not found]                     ` <e3556fa8-39eb-6b1d-9d71-316c23bff9ea-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Michel Dänzer @ 2016-11-29  7:32 UTC (permalink / raw)
  To: Jochen Rollwagen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 29/11/16 03:18 AM, Jochen Rollwagen wrote:
> This commit replaces the loop for calculating log base 2 for
> non-x86-platforms in radeon.h with a clz (count leading zeroes)-based
> version to simplify the code and, well, eliminate the loop.
> Note: There’s no check for val=0 case, since x86-bsr is undefined for
> that case too, that should be okay.
> ---
>  src/radeon.h |    7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/src/radeon.h b/src/radeon.h
> index cbc7866..b1a1ce0 100644
> --- a/src/radeon.h
> +++ b/src/radeon.h
> @@ -933,17 +933,16 @@ enum {
>  static __inline__ int
>  RADEONLog2(int val)
>  {
> -    int bits;
>  #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
> +    int bits;
> +
>      __asm volatile("bsrl    %1, %0"
>          : "=r" (bits)
>          : "c" (val)
>      );
>      return bits;
>  #else
> -    for (bits = 0; val != 0; val >>= 1, ++bits)
> -        ;
> -    return bits - 1;
> +    return (31 - __builtin_clz(val));
>  #endif
>  }

Any reason for not using __builtin_clz on x86 as well? AFAICT both gcc
and clang seem to generate more or less the same code with that as with
the inline assembly.


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h
       [not found]                     ` <e3556fa8-39eb-6b1d-9d71-316c23bff9ea-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2016-11-30 17:52                       ` Jochen Rollwagen
       [not found]                         ` <583F11F6.7000005-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Jochen Rollwagen @ 2016-11-30 17:52 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 1514 bytes --]

Am 29.11.2016 um 08:32 schrieb Michel Dänzer:
> On 29/11/16 03:18 AM, Jochen Rollwagen wrote:
>> This commit replaces the loop for calculating log base 2 for
>> non-x86-platforms in radeon.h with a clz (count leading zeroes)-based
>> version to simplify the code and, well, eliminate the loop.
>> Note: There’s no check for val=0 case, since x86-bsr is undefined for
>> that case too, that should be okay.
>> ---
>>   src/radeon.h |    7 +++----
>>   1 file changed, 3 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/radeon.h b/src/radeon.h
>> index cbc7866..b1a1ce0 100644
>> --- a/src/radeon.h
>> +++ b/src/radeon.h
>> @@ -933,17 +933,16 @@ enum {
>>   static __inline__ int
>>   RADEONLog2(int val)
>>   {
>> -    int bits;
>>   #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
>> +    int bits;
>> +
>>       __asm volatile("bsrl    %1, %0"
>>           : "=r" (bits)
>>           : "c" (val)
>>       );
>>       return bits;
>>   #else
>> -    for (bits = 0; val != 0; val >>= 1, ++bits)
>> -        ;
>> -    return bits - 1;
>> +    return (31 - __builtin_clz(val));
>>   #endif
>>   }
> Any reason for not using __builtin_clz on x86 as well? AFAICT both gcc
> and clang seem to generate more or less the same code with that as with
> the inline assembly.
>
>
I guess not. According to 
http://stackoverflow.com/questions/9353973/implementation-of-builtin-clz 
"bsr and clz are related but different.

On x86 for clz gcc (-O2) generates:

|bsrl %edi, %eax xorl $31, %eax ret " |


[-- Attachment #1.2: Type: text/html, Size: 2084 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h
       [not found]                         ` <583F11F6.7000005-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
@ 2016-12-01  0:57                           ` Michel Dänzer
       [not found]                             ` <b61bde0b-5120-5eaa-4243-18e30091ef86-otUistvHUpPR7s880joybQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Michel Dänzer @ 2016-12-01  0:57 UTC (permalink / raw)
  To: Jochen Rollwagen; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 01/12/16 02:52 AM, Jochen Rollwagen wrote:
> Am 29.11.2016 um 08:32 schrieb Michel Dänzer:
>> On 29/11/16 03:18 AM, Jochen Rollwagen wrote:
>>> This commit replaces the loop for calculating log base 2 for
>>> non-x86-platforms in radeon.h with a clz (count leading zeroes)-based
>>> version to simplify the code and, well, eliminate the loop.
>>> Note: There’s no check for val=0 case, since x86-bsr is undefined for
>>> that case too, that should be okay.
>>> ---
>>>  src/radeon.h |    7 +++----
>>>  1 file changed, 3 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/src/radeon.h b/src/radeon.h
>>> index cbc7866..b1a1ce0 100644
>>> --- a/src/radeon.h
>>> +++ b/src/radeon.h
>>> @@ -933,17 +933,16 @@ enum {
>>>  static __inline__ int
>>>  RADEONLog2(int val)
>>>  {
>>> -    int bits;
>>>  #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
>>> +    int bits;
>>> +
>>>      __asm volatile("bsrl    %1, %0"
>>>          : "=r" (bits)
>>>          : "c" (val)
>>>      );
>>>      return bits;
>>>  #else
>>> -    for (bits = 0; val != 0; val >>= 1, ++bits)
>>> -        ;
>>> -    return bits - 1;
>>> +    return (31 - __builtin_clz(val));
>>>  #endif
>>>  }
>> Any reason for not using __builtin_clz on x86 as well? AFAICT both gcc
>> and clang seem to generate more or less the same code with that as with
>> the inline assembly.
>>
>>
> I guess not. According to
> http://stackoverflow.com/questions/9353973/implementation-of-builtin-clz
> "bsr and clz are related but different.
> 
> On x86 for clz gcc (-O2) generates:
> 
> bsrl %edi, %eax
> xorl $31, %eax
> ret "

That's not what I'm seeing. Have you compared the code generated by your
compiler in both cases?


-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h
       [not found]                             ` <b61bde0b-5120-5eaa-4243-18e30091ef86-otUistvHUpPR7s880joybQ@public.gmane.org>
@ 2016-12-02  9:31                               ` Jochen Rollwagen
  0 siblings, 0 replies; 9+ messages in thread
From: Jochen Rollwagen @ 2016-12-02  9:31 UTC (permalink / raw)
  To: Michel Dänzer; +Cc: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 01.12.2016 um 01:57 schrieb Michel Dänzer:
> On 01/12/16 02:52 AM, Jochen Rollwagen wrote:
>> Am 29.11.2016 um 08:32 schrieb Michel Dänzer:
>>> On 29/11/16 03:18 AM, Jochen Rollwagen wrote:
>>>> This commit replaces the loop for calculating log base 2 for
>>>> non-x86-platforms in radeon.h with a clz (count leading zeroes)-based
>>>> version to simplify the code and, well, eliminate the loop.
>>>> Note: There’s no check for val=0 case, since x86-bsr is undefined for
>>>> that case too, that should be okay.
>>>> ---
>>>>   src/radeon.h |    7 +++----
>>>>   1 file changed, 3 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/src/radeon.h b/src/radeon.h
>>>> index cbc7866..b1a1ce0 100644
>>>> --- a/src/radeon.h
>>>> +++ b/src/radeon.h
>>>> @@ -933,17 +933,16 @@ enum {
>>>>   static __inline__ int
>>>>   RADEONLog2(int val)
>>>>   {
>>>> -    int bits;
>>>>   #if (defined __i386__ || defined __x86_64__) && (defined __GNUC__)
>>>> +    int bits;
>>>> +
>>>>       __asm volatile("bsrl    %1, %0"
>>>>           : "=r" (bits)
>>>>           : "c" (val)
>>>>       );
>>>>       return bits;
>>>>   #else
>>>> -    for (bits = 0; val != 0; val >>= 1, ++bits)
>>>> -        ;
>>>> -    return bits - 1;
>>>> +    return (31 - __builtin_clz(val));
>>>>   #endif
>>>>   }
>>> Any reason for not using __builtin_clz on x86 as well? AFAICT both gcc
>>> and clang seem to generate more or less the same code with that as with
>>> the inline assembly.
>>>
>>>
>> I guess not. According to
>> http://stackoverflow.com/questions/9353973/implementation-of-builtin-clz
>> "bsr and clz are related but different.
>>
>> On x86 for clz gcc (-O2) generates:
>>
>> bsrl %edi, %eax
>> xorl $31, %eax
>> ret "
> That's not what I'm seeing. Have you compared the code generated by your
> compiler in both cases?
>
>
Well, I'm on a powerpc platform, the generated code looks like this:

RADEONLog2:
     stwu %r1,-16(%r1)
     cntlzw %r3,%r3
     subfic %r3,%r3,31
     addi %r1,%r1,16
     blr

Straightforward enough.

But i guess you’re right, the generated code is similar enough to 
justify a general approach without platform-specific inline assembler 
and leaving the rest to the compiler.

  I'll post V2 of the patch.


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-12-02  9:31 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-24 18:00 [PATCH] xf86-video-ati: Fix build for xserver < 1.13 Jochen Rollwagen
     [not found] ` <58372ADA.8040009-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
2016-11-25  9:14   ` Michel Dänzer
     [not found]     ` <dd911a26-9976-6570-f5ac-5efe67e27928-otUistvHUpPR7s880joybQ@public.gmane.org>
2016-11-25 10:00       ` [PATCH xf86-video-ati] Use finer-grained pointer types in mem copying functions Jochen Rollwagen
     [not found]         ` <58380BA2.3060006-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
2016-11-28  9:43           ` Michel Dänzer
     [not found]             ` <f3c04cd0-06b0-85c4-48d0-1bfbdd98dfdf-otUistvHUpPR7s880joybQ@public.gmane.org>
2016-11-28 18:18               ` [PATCH xf86-video-ati] Replace loop with clz to calculate log base 2 on non-x86 platforms in radeon.h Jochen Rollwagen
     [not found]                 ` <583C74DA.2070701-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
2016-11-29  7:32                   ` Michel Dänzer
     [not found]                     ` <e3556fa8-39eb-6b1d-9d71-316c23bff9ea-otUistvHUpPR7s880joybQ@public.gmane.org>
2016-11-30 17:52                       ` Jochen Rollwagen
     [not found]                         ` <583F11F6.7000005-zqRNUXuvxA0b1SvskN2V4Q@public.gmane.org>
2016-12-01  0:57                           ` Michel Dänzer
     [not found]                             ` <b61bde0b-5120-5eaa-4243-18e30091ef86-otUistvHUpPR7s880joybQ@public.gmane.org>
2016-12-02  9:31                               ` Jochen Rollwagen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.