All of lore.kernel.org
 help / color / mirror / Atom feed
* inlined asm x86-64 COPY_DWORDS macro
@ 2010-04-19 19:05 Conn Clark
  2010-04-19 19:15 ` Matt Turner
  0 siblings, 1 reply; 3+ messages in thread
From: Conn Clark @ 2010-04-19 19:05 UTC (permalink / raw)
  To: dri-devel

Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by writing
to 16-byte boundaries, but it's pretty near optimal when writing to
uncached RAM.




#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) - copy nr 32-bit words from src to dst using
 * hand-written x86-64 assembly.
 *
 *   dst  destination pointer (converted to uint32_t *)
 *   src  source pointer (converted to const uint32_t *)
 *   nr   number of dwords to copy (converted to uint32_t); 0 copies nothing
 *
 * Each argument is evaluated exactly once and copied into a private local,
 * so the caller's expressions are never modified.
 *
 * Strategy:
 *   - if bit 2 of the destination address is set, one dword is copied first
 *     so that a 4-byte-aligned destination becomes 8-byte aligned;
 *   - 32-byte blocks are then copied with four 64-bit moves per iteration;
 *   - the low three bits of the remaining count select a 16-, 8- and
 *     4-byte tail.
 *
 * The template hard-codes %rsi, %rdi and %ecx and advances all three, so
 * they are declared as read-write ("+") operands.  "memory" is clobbered
 * because the asm loads and stores through the pointers behind the
 * compiler's back; "cc" because nearly every instruction changes the flags.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
   uint32_t *cd_dst_ = (uint32_t *)(dst);                               \
   const uint32_t *cd_src_ = (const uint32_t *)(src);                   \
   uint32_t cd_cnt_ = (nr);                                             \
                                                                        \
   /* A zero count must bypass the asm: the alignment prologue would */ \
   /* otherwise copy one dword and wrap the counter to 0xFFFFFFFF.   */ \
   if (cd_cnt_ != 0) {                                                  \
      __asm__ __volatile__ (                                            \
         /* prologue: align the destination to 8 bytes */               \
         "testb   $4, %%dil\n"                                          \
         "je      1f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "addq    $4, %%rsi\n"                                          \
         "movl    %%eax, (%%rdi)\n"                                     \
         "addq    $4, %%rdi\n"                                          \
         "decl    %%ecx\n"                                              \
         /* eax keeps the remaining count for the tail; ecx = blocks */ \
         "1:\n"                                                         \
         "movl    %%ecx, %%eax\n"                                       \
         "shrl    $3, %%ecx\n"                                          \
         "je      3f\n"                                                 \
         ".p2align 4\n"                                                 \
         /* main loop: 32 bytes (8 dwords) per iteration */             \
         "2:\n"                                                         \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $32, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $32, %%rdi\n"                                         \
         "movq    -16(%%rsi), %%r8\n"                                   \
         "movq    -8(%%rsi), %%r9\n"                                    \
         /* movq leaves the flags alone, so jnz still sees decl's ZF */ \
         "decl    %%ecx\n"                                              \
         "movq    %%r8, -16(%%rdi)\n"                                   \
         "movq    %%r9, -8(%%rdi)\n"                                    \
         "jnz     2b\n"                                                 \
         /* tail: 0..7 dwords left, chosen by the low bits of eax */    \
         "3:\n"                                                         \
         "testb   $7, %%al\n"                                           \
         "je      6f\n"                                                 \
         "testb   $4, %%al\n"                                           \
         "je      4f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $16, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $16, %%rdi\n"                                         \
         "4:\n"                                                         \
         "testb   $2, %%al\n"                                           \
         "je      5f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "addq    $8, %%rsi\n"                                          \
         "movq    %%r8, (%%rdi)\n"                                      \
         "addq    $8, %%rdi\n"                                          \
         "5:\n"                                                         \
         "testb   $1, %%al\n"                                           \
         "je      6f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "movl    %%eax, (%%rdi)\n"                                     \
         "6:\n"                                                         \
         : "+c" (cd_cnt_), "+S" (cd_src_), "+D" (cd_dst_)               \
         :                                                              \
         : "rax", "r8", "r9", "cc", "memory"                            \
      );                                                                \
   }                                                                    \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: inlined asm x86-64 COPY_DWORDS macro
  2010-04-19 19:05 inlined asm x86-64 COPY_DWORDS macro Conn Clark
@ 2010-04-19 19:15 ` Matt Turner
  2010-04-19 19:44   ` Conn Clark
  0 siblings, 1 reply; 3+ messages in thread
From: Matt Turner @ 2010-04-19 19:15 UTC (permalink / raw)
  To: Conn Clark; +Cc: dri-devel

On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark <conn.o.clark@gmail.com> wrote:
> Hello everybody,
>
> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
> anybody would like to use it. it could be slightly improved by writing
> to 16 byte boundaries but its pretty near optimal when writing to
> uncached ram.
>
>
>
>
> #ifdef USE_X86_64_ASM
> #define COPY_DWORDS( dst, src, nr )                                     \
> do {                                            \
> uint32_t * _src_ptr;                            \
> uint32_t * _dest_ptr;                           \
> uint32_t _size;                                 \
>                                                \
>  _dest_ptr = (uint32_t *)dst;                          \
>  _src_ptr = src;                               \
>  _size =nr;                                    \
>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>                "je     1f\n"                   \
>                "movl   (%%rsi), %%eax\n"       \
>                "addq   $4, %%rsi\n"            \
>                "movl   %%eax, (%%rdi)\n"       \
>                "addq   $4, %%rdi\n"            \
>                "decl   %%ecx\n"                \
> "1:             movl    %%ecx, %%eax\n"         \
>                "shrl   $3, %%ecx\n"            \
>                "je     3f\n"                   \
> ".p2align 4 \n"                                 \
> "2:             movq    (%%rsi), %%r8\n"        \
>                "movq   8(%%rsi), %%r9\n"       \
>                "addq   $32, %%rsi\n"           \
>                "movq   %%r8, (%%rdi)\n"        \
>                "movq   %%r9, 8(%%rdi)\n"       \
>                "addq   $32, %%rdi\n"           \
>                "movq   -16(%%rsi), %%r8\n"     \
>                "movq   -8(%%rsi), %%r9\n"      \
>                "decl   %%ecx\n"                \
>                "movq   %%r8, -16(%%rdi)\n"     \
>                "movq   %%r9, -8(%%rdi)\n"      \
>                "jnz    2b\n"                   \
> "3:             testb  $7, %%al\n"              \
>                "je     6f\n"                   \
>                "testb  $4, %%al\n"             \
>                "je     4f\n"                   \
>                "movq    (%%rsi), %%r8\n"       \
>                "movq    8(%%rsi), %%r9\n"      \
>                "addq    $16, %%rsi\n"          \
>                "movq   %%r8, (%%rdi)\n"        \
>                "movq   %%r9, 8(%%rdi)\n"       \
>                "addq    $16, %%rdi\n"          \
> "4:     testb   $2, %%al\n"                     \
>                "je      5f\n"                  \
>                "movq    (%%rsi), %%r8\n"       \
>                "addq   $8, %%rsi\n"            \
>                "movq  %%r8, (%%rdi)\n"         \
>                "addq    $8, %%rdi\n"           \
> "5:    testb   $1, %%al\n"                      \
>                "je     6f\n"                   \
>                "movl   (%%rsi), %%eax\n"       \
>                "movl   %%eax, (%%rdi)\n"       \
> "6: \n"                                         \
>                : "=%c" (_size)                 \
>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>                : "%eax", "%r8", "%r9"                  \
>                );                                      \
> } while(0)
> #endif
>
> --
>
> Conn O. Clark
>
> Observation: In formal computer science advances are made
> by standing on the shoulders of giants. Linux has proved
> that if there are enough of you, you can advance just as
> far by stepping on each others toes.
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>

I'm not familiar with this code, but isn't this something we should
just let gcc handle? It's pretty smart about inlining calls to memcpy.
(Certainly your code is an improvement over "rep ; movsl" or "for ( j
= 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];", but really, why bother
when gcc can do this in a much nicer, more maintainable way?)

Matt

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: inlined asm x86-64 COPY_DWORDS macro
  2010-04-19 19:15 ` Matt Turner
@ 2010-04-19 19:44   ` Conn Clark
  0 siblings, 0 replies; 3+ messages in thread
From: Conn Clark @ 2010-04-19 19:44 UTC (permalink / raw)
  To: Matt Turner; +Cc: dri-devel

On Mon, Apr 19, 2010 at 12:15 PM, Matt Turner <mattst88@gmail.com> wrote:
> On Mon, Apr 19, 2010 at 3:05 PM, Conn Clark <conn.o.clark@gmail.com> wrote:
>> Hello everybody,
>>
>> Here is an inlined asm X86-64 COPY_DWORDS macro I wrote in case
>> anybody would like to use it. it could be slightly improved by writing
>> to 16 byte boundaries but its pretty near optimal when writing to
>> uncached ram.
>>
>>
>>
>>
>> #ifdef USE_X86_64_ASM
>> #define COPY_DWORDS( dst, src, nr )                                     \
>> do {                                            \
>> uint32_t * _src_ptr;                            \
>> uint32_t * _dest_ptr;                           \
>> uint32_t _size;                                 \
>>                                                \
>>  _dest_ptr = (uint32_t *)dst;                          \
>>  _src_ptr = src;                               \
>>  _size =nr;                                    \
>>     __asm__ __volatile__ ("testb       $4, %%dil\n"    \
>>                "je     1f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "addq   $4, %%rsi\n"            \
>>                "movl   %%eax, (%%rdi)\n"       \
>>                "addq   $4, %%rdi\n"            \
>>                "decl   %%ecx\n"                \
>> "1:             movl    %%ecx, %%eax\n"         \
>>                "shrl   $3, %%ecx\n"            \
>>                "je     3f\n"                   \
>> ".p2align 4 \n"                                 \
>> "2:             movq    (%%rsi), %%r8\n"        \
>>                "movq   8(%%rsi), %%r9\n"       \
>>                "addq   $32, %%rsi\n"           \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq   $32, %%rdi\n"           \
>>                "movq   -16(%%rsi), %%r8\n"     \
>>                "movq   -8(%%rsi), %%r9\n"      \
>>                "decl   %%ecx\n"                \
>>                "movq   %%r8, -16(%%rdi)\n"     \
>>                "movq   %%r9, -8(%%rdi)\n"      \
>>                "jnz    2b\n"                   \
>> "3:             testb  $7, %%al\n"              \
>>                "je     6f\n"                   \
>>                "testb  $4, %%al\n"             \
>>                "je     4f\n"                   \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "movq    8(%%rsi), %%r9\n"      \
>>                "addq    $16, %%rsi\n"          \
>>                "movq   %%r8, (%%rdi)\n"        \
>>                "movq   %%r9, 8(%%rdi)\n"       \
>>                "addq    $16, %%rdi\n"          \
>> "4:     testb   $2, %%al\n"                     \
>>                "je      5f\n"                  \
>>                "movq    (%%rsi), %%r8\n"       \
>>                "addq   $8, %%rsi\n"            \
>>                "movq  %%r8, (%%rdi)\n"         \
>>                "addq    $8, %%rdi\n"           \
>> "5:    testb   $1, %%al\n"                      \
>>                "je     6f\n"                   \
>>                "movl   (%%rsi), %%eax\n"       \
>>                "movl   %%eax, (%%rdi)\n"       \
>> "6: \n"                                         \
>>                : "=%c" (_size)                 \
>>                : "%c" (_size), "S" (_src_ptr), "D" (_dest_ptr)         \
>>                : "%eax", "%r8", "%r9"                  \
>>                );                                      \
>> } while(0)
>> #endif
>>
>> --
>>
>> Conn O. Clark
>>
>> Observation: In formal computer science advances are made
>> by standing on the shoulders of giants. Linux has proved
>> that if there are enough of you, you can advance just as
>> far by stepping on each others toes.
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> I'm not familiar with this code, but isn't this something we should
> just let gcc handle? It's pretty smart about inlining calls to memcpy.
> (Certainly your code is an improvement over "rep ; movsl" or "for ( j
> = 0 ; j < nr ; j++ ) dst[j] = ((int *)src)[j];" but really, why bother
> when gcc can do this in a much nicer more maintainable way?
>
> Matt
>

Matt,

GCC inlines memcpy with a REP MOVQ preceded by alignment checks for
byte, word, and double word to align the destination on an 8-byte
boundary. It's also followed by further checks in case the number of
bytes is not evenly divisible by 4.

The inner loop is based on AMD's optimization guide's memcpy. This
also allows you to modify it to use the movnti instructions for
writing to the destination ram directly bypassing the cache if you
like.

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each others toes.
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2010-04-19 19:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-19 19:05 inlined asm x86-64 COPY_DWORDS macro Conn Clark
2010-04-19 19:15 ` Matt Turner
2010-04-19 19:44   ` Conn Clark

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.