inlined asm x86-64 COPY_DWORDS macro

* inlined asm x86-64 COPY_DWORDS macro
@ 2010-04-19 19:05 Conn Clark
  2010-04-19 19:15 ` Matt Turner
  0 siblings, 1 reply; 3+ messages in thread
From: Conn Clark @ 2010-04-19 19:05 UTC (permalink / raw)
  To: dri-devel

Hello everybody,

Here is an inlined asm x86-64 COPY_DWORDS macro I wrote, in case
anybody would like to use it. It could be slightly improved by aligning
writes to 16-byte boundaries, but it's pretty near optimal when writing
to uncached RAM.

#ifdef USE_X86_64_ASM
/*
 * COPY_DWORDS(dst, src, nr) -- copy nr 32-bit words from src to dst.
 *
 *   dst  destination pointer (converted to uint32_t *)
 *   src  source pointer (converted to const uint32_t *)
 *   nr   number of 32-bit words to copy (converted to uint32_t);
 *        nr == 0 copies nothing
 *
 * Each argument is evaluated exactly once.  The regions are read and
 * written front to back, so overlapping buffers are not supported.
 *
 * Layout of the copy:
 *   - if bit 2 of the destination address is set, one dword is copied
 *     first so the following 64-bit stores start 8 bytes further aligned;
 *   - the bulk is copied 32 bytes (8 dwords) per loop iteration using
 *     64-bit moves through r8/r9;
 *   - the remaining 0..7 dwords are copied as 16-, 8- and 4-byte pieces,
 *     selected by the low three bits of the count saved in eax.
 *
 * The asm advances rsi/rdi and consumes ecx, so all three are declared as
 * read-write ("+") operands, and "memory" is clobbered because the asm
 * reads and writes memory the compiler cannot see through the operands.
 */
#define COPY_DWORDS( dst, src, nr )                                     \
do {                                                                    \
   const uint32_t *_src_ptr = (const uint32_t *)(src);                  \
   uint32_t *_dest_ptr = (uint32_t *)(dst);                             \
   uint32_t _size = (uint32_t)(nr);                                     \
                                                                        \
   /* A zero count must be a no-op: the alignment prologue below    */  \
   /* would otherwise copy one dword and wrap the counter around.   */  \
   if (_size != 0) {                                                    \
      __asm__ __volatile__ (                                            \
         /* Prologue: single dword if dst has address bit 2 set. */     \
         "testb   $4, %%dil\n"                                          \
         "je      1f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "addq    $4, %%rsi\n"                                          \
         "movl    %%eax, (%%rdi)\n"                                     \
         "addq    $4, %%rdi\n"                                          \
         "decl    %%ecx\n"                                              \
         /* Keep the remaining count in eax for the tail; ecx       */  \
         /* becomes the number of 8-dword blocks.                   */  \
         "1:\n"                                                         \
         "movl    %%ecx, %%eax\n"                                       \
         "shrl    $3, %%ecx\n"                                          \
         "je      3f\n"                                                 \
         ".p2align 4\n"                                                 \
         /* Main loop: 32 bytes per iteration.  The movq stores     */  \
         /* after decl do not touch the flags, so jnz sees decl's.  */  \
         "2:\n"                                                         \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $32, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $32, %%rdi\n"                                         \
         "movq    -16(%%rsi), %%r8\n"                                   \
         "movq    -8(%%rsi), %%r9\n"                                    \
         "decl    %%ecx\n"                                              \
         "movq    %%r8, -16(%%rdi)\n"                                   \
         "movq    %%r9, -8(%%rdi)\n"                                    \
         "jnz     2b\n"                                                 \
         /* Tail: low three bits of the count select 4/2/1 dwords. */   \
         "3:\n"                                                         \
         "testb   $7, %%al\n"                                           \
         "je      6f\n"                                                 \
         "testb   $4, %%al\n"                                           \
         "je      4f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "movq    8(%%rsi), %%r9\n"                                     \
         "addq    $16, %%rsi\n"                                         \
         "movq    %%r8, (%%rdi)\n"                                      \
         "movq    %%r9, 8(%%rdi)\n"                                     \
         "addq    $16, %%rdi\n"                                         \
         "4:\n"                                                         \
         "testb   $2, %%al\n"                                           \
         "je      5f\n"                                                 \
         "movq    (%%rsi), %%r8\n"                                      \
         "addq    $8, %%rsi\n"                                          \
         "movq    %%r8, (%%rdi)\n"                                      \
         "addq    $8, %%rdi\n"                                          \
         "5:\n"                                                         \
         "testb   $1, %%al\n"                                           \
         "je      6f\n"                                                 \
         "movl    (%%rsi), %%eax\n"                                     \
         "movl    %%eax, (%%rdi)\n"                                     \
         "6:\n"                                                         \
         : "+c" (_size), "+S" (_src_ptr), "+D" (_dest_ptr)              \
         :                                                              \
         : "rax", "r8", "r9", "cc", "memory"                            \
         );                                                             \
   }                                                                    \
} while(0)
#endif

-- 

Conn O. Clark

Observation: In formal computer science advances are made
by standing on the shoulders of giants. Linux has proved
that if there are enough of you, you can advance just as
far by stepping on each other's toes.

^ permalink raw reply	[flat|nested] 3+ messages in thread