Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction

From: Stefan Brankovic <stefan.brankovic@rt-rk.com>
To: Richard Henderson <richard.henderson@linaro.org>, qemu-devel@nongnu.org
Cc: david@gibson.dropbear.id.au
Subject: Re: [Qemu-devel] [PATCH v6 1/3] target/ppc: Optimize emulation of vpkpx instruction
Date: Thu, 29 Aug 2019 15:34:24 +0200	[thread overview]
Message-ID: <bdc9fb4f-b247-5919-c691-62f46f14360f@rt-rk.com> (raw)
In-Reply-To: <64b614b6-cb05-bd16-dd0b-1ffbdc7db94a@linaro.org>

[-- Attachment #1: Type: text/plain, Size: 5764 bytes --]

On 27.8.19. 20:52, Richard Henderson wrote:
> On 8/27/19 2:37 AM, Stefan Brankovic wrote:
>> +    for (i = 0; i < 4; i++) {
>> +        switch (i) {
>> +        case 0:
>> +            /*
>> +             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
>> +             * 1 and 2.
>> +             */
>> +            get_avr64(avr, VA, true);
>> +            tcg_gen_movi_i64(result, 0x0ULL);
>> +            break;
>> +        case 1:
>> +            /*
>> +             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
>> +             * 3 and 4.
>> +             */
>> +            get_avr64(avr, VA, false);
>> +            break;
>> +        case 2:
>> +            /*
>> +             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
>> +             * 5 and 6.
>> +             */
>> +            get_avr64(avr, VB, true);
>> +            tcg_gen_movi_i64(result, 0x0ULL);
>> +            break;
>> +        case 3:
>> +            /*
>> +             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
>> +             * 7 and 8.
>> +             */
>> +            get_avr64(avr, VB, false);
>> +            break;
>> +        }
>> +        /* Perform the packing for 2 pixels(each iteration for 1). */
>> +        tcg_gen_movi_i64(tmp, 0x0ULL);
>> +        for (j = 0; j < 2; j++) {
>> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
>> +            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
>> +            tcg_gen_or_i64(tmp, tmp, shifted);
>> +
>> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
>> +            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
>> +            tcg_gen_or_i64(tmp, tmp, shifted);
>> +
>> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
>> +            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
>> +            tcg_gen_or_i64(tmp, tmp, shifted);
>> +        }
>> +        if ((i == 0) || (i == 2)) {
>> +            tcg_gen_shli_i64(tmp, tmp, 32);
>> +        }
>> +        tcg_gen_or_i64(result, result, tmp);
>> +        if (i == 1) {
>> +            /* Place packed pixels 1:4 to high doubleword of vD. */
>> +            tcg_gen_mov_i64(result1, result);
>> +        }
>> +        if (i == 3) {
>> +            /* Place packed pixels 5:8 to low doubleword of vD. */
>> +            tcg_gen_mov_i64(result2, result);
>> +        }
>> +    }
>> +    set_avr64(VT, result1, true);
>> +    set_avr64(VT, result2, false);
> I really have a hard time believing that it is worthwhile to inline all of this
> code.  By my count this is 82 non-move opcodes.  That is a *lot* of inline
> expansion.
>
> However, I can well imagine that the existing out-of-line helper is less than
> optimal.
>
>> -void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
>> -{
>> -    int i, j;
>> -    ppc_avr_t result;
>> -#if defined(HOST_WORDS_BIGENDIAN)
>> -    const ppc_avr_t *x[2] = { a, b };
>> -#else
>> -    const ppc_avr_t *x[2] = { b, a };
>> -#endif
>> -
>> -    VECTOR_FOR_INORDER_I(i, u64) {
>> -        VECTOR_FOR_INORDER_I(j, u32) {
>> -            uint32_t e = x[i]->u32[j];
> Double indirect loads?
>
>> -
>> -            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
>> -                                     ((e >> 6) & 0x3e0) |
>> -                                     ((e >> 3) & 0x1f));
> Store to temporary ...
>
>> -        }
>> -    }
>> -    *r = result;
> ... and then copy?
>
> Try replacing the existing helper with something like the following.
>
>
> r~
>
>
>
> static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
> {
>      uint64_t r;
>
>      r  = ((a >> (shr + 9)) & 0x3f) << shl;
>      r |= ((a >> (shr + 6)) & 0x1f) << shl;
>      r |= ((a >> (shr + 3)) & 0x1f) << shl;
>
>      return r;
> }
>
> static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
> {
>      return pkpx_1(ah, 32, 48)
>           | pkpx_1(ah,  0, 32)
>           | pkpx_1(al, 32, 16)
>           | pkpx_1(al,  0,  0);
> }
>
> void helper_vpkpx(uint64_t *r, uint64_t *a, uint64_t *b)
> {
>      uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
>      uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
>      r->VsrD(0) = rh;
>      r->VsrD(1) = rl;
> }

I implemented vpkpx as you suggested above, with small modifications (so
that it builds and gives the correct result). It looks like this:

/*
 * Pack a single pixel out of a 64-bit doubleword.
 *
 * a:   doubleword holding the source pixel(s)
 * shr: bit offset added to each right shift to select the pixel within 'a'
 *      (the callers in this message pass 0 or 32)
 * shl: bit position at which the packed 16-bit result is placed
 *
 * Three fields are taken from 'a' (shifted right by shr + 9, shr + 6 and
 * shr + 3, masked with 0xfc00, 0x3e0 and 0x1f respectively), merged into one
 * 16-bit value and moved left by 'shl' so that several results can be ORed
 * together by the caller.
 */
static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    const uint64_t top    = (a >> (shr + 9)) & 0xfc00;
    const uint64_t middle = (a >> (shr + 6)) & 0x3e0;
    const uint64_t bottom = (a >> (shr + 3)) & 0x1f;

    /* The masks do not overlap, so one shift of the merged value suffices. */
    return (top | middle | bottom) << shl;
}

/*
 * Pack the four pixels held in two doublewords into a single doubleword.
 *
 * ah: doubleword whose two pixels land in bits 63..32 of the result
 * al: doubleword whose two pixels land in bits 31..0 of the result
 *
 * Each call to pkpx_1() yields one 16-bit packed pixel already shifted to
 * its final position (48, 32, 16 and 0), so the pieces are simply ORed.
 */
static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    uint64_t packed = 0;

    packed |= pkpx_1(ah, 32, 48);
    packed |= pkpx_1(ah,  0, 32);
    packed |= pkpx_1(al, 32, 16);
    packed |= pkpx_1(al,  0,  0);

    return packed;
}

/*
 * Out-of-line helper for vpkpx: pack the pixels of vectors 'a' and 'b'
 * into vector 'r'.
 *
 * The packed pixels of 'a' are stored in r->u64[1] and those of 'b' in
 * r->u64[0]; within each source, u64[1] supplies the upper half of the
 * packed doubleword and u64[0] the lower half.
 *
 * NOTE(review): indexing u64[] directly like this is expected to be correct
 * only on little-endian hosts; big-endian hosts need the indices adjusted.
 * Confirm before using this outside of a little-endian benchmark.
 */
void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    /* Both sources are read in full before anything is written to 'r'. */
    const uint64_t packed_a = pkpx_2(a->u64[1], a->u64[0]);
    const uint64_t packed_b = pkpx_2(b->u64[1], b->u64[0]);

    r->u64[0] = packed_b;
    r->u64[1] = packed_a;
}

I also noticed that this would work only for little-endian hosts, so we
would need to modify it in order to support big-endian hosts (this
shouldn't affect the performance results).

Then I ran my performance tests and got the following results (the test
calls vpkpx 100000 times):

1) Current helper implementation: ~157 ms

2) Helper implementation you suggested: ~94 ms

3) TCG implementation: ~75 ms

The attached file contains assembly code for both the current implementation
and the implementation you suggested, so please take a look at that as well.

Kind Regards,

Stefan

[-- Attachment #2: vpkpx_assembly.txt --]
[-- Type: text/plain, Size: 23654 bytes --]

Current vpkpx implementation:

1) Both C and assembly code:

Dump of assembler code for function helper_vpkpx:
1267	{
   0x0000000000195fe0 <+0>:	48 83 ec 38	sub    $0x38,%rsp

1268	    int i, j;
1269	    ppc_avr_t result;
1270	#if defined(HOST_WORDS_BIGENDIAN)
1271	    const ppc_avr_t *x[2] = { a, b };
1272	#else
1273	    const ppc_avr_t *x[2] = { b, a };
   0x0000000000195fe4 <+4>:	b9 07 00 00 00	mov    $0x7,%ecx

1267	{
   0x0000000000195fe9 <+9>:	64 48 8b 04 25 28 00 00 00	mov    %fs:0x28,%rax
   0x0000000000195ff2 <+18>:	48 89 44 24 28	mov    %rax,0x28(%rsp)
   0x0000000000195ff7 <+23>:	31 c0	xor    %eax,%eax
   0x0000000000195ff9 <+25>:	4c 8d 4c 24 10	lea    0x10(%rsp),%r9

1268	    int i, j;
1269	    ppc_avr_t result;
1270	#if defined(HOST_WORDS_BIGENDIAN)
1271	    const ppc_avr_t *x[2] = { a, b };
1272	#else
1273	    const ppc_avr_t *x[2] = { b, a };
   0x0000000000195ffe <+30>:	48 89 54 24 10	mov    %rdx,0x10(%rsp)
   0x0000000000196003 <+35>:	48 89 74 24 18	mov    %rsi,0x18(%rsp)
   0x0000000000196008 <+40>:	44 8d 51 fc	lea    -0x4(%rcx),%r10d
   0x000000000019600c <+44>:	48 83 c6 0c	add    $0xc,%rsi

1278	            uint32_t e = x[i]->u32[j];
   0x0000000000196010 <+48>:	8b 06	mov    (%rsi),%eax

1279	
1280	            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
   0x0000000000196012 <+50>:	4c 63 d9	movslq %ecx,%r11
   0x0000000000196015 <+53>:	83 e9 01	sub    $0x1,%ecx
   0x0000000000196018 <+56>:	48 83 ee 04	sub    $0x4,%rsi
   0x000000000019601c <+60>:	89 c2	mov    %eax,%edx
   0x000000000019601e <+62>:	c1 ea 09	shr    $0x9,%edx
   0x0000000000196021 <+65>:	41 89 d0	mov    %edx,%r8d
   0x0000000000196024 <+68>:	89 c2	mov    %eax,%edx
   0x0000000000196026 <+70>:	c1 e8 03	shr    $0x3,%eax
   0x0000000000196029 <+73>:	c1 ea 06	shr    $0x6,%edx
   0x000000000019602c <+76>:	66 41 81 e0 00 fc	and    $0xfc00,%r8w
   0x0000000000196032 <+82>:	83 e0 1f	and    $0x1f,%eax
   0x0000000000196035 <+85>:	66 81 e2 e0 03	and    $0x3e0,%dx
   0x000000000019603a <+90>:	44 09 c2	or     %r8d,%edx
   0x000000000019603d <+93>:	09 d0	or     %edx,%eax

1277	        VECTOR_FOR_INORDER_I(j, u32) {
   0x000000000019603f <+95>:	41 39 ca	cmp    %ecx,%r10d

1279	
1280	            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
   0x0000000000196042 <+98>:	66 42 89 04 5c	mov    %ax,(%rsp,%r11,2)

1277	        VECTOR_FOR_INORDER_I(j, u32) {
   0x0000000000196047 <+103>:	75 c7	jne    0x196010 <helper_vpkpx+48>

1276	    VECTOR_FOR_INORDER_I(i, u64) {
   0x0000000000196049 <+105>:	41 83 fa ff	cmp    $0xffffffff,%r10d
   0x000000000019604d <+109>:	44 89 d1	mov    %r10d,%ecx
   0x0000000000196050 <+112>:	74 0e	je     0x196060 <helper_vpkpx+128>
   0x0000000000196052 <+114>:	49 8b 31	mov    (%r9),%rsi
   0x0000000000196055 <+117>:	49 83 e9 08	sub    $0x8,%r9
   0x0000000000196059 <+121>:	eb ad	jmp    0x196008 <helper_vpkpx+40>
   0x000000000019605b <+123>:	0f 1f 44 00 00	nopl   0x0(%rax,%rax,1)

1281	                                     ((e >> 6) & 0x3e0) |
1282	                                     ((e >> 3) & 0x1f));
1283	//            printf("%x\n",result.u16[4 * i + j]);
1284	        }
1285	    }
1286	//    printf("%lx\n",result.u64[0]);
1287	//    printf("%lx\n",result.u64[1]);
1288	    *r = result;
   0x0000000000196060 <+128>:	48 8b 04 24	mov    (%rsp),%rax
   0x0000000000196064 <+132>:	48 8b 54 24 08	mov    0x8(%rsp),%rdx
   0x0000000000196069 <+137>:	48 89 07	mov    %rax,(%rdi)
   0x000000000019606c <+140>:	48 89 57 08	mov    %rdx,0x8(%rdi)

1289	}
   0x0000000000196070 <+144>:	48 8b 44 24 28	mov    0x28(%rsp),%rax
   0x0000000000196075 <+149>:	64 48 33 04 25 28 00 00 00	xor    %fs:0x28,%rax
   0x000000000019607e <+158>:	75 05	jne    0x196085 <helper_vpkpx+165>
   0x0000000000196080 <+160>:	48 83 c4 38	add    $0x38,%rsp
   0x0000000000196084 <+164>:	c3	retq   
   0x0000000000196085 <+165>:	e8 2e 66 f0 ff	callq  0x9c6b8
End of assembler dump.

2) Only assembly code:

Dump of assembler code for function helper_vpkpx:
   0x0000000000195fe0 <+0>:	48 83 ec 38	sub    $0x38,%rsp
   0x0000000000195fe4 <+4>:	b9 07 00 00 00	mov    $0x7,%ecx
   0x0000000000195fe9 <+9>:	64 48 8b 04 25 28 00 00 00	mov    %fs:0x28,%rax
   0x0000000000195ff2 <+18>:	48 89 44 24 28	mov    %rax,0x28(%rsp)
   0x0000000000195ff7 <+23>:	31 c0	xor    %eax,%eax
   0x0000000000195ff9 <+25>:	4c 8d 4c 24 10	lea    0x10(%rsp),%r9
   0x0000000000195ffe <+30>:	48 89 54 24 10	mov    %rdx,0x10(%rsp)
   0x0000000000196003 <+35>:	48 89 74 24 18	mov    %rsi,0x18(%rsp)
   0x0000000000196008 <+40>:	44 8d 51 fc	lea    -0x4(%rcx),%r10d
   0x000000000019600c <+44>:	48 83 c6 0c	add    $0xc,%rsi
   0x0000000000196010 <+48>:	8b 06	mov    (%rsi),%eax
   0x0000000000196012 <+50>:	4c 63 d9	movslq %ecx,%r11
   0x0000000000196015 <+53>:	83 e9 01	sub    $0x1,%ecx
   0x0000000000196018 <+56>:	48 83 ee 04	sub    $0x4,%rsi
   0x000000000019601c <+60>:	89 c2	mov    %eax,%edx
   0x000000000019601e <+62>:	c1 ea 09	shr    $0x9,%edx
   0x0000000000196021 <+65>:	41 89 d0	mov    %edx,%r8d
   0x0000000000196024 <+68>:	89 c2	mov    %eax,%edx
   0x0000000000196026 <+70>:	c1 e8 03	shr    $0x3,%eax
   0x0000000000196029 <+73>:	c1 ea 06	shr    $0x6,%edx
   0x000000000019602c <+76>:	66 41 81 e0 00 fc	and    $0xfc00,%r8w
   0x0000000000196032 <+82>:	83 e0 1f	and    $0x1f,%eax
   0x0000000000196035 <+85>:	66 81 e2 e0 03	and    $0x3e0,%dx
   0x000000000019603a <+90>:	44 09 c2	or     %r8d,%edx
   0x000000000019603d <+93>:	09 d0	or     %edx,%eax
   0x000000000019603f <+95>:	41 39 ca	cmp    %ecx,%r10d
   0x0000000000196042 <+98>:	66 42 89 04 5c	mov    %ax,(%rsp,%r11,2)
   0x0000000000196047 <+103>:	75 c7	jne    0x196010 <helper_vpkpx+48>
   0x0000000000196049 <+105>:	41 83 fa ff	cmp    $0xffffffff,%r10d
   0x000000000019604d <+109>:	44 89 d1	mov    %r10d,%ecx
   0x0000000000196050 <+112>:	74 0e	je     0x196060 <helper_vpkpx+128>
   0x0000000000196052 <+114>:	49 8b 31	mov    (%r9),%rsi
   0x0000000000196055 <+117>:	49 83 e9 08	sub    $0x8,%r9
   0x0000000000196059 <+121>:	eb ad	jmp    0x196008 <helper_vpkpx+40>
   0x000000000019605b <+123>:	0f 1f 44 00 00	nopl   0x0(%rax,%rax,1)
   0x0000000000196060 <+128>:	48 8b 04 24	mov    (%rsp),%rax
   0x0000000000196064 <+132>:	48 8b 54 24 08	mov    0x8(%rsp),%rdx
   0x0000000000196069 <+137>:	48 89 07	mov    %rax,(%rdi)
   0x000000000019606c <+140>:	48 89 57 08	mov    %rdx,0x8(%rdi)
   0x0000000000196070 <+144>:	48 8b 44 24 28	mov    0x28(%rsp),%rax
   0x0000000000196075 <+149>:	64 48 33 04 25 28 00 00 00	xor    %fs:0x28,%rax
   0x000000000019607e <+158>:	75 05	jne    0x196085 <helper_vpkpx+165>
   0x0000000000196080 <+160>:	48 83 c4 38	add    $0x38,%rsp
   0x0000000000196084 <+164>:	c3	retq   
   0x0000000000196085 <+165>:	e8 2e 66 f0 ff	callq  0x9c6b8
End of assembler dump.

Implementation you suggested:

1) Both C and assembly code:

Dump of assembler code for function helper_vpkpx:
1313	{
   0x0000000000195fe0 <+0>:	55	push   %rbp
   0x0000000000195fe1 <+1>:	53	push   %rbx

1314	    uint64_t rh = pkpx_2(a->u64[1], a->u64[0]);
   0x0000000000195fe2 <+2>:	48 8b 46 08	mov    0x8(%rsi),%rax
   0x0000000000195fe6 <+6>:	48 8b 0e	mov    (%rsi),%rcx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000195fe9 <+9>:	49 89 c1	mov    %rax,%r9

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000195fec <+12>:	48 89 c6	mov    %rax,%rsi
   0x0000000000195fef <+15>:	49 89 c3	mov    %rax,%r11
   0x0000000000195ff2 <+18>:	48 c1 ee 29	shr    $0x29,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000195ff6 <+22>:	49 c1 e9 26	shr    $0x26,%r9

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000195ffa <+26>:	49 c1 eb 09	shr    $0x9,%r11
   0x0000000000195ffe <+30>:	81 e6 00 fc 00 00	and    $0xfc00,%esi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196004 <+36>:	41 81 e1 e0 03 00 00	and    $0x3e0,%r9d
   0x000000000019600b <+43>:	49 89 ca	mov    %rcx,%r10

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019600e <+46>:	49 89 f0	mov    %rsi,%r8

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196011 <+49>:	4c 89 ce	mov    %r9,%rsi

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196014 <+52>:	49 89 c1	mov    %rax,%r9
   0x0000000000196017 <+55>:	49 c1 e9 23	shr    $0x23,%r9

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019601b <+59>:	4c 09 c6	or     %r8,%rsi
   0x000000000019601e <+62>:	49 c1 ea 26	shr    $0x26,%r10

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196022 <+66>:	41 83 e1 1f	and    $0x1f,%r9d

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196026 <+70>:	41 81 e2 e0 03 00 00	and    $0x3e0,%r10d

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x000000000019602d <+77>:	49 09 f1	or     %rsi,%r9

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196030 <+80>:	4c 89 de	mov    %r11,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196033 <+83>:	49 89 c3	mov    %rax,%r11
   0x0000000000196036 <+86>:	49 c1 eb 06	shr    $0x6,%r11

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019603a <+90>:	81 e6 00 fc 00 00	and    $0xfc00,%esi

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196040 <+96>:	48 c1 e8 03	shr    $0x3,%rax

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196044 <+100>:	41 81 e3 e0 03 00 00	and    $0x3e0,%r11d

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x000000000019604b <+107>:	83 e0 1f	and    $0x1f,%eax

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019604e <+110>:	49 09 f3	or     %rsi,%r11

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196051 <+113>:	49 09 c3	or     %rax,%r11

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196054 <+116>:	48 89 c8	mov    %rcx,%rax
   0x0000000000196057 <+119>:	48 c1 e8 29	shr    $0x29,%rax
   0x000000000019605b <+123>:	25 00 fc 00 00	and    $0xfc00,%eax
   0x0000000000196060 <+128>:	48 89 c6	mov    %rax,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196063 <+131>:	4c 89 d0	mov    %r10,%rax

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196066 <+134>:	49 89 ca	mov    %rcx,%r10
   0x0000000000196069 <+137>:	49 c1 ea 23	shr    $0x23,%r10

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019606d <+141>:	48 09 f0	or     %rsi,%rax

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196070 <+144>:	41 83 e2 1f	and    $0x1f,%r10d
   0x0000000000196074 <+148>:	49 09 c2	or     %rax,%r10

1315	    uint64_t rl = pkpx_2(b->u64[1], b->u64[0]);
   0x0000000000196077 <+151>:	48 8b 02	mov    (%rdx),%rax
   0x000000000019607a <+154>:	48 8b 52 08	mov    0x8(%rdx),%rdx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019607e <+158>:	49 89 d0	mov    %rdx,%r8

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196081 <+161>:	48 89 d6	mov    %rdx,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196084 <+164>:	49 c1 e8 26	shr    $0x26,%r8

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196088 <+168>:	48 c1 ee 29	shr    $0x29,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019608c <+172>:	41 81 e0 e0 03 00 00	and    $0x3e0,%r8d

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196093 <+179>:	48 89 f3	mov    %rsi,%rbx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196096 <+182>:	4c 89 c6	mov    %r8,%rsi

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196099 <+185>:	49 89 d0	mov    %rdx,%r8

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x000000000019609c <+188>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960a2 <+194>:	49 c1 e8 23	shr    $0x23,%r8

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960a6 <+198>:	48 09 de	or     %rbx,%rsi

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960a9 <+201>:	41 83 e0 1f	and    $0x1f,%r8d
   0x00000000001960ad <+205>:	49 09 f0	or     %rsi,%r8

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960b0 <+208>:	48 89 d6	mov    %rdx,%rsi
   0x00000000001960b3 <+211>:	48 c1 ee 09	shr    $0x9,%rsi

1316	    r->u64[1] = rh;
   0x00000000001960b7 <+215>:	49 c1 e1 30	shl    $0x30,%r9
   0x00000000001960bb <+219>:	49 c1 e3 20	shl    $0x20,%r11

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960bf <+223>:	48 89 f3	mov    %rsi,%rbx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960c2 <+226>:	48 89 d6	mov    %rdx,%rsi

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960c5 <+229>:	48 c1 ea 03	shr    $0x3,%rdx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960c9 <+233>:	48 c1 ee 06	shr    $0x6,%rsi

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960cd <+237>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960d3 <+243>:	83 e2 1f	and    $0x1f,%edx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960d6 <+246>:	81 e6 e0 03 00 00	and    $0x3e0,%esi

1316	    r->u64[1] = rh;
   0x00000000001960dc <+252>:	49 c1 e2 10	shl    $0x10,%r10

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960e0 <+256>:	48 09 de	or     %rbx,%rsi

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960e3 <+259>:	48 89 c3	mov    %rax,%rbx

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x00000000001960e6 <+262>:	48 09 f2	or     %rsi,%rdx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960e9 <+265>:	48 89 c6	mov    %rax,%rsi

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960ec <+268>:	48 c1 eb 29	shr    $0x29,%rbx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960f0 <+272>:	48 c1 ee 26	shr    $0x26,%rsi

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x00000000001960f4 <+276>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x00000000001960fa <+282>:	81 e6 e0 03 00 00	and    $0x3e0,%esi

1296	    r  = ((a >> (shr + 9)) & 0xfc00);
   0x0000000000196100 <+288>:	48 89 dd	mov    %rbx,%rbp

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x0000000000196103 <+291>:	48 89 f3	mov    %rsi,%rbx

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196106 <+294>:	48 89 c6	mov    %rax,%rsi
   0x0000000000196109 <+297>:	48 c1 ee 23	shr    $0x23,%rsi

1297	    r |= ((a >> (shr + 6)) & 0x3e0);
   0x000000000019610d <+301>:	48 09 eb	or     %rbp,%rbx

1298	    r |= ((a >> (shr + 3)) & 0x1f);
   0x0000000000196110 <+304>:	83 e6 1f	and    $0x1f,%esi
   0x0000000000196113 <+307>:	48 09 de	or     %rbx,%rsi

1316	    r->u64[1] = rh;
   0x0000000000196116 <+310>:	48 89 cb	mov    %rcx,%rbx
   0x0000000000196119 <+313>:	48 c1 eb 09	shr    $0x9,%rbx
   0x000000000019611d <+317>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx
   0x0000000000196123 <+323>:	48 89 dd	mov    %rbx,%rbp
   0x0000000000196126 <+326>:	48 89 cb	mov    %rcx,%rbx
   0x0000000000196129 <+329>:	48 c1 e9 03	shr    $0x3,%rcx
   0x000000000019612d <+333>:	48 c1 eb 06	shr    $0x6,%rbx
   0x0000000000196131 <+337>:	83 e1 1f	and    $0x1f,%ecx
   0x0000000000196134 <+340>:	81 e3 e0 03 00 00	and    $0x3e0,%ebx
   0x000000000019613a <+346>:	48 09 eb	or     %rbp,%rbx
   0x000000000019613d <+349>:	48 09 d9	or     %rbx,%rcx
   0x0000000000196140 <+352>:	4c 09 c9	or     %r9,%rcx
   0x0000000000196143 <+355>:	4c 09 d9	or     %r11,%rcx
   0x0000000000196146 <+358>:	4c 09 d1	or     %r10,%rcx
   0x0000000000196149 <+361>:	48 89 4f 08	mov    %rcx,0x8(%rdi)

1317	    r->u64[0] = rl;
   0x000000000019614d <+365>:	48 89 c1	mov    %rax,%rcx
   0x0000000000196150 <+368>:	48 c1 e9 09	shr    $0x9,%rcx
   0x0000000000196154 <+372>:	81 e1 00 fc 00 00	and    $0xfc00,%ecx
   0x000000000019615a <+378>:	49 89 c9	mov    %rcx,%r9
   0x000000000019615d <+381>:	48 89 c1	mov    %rax,%rcx
   0x0000000000196160 <+384>:	48 c1 e9 06	shr    $0x6,%rcx
   0x0000000000196164 <+388>:	48 c1 e8 03	shr    $0x3,%rax
   0x0000000000196168 <+392>:	49 c1 e0 30	shl    $0x30,%r8
   0x000000000019616c <+396>:	81 e1 e0 03 00 00	and    $0x3e0,%ecx
   0x0000000000196172 <+402>:	83 e0 1f	and    $0x1f,%eax
   0x0000000000196175 <+405>:	48 c1 e2 20	shl    $0x20,%rdx
   0x0000000000196179 <+409>:	4c 09 c9	or     %r9,%rcx
   0x000000000019617c <+412>:	48 09 c8	or     %rcx,%rax
   0x000000000019617f <+415>:	48 89 f1	mov    %rsi,%rcx
   0x0000000000196182 <+418>:	4c 09 c0	or     %r8,%rax
   0x0000000000196185 <+421>:	48 c1 e1 10	shl    $0x10,%rcx
   0x0000000000196189 <+425>:	48 09 d0	or     %rdx,%rax
   0x000000000019618c <+428>:	48 09 c8	or     %rcx,%rax

1318	}
   0x000000000019618f <+431>:	5b	pop    %rbx

1317	    r->u64[0] = rl;
   0x0000000000196190 <+432>:	48 89 07	mov    %rax,(%rdi)

1318	}
   0x0000000000196193 <+435>:	5d	pop    %rbp
   0x0000000000196194 <+436>:	c3	retq   
End of assembler dump.

2) Only assembly code:

Dump of assembler code for function helper_vpkpx:
   0x0000000000195fe0 <+0>:	55	push   %rbp
   0x0000000000195fe1 <+1>:	53	push   %rbx
   0x0000000000195fe2 <+2>:	48 8b 46 08	mov    0x8(%rsi),%rax
   0x0000000000195fe6 <+6>:	48 8b 0e	mov    (%rsi),%rcx
   0x0000000000195fe9 <+9>:	49 89 c1	mov    %rax,%r9
   0x0000000000195fec <+12>:	48 89 c6	mov    %rax,%rsi
   0x0000000000195fef <+15>:	49 89 c3	mov    %rax,%r11
   0x0000000000195ff2 <+18>:	48 c1 ee 29	shr    $0x29,%rsi
   0x0000000000195ff6 <+22>:	49 c1 e9 26	shr    $0x26,%r9
   0x0000000000195ffa <+26>:	49 c1 eb 09	shr    $0x9,%r11
   0x0000000000195ffe <+30>:	81 e6 00 fc 00 00	and    $0xfc00,%esi
   0x0000000000196004 <+36>:	41 81 e1 e0 03 00 00	and    $0x3e0,%r9d
   0x000000000019600b <+43>:	49 89 ca	mov    %rcx,%r10
   0x000000000019600e <+46>:	49 89 f0	mov    %rsi,%r8
   0x0000000000196011 <+49>:	4c 89 ce	mov    %r9,%rsi
   0x0000000000196014 <+52>:	49 89 c1	mov    %rax,%r9
   0x0000000000196017 <+55>:	49 c1 e9 23	shr    $0x23,%r9
   0x000000000019601b <+59>:	4c 09 c6	or     %r8,%rsi
   0x000000000019601e <+62>:	49 c1 ea 26	shr    $0x26,%r10
   0x0000000000196022 <+66>:	41 83 e1 1f	and    $0x1f,%r9d
   0x0000000000196026 <+70>:	41 81 e2 e0 03 00 00	and    $0x3e0,%r10d
   0x000000000019602d <+77>:	49 09 f1	or     %rsi,%r9
   0x0000000000196030 <+80>:	4c 89 de	mov    %r11,%rsi
   0x0000000000196033 <+83>:	49 89 c3	mov    %rax,%r11
   0x0000000000196036 <+86>:	49 c1 eb 06	shr    $0x6,%r11
   0x000000000019603a <+90>:	81 e6 00 fc 00 00	and    $0xfc00,%esi
   0x0000000000196040 <+96>:	48 c1 e8 03	shr    $0x3,%rax
   0x0000000000196044 <+100>:	41 81 e3 e0 03 00 00	and    $0x3e0,%r11d
   0x000000000019604b <+107>:	83 e0 1f	and    $0x1f,%eax
   0x000000000019604e <+110>:	49 09 f3	or     %rsi,%r11
   0x0000000000196051 <+113>:	49 09 c3	or     %rax,%r11
   0x0000000000196054 <+116>:	48 89 c8	mov    %rcx,%rax
   0x0000000000196057 <+119>:	48 c1 e8 29	shr    $0x29,%rax
   0x000000000019605b <+123>:	25 00 fc 00 00	and    $0xfc00,%eax
   0x0000000000196060 <+128>:	48 89 c6	mov    %rax,%rsi
   0x0000000000196063 <+131>:	4c 89 d0	mov    %r10,%rax
   0x0000000000196066 <+134>:	49 89 ca	mov    %rcx,%r10
   0x0000000000196069 <+137>:	49 c1 ea 23	shr    $0x23,%r10
   0x000000000019606d <+141>:	48 09 f0	or     %rsi,%rax
   0x0000000000196070 <+144>:	41 83 e2 1f	and    $0x1f,%r10d
   0x0000000000196074 <+148>:	49 09 c2	or     %rax,%r10
   0x0000000000196077 <+151>:	48 8b 02	mov    (%rdx),%rax
   0x000000000019607a <+154>:	48 8b 52 08	mov    0x8(%rdx),%rdx
   0x000000000019607e <+158>:	49 89 d0	mov    %rdx,%r8
   0x0000000000196081 <+161>:	48 89 d6	mov    %rdx,%rsi
   0x0000000000196084 <+164>:	49 c1 e8 26	shr    $0x26,%r8
   0x0000000000196088 <+168>:	48 c1 ee 29	shr    $0x29,%rsi
   0x000000000019608c <+172>:	41 81 e0 e0 03 00 00	and    $0x3e0,%r8d
   0x0000000000196093 <+179>:	48 89 f3	mov    %rsi,%rbx
   0x0000000000196096 <+182>:	4c 89 c6	mov    %r8,%rsi
   0x0000000000196099 <+185>:	49 89 d0	mov    %rdx,%r8
   0x000000000019609c <+188>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx
   0x00000000001960a2 <+194>:	49 c1 e8 23	shr    $0x23,%r8
   0x00000000001960a6 <+198>:	48 09 de	or     %rbx,%rsi
   0x00000000001960a9 <+201>:	41 83 e0 1f	and    $0x1f,%r8d
   0x00000000001960ad <+205>:	49 09 f0	or     %rsi,%r8
   0x00000000001960b0 <+208>:	48 89 d6	mov    %rdx,%rsi
   0x00000000001960b3 <+211>:	48 c1 ee 09	shr    $0x9,%rsi
   0x00000000001960b7 <+215>:	49 c1 e1 30	shl    $0x30,%r9
   0x00000000001960bb <+219>:	49 c1 e3 20	shl    $0x20,%r11
   0x00000000001960bf <+223>:	48 89 f3	mov    %rsi,%rbx
   0x00000000001960c2 <+226>:	48 89 d6	mov    %rdx,%rsi
   0x00000000001960c5 <+229>:	48 c1 ea 03	shr    $0x3,%rdx
   0x00000000001960c9 <+233>:	48 c1 ee 06	shr    $0x6,%rsi
   0x00000000001960cd <+237>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx
   0x00000000001960d3 <+243>:	83 e2 1f	and    $0x1f,%edx
   0x00000000001960d6 <+246>:	81 e6 e0 03 00 00	and    $0x3e0,%esi
   0x00000000001960dc <+252>:	49 c1 e2 10	shl    $0x10,%r10
   0x00000000001960e0 <+256>:	48 09 de	or     %rbx,%rsi
   0x00000000001960e3 <+259>:	48 89 c3	mov    %rax,%rbx
   0x00000000001960e6 <+262>:	48 09 f2	or     %rsi,%rdx
   0x00000000001960e9 <+265>:	48 89 c6	mov    %rax,%rsi
   0x00000000001960ec <+268>:	48 c1 eb 29	shr    $0x29,%rbx
   0x00000000001960f0 <+272>:	48 c1 ee 26	shr    $0x26,%rsi
   0x00000000001960f4 <+276>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx
   0x00000000001960fa <+282>:	81 e6 e0 03 00 00	and    $0x3e0,%esi
   0x0000000000196100 <+288>:	48 89 dd	mov    %rbx,%rbp
   0x0000000000196103 <+291>:	48 89 f3	mov    %rsi,%rbx
   0x0000000000196106 <+294>:	48 89 c6	mov    %rax,%rsi
   0x0000000000196109 <+297>:	48 c1 ee 23	shr    $0x23,%rsi
   0x000000000019610d <+301>:	48 09 eb	or     %rbp,%rbx
   0x0000000000196110 <+304>:	83 e6 1f	and    $0x1f,%esi
   0x0000000000196113 <+307>:	48 09 de	or     %rbx,%rsi
   0x0000000000196116 <+310>:	48 89 cb	mov    %rcx,%rbx
   0x0000000000196119 <+313>:	48 c1 eb 09	shr    $0x9,%rbx
   0x000000000019611d <+317>:	81 e3 00 fc 00 00	and    $0xfc00,%ebx
   0x0000000000196123 <+323>:	48 89 dd	mov    %rbx,%rbp
   0x0000000000196126 <+326>:	48 89 cb	mov    %rcx,%rbx
   0x0000000000196129 <+329>:	48 c1 e9 03	shr    $0x3,%rcx
   0x000000000019612d <+333>:	48 c1 eb 06	shr    $0x6,%rbx
   0x0000000000196131 <+337>:	83 e1 1f	and    $0x1f,%ecx
   0x0000000000196134 <+340>:	81 e3 e0 03 00 00	and    $0x3e0,%ebx
   0x000000000019613a <+346>:	48 09 eb	or     %rbp,%rbx
   0x000000000019613d <+349>:	48 09 d9	or     %rbx,%rcx
   0x0000000000196140 <+352>:	4c 09 c9	or     %r9,%rcx
   0x0000000000196143 <+355>:	4c 09 d9	or     %r11,%rcx
   0x0000000000196146 <+358>:	4c 09 d1	or     %r10,%rcx
   0x0000000000196149 <+361>:	48 89 4f 08	mov    %rcx,0x8(%rdi)
   0x000000000019614d <+365>:	48 89 c1	mov    %rax,%rcx
   0x0000000000196150 <+368>:	48 c1 e9 09	shr    $0x9,%rcx
   0x0000000000196154 <+372>:	81 e1 00 fc 00 00	and    $0xfc00,%ecx
   0x000000000019615a <+378>:	49 89 c9	mov    %rcx,%r9
   0x000000000019615d <+381>:	48 89 c1	mov    %rax,%rcx
   0x0000000000196160 <+384>:	48 c1 e9 06	shr    $0x6,%rcx
   0x0000000000196164 <+388>:	48 c1 e8 03	shr    $0x3,%rax
   0x0000000000196168 <+392>:	49 c1 e0 30	shl    $0x30,%r8
   0x000000000019616c <+396>:	81 e1 e0 03 00 00	and    $0x3e0,%ecx
   0x0000000000196172 <+402>:	83 e0 1f	and    $0x1f,%eax
   0x0000000000196175 <+405>:	48 c1 e2 20	shl    $0x20,%rdx
   0x0000000000196179 <+409>:	4c 09 c9	or     %r9,%rcx
   0x000000000019617c <+412>:	48 09 c8	or     %rcx,%rax
   0x000000000019617f <+415>:	48 89 f1	mov    %rsi,%rcx
   0x0000000000196182 <+418>:	4c 09 c0	or     %r8,%rax
   0x0000000000196185 <+421>:	48 c1 e1 10	shl    $0x10,%rcx
   0x0000000000196189 <+425>:	48 09 d0	or     %rdx,%rax
   0x000000000019618c <+428>:	48 09 c8	or     %rcx,%rax
   0x000000000019618f <+431>:	5b	pop    %rbx
   0x0000000000196190 <+432>:	48 89 07	mov    %rax,(%rdi)
   0x0000000000196193 <+435>:	5d	pop    %rbp
   0x0000000000196194 <+436>:	c3	retq   
End of assembler dump.