All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
@ 2016-08-01  2:52 Mihai Donțu
  2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
                   ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01  2:52 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Jan Beulich

Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
---
Changed since v2:
 * minor change to make it more obvious what the code does

Changed since v1:
 * added a test for vmovq
 * made the tests depend on SSE and AVX, respectively
 * added emulator support for vmovq (0xd6 forces the operand size to
   64bit)
---
 tools/tests/x86_emulator/test_x86_emulator.c | 44 ++++++++++++++++++++++++++++
 xen/arch/x86/x86_emulate/x86_emulate.c       |  9 +++---
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
index c7f572a..8994149 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -697,6 +697,50 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing movq %%xmm0,32(%%eax)...");
+    if ( stack_exec && cpu_has_sse )
+    {
+        decl_insn(movq_to_mem2);
+
+        asm volatile ( "pcmpgtb %%xmm0, %%xmm0\n"
+                       put_insn(movq_to_mem2, "movq %%xmm0, 32(%%eax)")
+                       :: );
+
+        *((unsigned long *)res + 4) = 0xbdbdbdbdbdbdbdbd;
+        set_insn(movq_to_mem2);
+        regs.eax = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(movq_to_mem2) )
+            goto fail;
+        if ( *((unsigned long *)res + 4) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %%xmm1,32(%%eax)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovq_to_mem);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(vmovq_to_mem, "vmovq %%xmm1, 32(%%eax)")
+                       :: );
+
+        *((unsigned long *)res + 4) = 0xbdbdbdbdbdbdbdbd;
+        set_insn(vmovq_to_mem);
+        regs.eax = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovq_to_mem) )
+            goto fail;
+        if ( *((unsigned long *)res + 4) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index fe594ba..44de3b6 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -245,7 +245,7 @@ static uint8_t twobyte_table[256] = {
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xDF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0xE0 - 0xEF */
     0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0xF0 - 0xFF */
@@ -4412,6 +4412,7 @@ x86_emulate(
     case 0x7f: /* movq mm,mm/m64 */
                /* {,v}movdq{a,u} xmm,xmm/m128 */
                /* vmovdq{a,u} ymm,ymm/m256 */
+    case 0xd6: /* {,v}movq xmm,xmm/m64 */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4429,9 +4430,9 @@ x86_emulate(
             case vex_66:
             case vex_f3:
                 host_and_vcpu_must_have(sse2);
-                buf[0] = 0x66; /* movdqa */
+                buf[0] = 0x66; /* SSE */
                 get_fpu(X86EMUL_FPU_xmm, &fic);
-                ea.bytes = 16;
+                ea.bytes = (b == 0xd6 ? 8 : 16);
                 break;
             case vex_none:
                 if ( b != 0xe7 )
@@ -4451,7 +4452,7 @@ x86_emulate(
                     ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
-            ea.bytes = 16 << vex.l;
+            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
         }
         if ( ea.type == OP_MEM )
         {
-- 
2.9.2


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01  2:52 [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Mihai Donțu
@ 2016-08-01  2:52 ` Mihai Donțu
  2016-08-01  9:52   ` Andrew Cooper
  2016-08-01 13:38   ` Jan Beulich
  2016-08-01  2:52 ` [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64 Mihai Donțu
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01  2:52 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Zhi Wang, Jan Beulich

Found that Windows driver was using a SSE2 instruction MOVD.

Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
---
Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper

Changed since v2:
 * handle the case where the destination is a GPR
---
 xen/arch/x86/x86_emulate/x86_emulate.c | 38 +++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index 44de3b6..9f89ada 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
     /* 0x60 - 0x6F */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
     /* 0x70 - 0x7F */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,
     /* 0x80 - 0x87 */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -4409,6 +4409,10 @@ x86_emulate(
     case 0x6f: /* movq mm/m64,mm */
                /* {,v}movdq{a,u} xmm/m128,xmm */
                /* vmovdq{a,u} ymm/m256,ymm */
+    case 0x7e: /* movd mm,r/m32 */
+               /* movq mm,r/m64 */
+               /* {,v}movd xmm,r/m32 */
+               /* {,v}movq xmm,r/m64 */
     case 0x7f: /* movq mm,mm/m64 */
                /* {,v}movdq{a,u} xmm,xmm/m128 */
                /* vmovdq{a,u} ymm,ymm/m256 */
@@ -4432,7 +4436,17 @@ x86_emulate(
                 host_and_vcpu_must_have(sse2);
                 buf[0] = 0x66; /* SSE */
                 get_fpu(X86EMUL_FPU_xmm, &fic);
-                ea.bytes = (b == 0xd6 ? 8 : 16);
+                switch ( b )
+                {
+                case 0x7e:
+                    ea.bytes = 4;
+                    break;
+                case 0xd6:
+                    ea.bytes = 8;
+                    break;
+                default:
+                    ea.bytes = 16;
+                }
                 break;
             case vex_none:
                 if ( b != 0xe7 )
@@ -4452,7 +4466,17 @@ x86_emulate(
                     ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
-            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
+            switch ( b )
+            {
+            case 0x7e:
+                ea.bytes = 4;
+                break;
+            case 0xd6:
+                ea.bytes = 8;
+                break;
+            default:
+                ea.bytes = 16 << vex.l;
+            }
         }
         if ( ea.type == OP_MEM )
         {
@@ -4468,6 +4492,14 @@ x86_emulate(
             vex.b = 1;
             buf[4] &= 0x38;
         }
+        else if ( b == 0x7e )
+        {
+            /* convert the GPR destination to (%rAX) */
+            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            buf[4] &= 0x38;
+        }
         if ( !rc )
         {
            copy_REX_VEX(buf, rex_prefix, vex);
-- 
2.9.2


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64
  2016-08-01  2:52 [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Mihai Donțu
  2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
@ 2016-08-01  2:52 ` Mihai Donțu
  2016-08-01  9:54   ` Andrew Cooper
  2016-08-01  9:18 ` [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Andrew Cooper
  2016-08-01 13:19 ` Jan Beulich
  3 siblings, 1 reply; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01  2:52 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Jan Beulich

Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
---
Changed since v2:
 * added tests for {,v}movq xmm,r64
---
 tools/tests/x86_emulator/test_x86_emulator.c | 120 +++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
index 8994149..fb59b0f 100644
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -650,6 +650,88 @@ int main(int argc, char **argv)
 #define check_eip(which) (regs.eip == (unsigned long)instr + \
                                       (unsigned long)which##_len)
 
+    printf("%-40s", "Testing movd %%mm3,32(%%eax)...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movd_to_mem32);
+
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movd_to_mem32, "movd %%mm3, 32(%%eax)")
+                       :: );
+
+        *(res + 8) = 0xbdbdbdbd;
+        set_insn(movd_to_mem32);
+        regs.eax = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || *(res + 8) != 0xffffffff ||
+             !check_eip(movd_to_mem32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movd %%mm3,%%eax...");
+    if ( stack_exec && cpu_has_mmx )
+    {
+        decl_insn(movd_to_reg);
+
+        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
+                       put_insn(movd_to_reg, "movd %%mm3, %%eax")
+                       :: );
+
+        set_insn(movd_to_reg);
+        regs.rax = 0xbdbdbdbdbdbdbdbd;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || regs.eax != 0xbdbdbdbdffffffff ||
+             !check_eip(movd_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovd %%xmm1,32(%%eax)...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovd_to_mem32);
+
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+                       put_insn(vmovd_to_mem32, "vmovd %%xmm1, 32(%%eax)")
+                       :: );
+
+        *(res + 8) = 0xbdbdbdbd;
+        set_insn(vmovd_to_mem32);
+        regs.eax = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || *(res + 8) != 0xffffffff ||
+             !check_eip(vmovd_to_mem32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovd %%xmm1,%%eax...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovd_to_reg);
+
+        asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+                       put_insn(vmovd_to_reg, "vmovd %%xmm1, %%eax")
+                       :: );
+
+        set_insn(vmovd_to_reg);
+        regs.rax = 0xbdbdbdbdbdbdbdbd;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || regs.rax != 0xbdbdbdbdffffffff ||
+             !check_eip(vmovd_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -719,6 +801,25 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing movq %%xmm0,%%rax...");
+    if ( stack_exec && cpu_has_sse )
+    {
+        decl_insn(movq_to_reg);
+
+        asm volatile ( "pcmpgtb %%xmm0, %%xmm0\n"
+                       put_insn(movq_to_reg, "movq %%xmm0, %%rax")
+                       :: );
+
+        set_insn(movq_to_reg);
+        regs.rax = 0xbdbdbdbdbdbdbdbd;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || regs.rax || !check_eip(movq_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing vmovq %%xmm1,32(%%eax)...");
     if ( stack_exec && cpu_has_avx )
     {
@@ -741,6 +842,25 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovq %%xmm1,%%rax...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vmovq_to_reg);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(vmovq_to_reg, "vmovq %%xmm1, %%rax")
+                       :: );
+
+        set_insn(vmovq_to_reg);
+        regs.rax = 0xbdbdbdbdbdbdbdbd;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || regs.rax || !check_eip(vmovq_to_reg) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
-- 
2.9.2


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-01  2:52 [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Mihai Donțu
  2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
  2016-08-01  2:52 ` [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64 Mihai Donțu
@ 2016-08-01  9:18 ` Andrew Cooper
  2016-08-01 13:19 ` Jan Beulich
  3 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01  9:18 UTC (permalink / raw)
  To: Mihai Donțu, xen-devel; +Cc: Jan Beulich

On 01/08/16 03:52, Mihai Donțu wrote:
> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
@ 2016-08-01  9:52   ` Andrew Cooper
  2016-08-01 12:53     ` Mihai Donțu
  2016-08-01 13:38   ` Jan Beulich
  1 sibling, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01  9:52 UTC (permalink / raw)
  To: Mihai Donțu, xen-devel; +Cc: Zhi Wang, Jan Beulich

On 01/08/16 03:52, Mihai Donțu wrote:
> Found that Windows driver was using a SSE2 instruction MOVD.
>
> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> ---
> Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
>
> Changed since v2:
>  * handle the case where the destination is a GPR
> ---
>  xen/arch/x86/x86_emulate/x86_emulate.c | 38 +++++++++++++++++++++++++++++++---
>  1 file changed, 35 insertions(+), 3 deletions(-)
>
> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
> index 44de3b6..9f89ada 100644
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
>      /* 0x60 - 0x6F */
>      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>      /* 0x70 - 0x7F */
> -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,
>      /* 0x80 - 0x87 */
>      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> @@ -4409,6 +4409,10 @@ x86_emulate(
>      case 0x6f: /* movq mm/m64,mm */
>                 /* {,v}movdq{a,u} xmm/m128,xmm */
>                 /* vmovdq{a,u} ymm/m256,ymm */
> +    case 0x7e: /* movd mm,r/m32 */
> +               /* movq mm,r/m64 */
> +               /* {,v}movd xmm,r/m32 */
> +               /* {,v}movq xmm,r/m64 */
>      case 0x7f: /* movq mm,mm/m64 */
>                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>                 /* vmovdq{a,u} ymm,ymm/m256 */
> @@ -4432,7 +4436,17 @@ x86_emulate(
>                  host_and_vcpu_must_have(sse2);
>                  buf[0] = 0x66; /* SSE */
>                  get_fpu(X86EMUL_FPU_xmm, &fic);
> -                ea.bytes = (b == 0xd6 ? 8 : 16);
> +                switch ( b )
> +                {
> +                case 0x7e:
> +                    ea.bytes = 4;
> +                    break;
> +                case 0xd6:
> +                    ea.bytes = 8;
> +                    break;
> +                default:
> +                    ea.bytes = 16;
> +                }
>                  break;
>              case vex_none:
>                  if ( b != 0xe7 )
> @@ -4452,7 +4466,17 @@ x86_emulate(
>                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>              host_and_vcpu_must_have(avx);
>              get_fpu(X86EMUL_FPU_ymm, &fic);
> -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
> +            switch ( b )
> +            {
> +            case 0x7e:
> +                ea.bytes = 4;
> +                break;
> +            case 0xd6:
> +                ea.bytes = 8;
> +                break;
> +            default:
> +                ea.bytes = 16 << vex.l;
> +            }
>          }
>          if ( ea.type == OP_MEM )
>          {
> @@ -4468,6 +4492,14 @@ x86_emulate(
>              vex.b = 1;
>              buf[4] &= 0x38;
>          }
> +        else if ( b == 0x7e )
> +        {
> +            /* convert the GPR destination to (%rAX) */
> +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> +            rex_prefix &= ~REX_B;
> +            vex.b = 1;
> +            buf[4] &= 0x38;
> +        }

Thankyou for doing this.  However, looking at it, it has some code in
common with the "ea.type == OP_MEM" clause.

Would this work?

diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
b/xen/arch/x86/x86_emulate/x86_emulate.c
index fe594ba..90db067 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -4453,16 +4453,25 @@ x86_emulate(
             get_fpu(X86EMUL_FPU_ymm, &fic);
             ea.bytes = 16 << vex.l;
         }
-        if ( ea.type == OP_MEM )
+        if ( ea.type == OP_MEM || ea.type == OP_REG )
         {
-            /* XXX enable once there is ops->ea() or equivalent
-            generate_exception_if((vex.pfx == vex_66) &&
-                                  (ops->ea(ea.mem.seg, ea.mem.off)
-                                   & (ea.bytes - 1)), EXC_GP, 0); */
-            if ( b == 0x6f )
-                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
-                               ea.bytes, ctxt);
             /* convert memory operand to (%rAX) */
+
+            if ( ea.type == OP_MEM)
+            {
+                /* XXX enable once there is ops->ea() or equivalent
+                   generate_exception_if((vex.pfx == vex_66) &&
+                   (ops->ea(ea.mem.seg, ea.mem.off)
+                   & (ea.bytes - 1)), EXC_GP, 0); */
+                if ( b == 0x6f )
+                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
+                                   ea.bytes, ctxt);
+            }
+            else if ( ea.type == OP_REG )
+            {
+                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
+            }
+
             rex_prefix &= ~REX_B;
             vex.b = 1;
             buf[4] &= 0x38;


This is untested, but avoids duplicating this bit of state maniupulation.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64
  2016-08-01  2:52 ` [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64 Mihai Donțu
@ 2016-08-01  9:54   ` Andrew Cooper
  2016-08-01 12:46     ` Mihai Donțu
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01  9:54 UTC (permalink / raw)
  To: Mihai Donțu, xen-devel; +Cc: Jan Beulich

On 01/08/16 03:52, Mihai Donțu wrote:
> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> ---
> Changed since v2:
>  * added tests for {,v}movq xmm,r64
> ---
>  tools/tests/x86_emulator/test_x86_emulator.c | 120 +++++++++++++++++++++++++++
>  1 file changed, 120 insertions(+)
>
> diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
> index 8994149..fb59b0f 100644
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -650,6 +650,88 @@ int main(int argc, char **argv)
>  #define check_eip(which) (regs.eip == (unsigned long)instr + \
>                                        (unsigned long)which##_len)
>  
> +    printf("%-40s", "Testing movd %%mm3,32(%%eax)...");
> +    if ( stack_exec && cpu_has_mmx )
> +    {
> +        decl_insn(movd_to_mem32);
> +
> +        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
> +                       put_insn(movd_to_mem32, "movd %%mm3, 32(%%eax)")
> +                       :: );
> +
> +        *(res + 8) = 0xbdbdbdbd;
> +        set_insn(movd_to_mem32);
> +        regs.eax = (unsigned long)res;
> +        rc = x86_emulate(&ctxt, &emulops);
> +        if ( (rc != X86EMUL_OKAY) || *(res + 8) != 0xffffffff ||
> +             !check_eip(movd_to_mem32) )
> +            goto fail;
> +        printf("okay\n");
> +    }
> +    else
> +        printf("skipped\n");
> +
> +    printf("%-40s", "Testing movd %%mm3,%%eax...");

Could we possibly change this to %ebx instead of %eax?

That case is far more likely to go bang in a debug build if the emulator
is wrong.

Otherwise, Revewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64
  2016-08-01  9:54   ` Andrew Cooper
@ 2016-08-01 12:46     ` Mihai Donțu
  0 siblings, 0 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 12:46 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Jan Beulich, xen-devel

On Monday 01 August 2016 10:54:10 Andrew Cooper wrote:
> On 01/08/16 03:52, Mihai Donțu wrote:
> > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> > ---
> > Changed since v2:
> >  * added tests for {,v}movq xmm,r64
> > ---
> >  tools/tests/x86_emulator/test_x86_emulator.c | 120 +++++++++++++++++++++++++++
> >  1 file changed, 120 insertions(+)
> >
> > diff --git a/tools/tests/x86_emulator/test_x86_emulator.c b/tools/tests/x86_emulator/test_x86_emulator.c
> > index 8994149..fb59b0f 100644
> > --- a/tools/tests/x86_emulator/test_x86_emulator.c
> > +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> > @@ -650,6 +650,88 @@ int main(int argc, char **argv)
> >  #define check_eip(which) (regs.eip == (unsigned long)instr + \
> >                                        (unsigned long)which##_len)
> >  
> > +    printf("%-40s", "Testing movd %%mm3,32(%%eax)...");
> > +    if ( stack_exec && cpu_has_mmx )
> > +    {
> > +        decl_insn(movd_to_mem32);
> > +
> > +        asm volatile ( "pcmpeqb %%mm3, %%mm3\n"
> > +                       put_insn(movd_to_mem32, "movd %%mm3, 32(%%eax)")
> > +                       :: );
> > +
> > +        *(res + 8) = 0xbdbdbdbd;
> > +        set_insn(movd_to_mem32);
> > +        regs.eax = (unsigned long)res;
> > +        rc = x86_emulate(&ctxt, &emulops);
> > +        if ( (rc != X86EMUL_OKAY) || *(res + 8) != 0xffffffff ||
> > +             !check_eip(movd_to_mem32) )
> > +            goto fail;
> > +        printf("okay\n");
> > +    }
> > +    else
> > +        printf("skipped\n");
> > +
> > +    printf("%-40s", "Testing movd %%mm3,%%eax...");  
> 
> Could we possibly change this to %ebx instead of %eax?

Sure thing!

> That case is far more likely to go bang in a debug build if the emulator
> is wrong.
> 
> Otherwise, Revewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01  9:52   ` Andrew Cooper
@ 2016-08-01 12:53     ` Mihai Donțu
  2016-08-01 12:56       ` Mihai Donțu
                         ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 12:53 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Zhi Wang, Jan Beulich, xen-devel

On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:
> On 01/08/16 03:52, Mihai Donțu wrote:
> > Found that Windows driver was using a SSE2 instruction MOVD.
> >
> > Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> > ---
> > Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
> >
> > Changed since v2:
> >  * handle the case where the destination is a GPR
> > ---
> >  xen/arch/x86/x86_emulate/x86_emulate.c | 38 +++++++++++++++++++++++++++++++---
> >  1 file changed, 35 insertions(+), 3 deletions(-)
> >
> > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
> > index 44de3b6..9f89ada 100644
> > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> > @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
> >      /* 0x60 - 0x6F */
> >      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> >      /* 0x70 - 0x7F */
> > -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> > +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,
> >      /* 0x80 - 0x87 */
> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> > @@ -4409,6 +4409,10 @@ x86_emulate(
> >      case 0x6f: /* movq mm/m64,mm */
> >                 /* {,v}movdq{a,u} xmm/m128,xmm */
> >                 /* vmovdq{a,u} ymm/m256,ymm */
> > +    case 0x7e: /* movd mm,r/m32 */
> > +               /* movq mm,r/m64 */
> > +               /* {,v}movd xmm,r/m32 */
> > +               /* {,v}movq xmm,r/m64 */
> >      case 0x7f: /* movq mm,mm/m64 */
> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> >                 /* vmovdq{a,u} ymm,ymm/m256 */
> > @@ -4432,7 +4436,17 @@ x86_emulate(
> >                  host_and_vcpu_must_have(sse2);
> >                  buf[0] = 0x66; /* SSE */
> >                  get_fpu(X86EMUL_FPU_xmm, &fic);
> > -                ea.bytes = (b == 0xd6 ? 8 : 16);
> > +                switch ( b )
> > +                {
> > +                case 0x7e:
> > +                    ea.bytes = 4;
> > +                    break;
> > +                case 0xd6:
> > +                    ea.bytes = 8;
> > +                    break;
> > +                default:
> > +                    ea.bytes = 16;
> > +                }
> >                  break;
> >              case vex_none:
> >                  if ( b != 0xe7 )
> > @@ -4452,7 +4466,17 @@ x86_emulate(
> >                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
> >              host_and_vcpu_must_have(avx);
> >              get_fpu(X86EMUL_FPU_ymm, &fic);
> > -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
> > +            switch ( b )
> > +            {
> > +            case 0x7e:
> > +                ea.bytes = 4;
> > +                break;
> > +            case 0xd6:
> > +                ea.bytes = 8;
> > +                break;
> > +            default:
> > +                ea.bytes = 16 << vex.l;
> > +            }
> >          }
> >          if ( ea.type == OP_MEM )
> >          {
> > @@ -4468,6 +4492,14 @@ x86_emulate(
> >              vex.b = 1;
> >              buf[4] &= 0x38;
> >          }
> > +        else if ( b == 0x7e )
> > +        {
> > +            /* convert the GPR destination to (%rAX) */
> > +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> > +            rex_prefix &= ~REX_B;
> > +            vex.b = 1;
> > +            buf[4] &= 0x38;
> > +        }  
> 
> Thankyou for doing this.  However, looking at it, it has some code in
> common with the "ea.type == OP_MEM" clause.
> 
> Would this work?
> 
> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
> b/xen/arch/x86/x86_emulate/x86_emulate.c
> index fe594ba..90db067 100644
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -4453,16 +4453,25 @@ x86_emulate(
>              get_fpu(X86EMUL_FPU_ymm, &fic);
>              ea.bytes = 16 << vex.l;
>          }
> -        if ( ea.type == OP_MEM )
> +        if ( ea.type == OP_MEM || ea.type == OP_REG )
>          {
> -            /* XXX enable once there is ops->ea() or equivalent
> -            generate_exception_if((vex.pfx == vex_66) &&
> -                                  (ops->ea(ea.mem.seg, ea.mem.off)
> -                                   & (ea.bytes - 1)), EXC_GP, 0); */
> -            if ( b == 0x6f )
> -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> -                               ea.bytes, ctxt);
>              /* convert memory operand to (%rAX) */
> +
> +            if ( ea.type == OP_MEM)
> +            {
> +                /* XXX enable once there is ops->ea() or equivalent
> +                   generate_exception_if((vex.pfx == vex_66) &&
> +                   (ops->ea(ea.mem.seg, ea.mem.off)
> +                   & (ea.bytes - 1)), EXC_GP, 0); */
> +                if ( b == 0x6f )
> +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> +                                   ea.bytes, ctxt);
> +            }
> +            else if ( ea.type == OP_REG )
> +            {
> +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> +            }
> +
>              rex_prefix &= ~REX_B;
>              vex.b = 1;
>              buf[4] &= 0x38;
> 
> 
> This is untested, but avoids duplicating this bit of state maniupulation.

Your suggestion makes sense, but I'm starting to doubt my initial
patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
GPR-handling route and I can't seem to be able to easily prevent it
with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
to take a harder look at how that class of instructions is coded.

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 12:53     ` Mihai Donțu
@ 2016-08-01 12:56       ` Mihai Donțu
  2016-08-01 12:57       ` Andrew Cooper
  2016-08-01 12:59       ` Jan Beulich
  2 siblings, 0 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 12:56 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Zhi Wang, Jan Beulich, xen-devel

On Monday 01 August 2016 15:53:27 Mihai Donțu wrote:
> On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:
> > On 01/08/16 03:52, Mihai Donțu wrote:  
> > > Found that Windows driver was using a SSE2 instruction MOVD.
> > >
> > > Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> > > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> > > ---
> > > Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
> > >
> > > Changed since v2:
> > >  * handle the case where the destination is a GPR
> > > ---
> > >  xen/arch/x86/x86_emulate/x86_emulate.c | 38 +++++++++++++++++++++++++++++++---
> > >  1 file changed, 35 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
> > > index 44de3b6..9f89ada 100644
> > > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> > > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> > > @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
> > >      /* 0x60 - 0x6F */
> > >      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> > >      /* 0x70 - 0x7F */
> > > -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> > > +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,
> > >      /* 0x80 - 0x87 */
> > >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> > >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> > > @@ -4409,6 +4409,10 @@ x86_emulate(
> > >      case 0x6f: /* movq mm/m64,mm */
> > >                 /* {,v}movdq{a,u} xmm/m128,xmm */
> > >                 /* vmovdq{a,u} ymm/m256,ymm */
> > > +    case 0x7e: /* movd mm,r/m32 */
> > > +               /* movq mm,r/m64 */
> > > +               /* {,v}movd xmm,r/m32 */
> > > +               /* {,v}movq xmm,r/m64 */
> > >      case 0x7f: /* movq mm,mm/m64 */
> > >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> > >                 /* vmovdq{a,u} ymm,ymm/m256 */
> > > @@ -4432,7 +4436,17 @@ x86_emulate(
> > >                  host_and_vcpu_must_have(sse2);
> > >                  buf[0] = 0x66; /* SSE */
> > >                  get_fpu(X86EMUL_FPU_xmm, &fic);
> > > -                ea.bytes = (b == 0xd6 ? 8 : 16);
> > > +                switch ( b )
> > > +                {
> > > +                case 0x7e:
> > > +                    ea.bytes = 4;
> > > +                    break;
> > > +                case 0xd6:
> > > +                    ea.bytes = 8;
> > > +                    break;
> > > +                default:
> > > +                    ea.bytes = 16;
> > > +                }
> > >                  break;
> > >              case vex_none:
> > >                  if ( b != 0xe7 )
> > > @@ -4452,7 +4466,17 @@ x86_emulate(
> > >                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
> > >              host_and_vcpu_must_have(avx);
> > >              get_fpu(X86EMUL_FPU_ymm, &fic);
> > > -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
> > > +            switch ( b )
> > > +            {
> > > +            case 0x7e:
> > > +                ea.bytes = 4;
> > > +                break;
> > > +            case 0xd6:
> > > +                ea.bytes = 8;
> > > +                break;
> > > +            default:
> > > +                ea.bytes = 16 << vex.l;
> > > +            }
> > >          }
> > >          if ( ea.type == OP_MEM )
> > >          {
> > > @@ -4468,6 +4492,14 @@ x86_emulate(
> > >              vex.b = 1;
> > >              buf[4] &= 0x38;
> > >          }
> > > +        else if ( b == 0x7e )
> > > +        {
> > > +            /* convert the GPR destination to (%rAX) */
> > > +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> > > +            rex_prefix &= ~REX_B;
> > > +            vex.b = 1;
> > > +            buf[4] &= 0x38;
> > > +        }    
> > 
> > Thankyou for doing this.  However, looking at it, it has some code in
> > common with the "ea.type == OP_MEM" clause.
> > 
> > Would this work?
> > 
> > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
> > b/xen/arch/x86/x86_emulate/x86_emulate.c
> > index fe594ba..90db067 100644
> > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> > @@ -4453,16 +4453,25 @@ x86_emulate(
> >              get_fpu(X86EMUL_FPU_ymm, &fic);
> >              ea.bytes = 16 << vex.l;
> >          }
> > -        if ( ea.type == OP_MEM )
> > +        if ( ea.type == OP_MEM || ea.type == OP_REG )
> >          {
> > -            /* XXX enable once there is ops->ea() or equivalent
> > -            generate_exception_if((vex.pfx == vex_66) &&
> > -                                  (ops->ea(ea.mem.seg, ea.mem.off)
> > -                                   & (ea.bytes - 1)), EXC_GP, 0); */
> > -            if ( b == 0x6f )
> > -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> > -                               ea.bytes, ctxt);
> >              /* convert memory operand to (%rAX) */
> > +
> > +            if ( ea.type == OP_MEM)
> > +            {
> > +                /* XXX enable once there is ops->ea() or equivalent
> > +                   generate_exception_if((vex.pfx == vex_66) &&
> > +                   (ops->ea(ea.mem.seg, ea.mem.off)
> > +                   & (ea.bytes - 1)), EXC_GP, 0); */
> > +                if ( b == 0x6f )
> > +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> > +                                   ea.bytes, ctxt);
> > +            }
> > +            else if ( ea.type == OP_REG )
> > +            {
> > +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> > +            }
> > +
> >              rex_prefix &= ~REX_B;
> >              vex.b = 1;
> >              buf[4] &= 0x38;
> > 
> > 
> > This is untested, but avoids duplicating this bit of state maniupulation.  
> 
> Your suggestion makes sense, but I'm starting to doubt my initial
> patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> GPR-handling route and I can't seem to be able to easily prevent it
> with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> to take a harder look at how that class of instructions is coded.

Err, I meant "vmovq xmm1, xmm1".

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 12:53     ` Mihai Donțu
  2016-08-01 12:56       ` Mihai Donțu
@ 2016-08-01 12:57       ` Andrew Cooper
  2016-08-01 12:59       ` Jan Beulich
  2 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01 12:57 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Zhi Wang, Jan Beulich, xen-devel

On 01/08/16 13:53, Mihai Donțu wrote:
> On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:
>> On 01/08/16 03:52, Mihai Donțu wrote:
>>> Found that Windows driver was using a SSE2 instruction MOVD.
>>>
>>> Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
>>> Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
>>> ---
>>> Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
>>>
>>> Changed since v2:
>>>  * handle the case where the destination is a GPR
>>> ---
>>>  xen/arch/x86/x86_emulate/x86_emulate.c | 38 +++++++++++++++++++++++++++++++---
>>>  1 file changed, 35 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> index 44de3b6..9f89ada 100644
>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>>> @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
>>>      /* 0x60 - 0x6F */
>>>      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>>>      /* 0x70 - 0x7F */
>>> -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>>> +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,
>>>      /* 0x80 - 0x87 */
>>>      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>>>      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>>> @@ -4409,6 +4409,10 @@ x86_emulate(
>>>      case 0x6f: /* movq mm/m64,mm */
>>>                 /* {,v}movdq{a,u} xmm/m128,xmm */
>>>                 /* vmovdq{a,u} ymm/m256,ymm */
>>> +    case 0x7e: /* movd mm,r/m32 */
>>> +               /* movq mm,r/m64 */
>>> +               /* {,v}movd xmm,r/m32 */
>>> +               /* {,v}movq xmm,r/m64 */
>>>      case 0x7f: /* movq mm,mm/m64 */
>>>                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>>>                 /* vmovdq{a,u} ymm,ymm/m256 */
>>> @@ -4432,7 +4436,17 @@ x86_emulate(
>>>                  host_and_vcpu_must_have(sse2);
>>>                  buf[0] = 0x66; /* SSE */
>>>                  get_fpu(X86EMUL_FPU_xmm, &fic);
>>> -                ea.bytes = (b == 0xd6 ? 8 : 16);
>>> +                switch ( b )
>>> +                {
>>> +                case 0x7e:
>>> +                    ea.bytes = 4;
>>> +                    break;
>>> +                case 0xd6:
>>> +                    ea.bytes = 8;
>>> +                    break;
>>> +                default:
>>> +                    ea.bytes = 16;
>>> +                }
>>>                  break;
>>>              case vex_none:
>>>                  if ( b != 0xe7 )
>>> @@ -4452,7 +4466,17 @@ x86_emulate(
>>>                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>>>              host_and_vcpu_must_have(avx);
>>>              get_fpu(X86EMUL_FPU_ymm, &fic);
>>> -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
>>> +            switch ( b )
>>> +            {
>>> +            case 0x7e:
>>> +                ea.bytes = 4;
>>> +                break;
>>> +            case 0xd6:
>>> +                ea.bytes = 8;
>>> +                break;
>>> +            default:
>>> +                ea.bytes = 16 << vex.l;
>>> +            }
>>>          }
>>>          if ( ea.type == OP_MEM )
>>>          {
>>> @@ -4468,6 +4492,14 @@ x86_emulate(
>>>              vex.b = 1;
>>>              buf[4] &= 0x38;
>>>          }
>>> +        else if ( b == 0x7e )
>>> +        {
>>> +            /* convert the GPR destination to (%rAX) */
>>> +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>>> +            rex_prefix &= ~REX_B;
>>> +            vex.b = 1;
>>> +            buf[4] &= 0x38;
>>> +        }  
>> Thankyou for doing this.  However, looking at it, it has some code in
>> common with the "ea.type == OP_MEM" clause.
>>
>> Would this work?
>>
>> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
>> b/xen/arch/x86/x86_emulate/x86_emulate.c
>> index fe594ba..90db067 100644
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -4453,16 +4453,25 @@ x86_emulate(
>>              get_fpu(X86EMUL_FPU_ymm, &fic);
>>              ea.bytes = 16 << vex.l;
>>          }
>> -        if ( ea.type == OP_MEM )
>> +        if ( ea.type == OP_MEM || ea.type == OP_REG )
>>          {
>> -            /* XXX enable once there is ops->ea() or equivalent
>> -            generate_exception_if((vex.pfx == vex_66) &&
>> -                                  (ops->ea(ea.mem.seg, ea.mem.off)
>> -                                   & (ea.bytes - 1)), EXC_GP, 0); */
>> -            if ( b == 0x6f )
>> -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> -                               ea.bytes, ctxt);
>>              /* convert memory operand to (%rAX) */
>> +
>> +            if ( ea.type == OP_MEM)
>> +            {
>> +                /* XXX enable once there is ops->ea() or equivalent
>> +                   generate_exception_if((vex.pfx == vex_66) &&
>> +                   (ops->ea(ea.mem.seg, ea.mem.off)
>> +                   & (ea.bytes - 1)), EXC_GP, 0); */
>> +                if ( b == 0x6f )
>> +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> +                                   ea.bytes, ctxt);
>> +            }
>> +            else if ( ea.type == OP_REG )
>> +            {
>> +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>> +            }
>> +
>>              rex_prefix &= ~REX_B;
>>              vex.b = 1;
>>              buf[4] &= 0x38;
>>
>>
>> This is untested, but avoids duplicating this bit of state maniupulation.
> Your suggestion makes sense, but I'm starting to doubt my initial
> patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> GPR-handling route and I can't seem to be able to easily prevent it
> with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> to take a harder look at how that class of instructions is coded.
>

That is a very good point - my suggestion doesn't differentiate GPR
register destinations from non-GPR register destinations, and it is only
the GPR register destinations we want to turn into memory accesses

what about this?

else if ( ea.type == OP_REG && b == 0x7e )
{
    /* GPR register destination - point into the register block. */
    *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
}

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 12:53     ` Mihai Donțu
  2016-08-01 12:56       ` Mihai Donțu
  2016-08-01 12:57       ` Andrew Cooper
@ 2016-08-01 12:59       ` Jan Beulich
  2016-08-01 13:28         ` Mihai Donțu
  2 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 12:59 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, Zhi Wang, xen-devel

>>> On 01.08.16 at 14:53, <mdontu@bitdefender.com> wrote:
> On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:
>> On 01/08/16 03:52, Mihai Donțu wrote:
>> > Found that Windows driver was using a SSE2 instruction MOVD.
>> >
>> > Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
>> > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
>> > ---
>> > Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
>> >
>> > Changed since v2:
>> >  * handle the case where the destination is a GPR
>> > ---
>> >  xen/arch/x86/x86_emulate/x86_emulate.c | 38 
> +++++++++++++++++++++++++++++++---
>> >  1 file changed, 35 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c 
> b/xen/arch/x86/x86_emulate/x86_emulate.c
>> > index 44de3b6..9f89ada 100644
>> > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> > @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
>> >      /* 0x60 - 0x6F */
>> >      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>> >      /* 0x70 - 0x7F */
>> > -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>> > +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 
> ImplicitOps|ModRM,
>> >      /* 0x80 - 0x87 */
>> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>> > @@ -4409,6 +4409,10 @@ x86_emulate(
>> >      case 0x6f: /* movq mm/m64,mm */
>> >                 /* {,v}movdq{a,u} xmm/m128,xmm */
>> >                 /* vmovdq{a,u} ymm/m256,ymm */
>> > +    case 0x7e: /* movd mm,r/m32 */
>> > +               /* movq mm,r/m64 */
>> > +               /* {,v}movd xmm,r/m32 */
>> > +               /* {,v}movq xmm,r/m64 */
>> >      case 0x7f: /* movq mm,mm/m64 */
>> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>> >                 /* vmovdq{a,u} ymm,ymm/m256 */
>> > @@ -4432,7 +4436,17 @@ x86_emulate(
>> >                  host_and_vcpu_must_have(sse2);
>> >                  buf[0] = 0x66; /* SSE */
>> >                  get_fpu(X86EMUL_FPU_xmm, &fic);
>> > -                ea.bytes = (b == 0xd6 ? 8 : 16);
>> > +                switch ( b )
>> > +                {
>> > +                case 0x7e:
>> > +                    ea.bytes = 4;
>> > +                    break;
>> > +                case 0xd6:
>> > +                    ea.bytes = 8;
>> > +                    break;
>> > +                default:
>> > +                    ea.bytes = 16;
>> > +                }
>> >                  break;
>> >              case vex_none:
>> >                  if ( b != 0xe7 )
>> > @@ -4452,7 +4466,17 @@ x86_emulate(
>> >                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>> >              host_and_vcpu_must_have(avx);
>> >              get_fpu(X86EMUL_FPU_ymm, &fic);
>> > -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
>> > +            switch ( b )
>> > +            {
>> > +            case 0x7e:
>> > +                ea.bytes = 4;
>> > +                break;
>> > +            case 0xd6:
>> > +                ea.bytes = 8;
>> > +                break;
>> > +            default:
>> > +                ea.bytes = 16 << vex.l;
>> > +            }
>> >          }
>> >          if ( ea.type == OP_MEM )
>> >          {
>> > @@ -4468,6 +4492,14 @@ x86_emulate(
>> >              vex.b = 1;
>> >              buf[4] &= 0x38;
>> >          }
>> > +        else if ( b == 0x7e )
>> > +        {
>> > +            /* convert the GPR destination to (%rAX) */
>> > +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>> > +            rex_prefix &= ~REX_B;
>> > +            vex.b = 1;
>> > +            buf[4] &= 0x38;
>> > +        }  
>> 
>> Thankyou for doing this.  However, looking at it, it has some code in
>> common with the "ea.type == OP_MEM" clause.
>> 
>> Would this work?
>> 
>> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
>> b/xen/arch/x86/x86_emulate/x86_emulate.c
>> index fe594ba..90db067 100644
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -4453,16 +4453,25 @@ x86_emulate(
>>              get_fpu(X86EMUL_FPU_ymm, &fic);
>>              ea.bytes = 16 << vex.l;
>>          }
>> -        if ( ea.type == OP_MEM )
>> +        if ( ea.type == OP_MEM || ea.type == OP_REG )
>>          {
>> -            /* XXX enable once there is ops->ea() or equivalent
>> -            generate_exception_if((vex.pfx == vex_66) &&
>> -                                  (ops->ea(ea.mem.seg, ea.mem.off)
>> -                                   & (ea.bytes - 1)), EXC_GP, 0); */
>> -            if ( b == 0x6f )
>> -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> -                               ea.bytes, ctxt);
>>              /* convert memory operand to (%rAX) */
>> +
>> +            if ( ea.type == OP_MEM)
>> +            {
>> +                /* XXX enable once there is ops->ea() or equivalent
>> +                   generate_exception_if((vex.pfx == vex_66) &&
>> +                   (ops->ea(ea.mem.seg, ea.mem.off)
>> +                   & (ea.bytes - 1)), EXC_GP, 0); */
>> +                if ( b == 0x6f )
>> +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> +                                   ea.bytes, ctxt);
>> +            }
>> +            else if ( ea.type == OP_REG )
>> +            {
>> +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>> +            }
>> +
>>              rex_prefix &= ~REX_B;
>>              vex.b = 1;
>>              buf[4] &= 0x38;
>> 
>> 
>> This is untested, but avoids duplicating this bit of state maniupulation.
> 
> Your suggestion makes sense, but I'm starting to doubt my initial
> patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> GPR-handling route and I can't seem to be able to easily prevent it
> with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> to take a harder look at how that class of instructions is coded.

You obviously need to distinguish the two kinds of register sources/
destinations: GPRs need suitable re-writing of the instruction (without
having looked at the most recent version of the patch yet I btw doubt
converting register to memory operands is the most efficient model),
while MMs, XMMs, and YMMs can retain their register encoding.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-01  2:52 [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Mihai Donțu
                   ` (2 preceding siblings ...)
  2016-08-01  9:18 ` [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Andrew Cooper
@ 2016-08-01 13:19 ` Jan Beulich
  2016-08-01 13:25   ` Mihai Donțu
  2016-08-01 23:19   ` Mihai Donțu
  3 siblings, 2 replies; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 13:19 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, xen-devel

>>> On 01.08.16 at 04:52, <mdontu@bitdefender.com> wrote:
> @@ -4412,6 +4412,7 @@ x86_emulate(
>      case 0x7f: /* movq mm,mm/m64 */
>                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>                 /* vmovdq{a,u} ymm,ymm/m256 */
> +    case 0xd6: /* {,v}movq xmm,xmm/m64 */
>      {
>          uint8_t *buf = get_stub(stub);
>          struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
> @@ -4429,9 +4430,9 @@ x86_emulate(
>              case vex_66:
>              case vex_f3:
>                  host_and_vcpu_must_have(sse2);
> -                buf[0] = 0x66; /* movdqa */
> +                buf[0] = 0x66; /* SSE */

The comment change here indicates a problem: So far it was indicating
that despite the possible F3 prefix (movdqu) we encode a 66 one
(movdqa). Opcode D6 prefixed with F3, however, is movq2dq, which
you then either don't emulate correctly, or if it happens to be
emulated correctly you should include in the comment accompanying
the case label. And its AVX counterpart should then produce #UD.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-01 13:19 ` Jan Beulich
@ 2016-08-01 13:25   ` Mihai Donțu
  2016-08-01 23:19   ` Mihai Donțu
  1 sibling, 0 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 13:25 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel

On Monday 01 August 2016 07:19:51 Jan Beulich wrote:
> >>> On 01.08.16 at 04:52, <mdontu@bitdefender.com> wrote:  
> > @@ -4412,6 +4412,7 @@ x86_emulate(
> >      case 0x7f: /* movq mm,mm/m64 */
> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> >                 /* vmovdq{a,u} ymm,ymm/m256 */
> > +    case 0xd6: /* {,v}movq xmm,xmm/m64 */
> >      {
> >          uint8_t *buf = get_stub(stub);
> >          struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
> > @@ -4429,9 +4430,9 @@ x86_emulate(
> >              case vex_66:
> >              case vex_f3:
> >                  host_and_vcpu_must_have(sse2);
> > -                buf[0] = 0x66; /* movdqa */
> > +                buf[0] = 0x66; /* SSE */  
> 
> The comment change here indicates a problem: So far it was indicating
> that despite the possible F3 prefix (movdqu) we encode a 66 one
> (movdqa). Opcode D6 prefixed with F3, however, is movq2dq, which
> you then either don't emulate correctly, or if it happens to be
> emulated correctly you should include in the comment accompanying
> the case label. And its AVX counterpart should then produce #UD.

I see. Thanks for the hint. I'll try to write a test case and see if it
"just works" or needs extra handling.

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 12:59       ` Jan Beulich
@ 2016-08-01 13:28         ` Mihai Donțu
  2016-08-01 13:43           ` Jan Beulich
  0 siblings, 1 reply; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 13:28 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Zhi Wang, xen-devel

On Monday 01 August 2016 06:59:08 Jan Beulich wrote:
> >>> On 01.08.16 at 14:53, <mdontu@bitdefender.com> wrote:  
> > On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:  
> >> On 01/08/16 03:52, Mihai Donțu wrote:  
> >> > Found that Windows driver was using a SSE2 instruction MOVD.
> >> >
> >> > Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
> >> > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
> >> > ---
> >> > Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
> >> >
> >> > Changed since v2:
> >> >  * handle the case where the destination is a GPR
> >> > ---
> >> >  xen/arch/x86/x86_emulate/x86_emulate.c | 38   
> > +++++++++++++++++++++++++++++++---  
> >> >  1 file changed, 35 insertions(+), 3 deletions(-)
> >> >
> >> > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c   
> > b/xen/arch/x86/x86_emulate/x86_emulate.c  
> >> > index 44de3b6..9f89ada 100644
> >> > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> >> > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> >> > @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
> >> >      /* 0x60 - 0x6F */
> >> >      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> >> >      /* 0x70 - 0x7F */
> >> > -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> >> > +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,   
> > ImplicitOps|ModRM,  
> >> >      /* 0x80 - 0x87 */
> >> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> >> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
> >> > @@ -4409,6 +4409,10 @@ x86_emulate(
> >> >      case 0x6f: /* movq mm/m64,mm */
> >> >                 /* {,v}movdq{a,u} xmm/m128,xmm */
> >> >                 /* vmovdq{a,u} ymm/m256,ymm */
> >> > +    case 0x7e: /* movd mm,r/m32 */
> >> > +               /* movq mm,r/m64 */
> >> > +               /* {,v}movd xmm,r/m32 */
> >> > +               /* {,v}movq xmm,r/m64 */
> >> >      case 0x7f: /* movq mm,mm/m64 */
> >> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> >> >                 /* vmovdq{a,u} ymm,ymm/m256 */
> >> > @@ -4432,7 +4436,17 @@ x86_emulate(
> >> >                  host_and_vcpu_must_have(sse2);
> >> >                  buf[0] = 0x66; /* SSE */
> >> >                  get_fpu(X86EMUL_FPU_xmm, &fic);
> >> > -                ea.bytes = (b == 0xd6 ? 8 : 16);
> >> > +                switch ( b )
> >> > +                {
> >> > +                case 0x7e:
> >> > +                    ea.bytes = 4;
> >> > +                    break;
> >> > +                case 0xd6:
> >> > +                    ea.bytes = 8;
> >> > +                    break;
> >> > +                default:
> >> > +                    ea.bytes = 16;
> >> > +                }
> >> >                  break;
> >> >              case vex_none:
> >> >                  if ( b != 0xe7 )
> >> > @@ -4452,7 +4466,17 @@ x86_emulate(
> >> >                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
> >> >              host_and_vcpu_must_have(avx);
> >> >              get_fpu(X86EMUL_FPU_ymm, &fic);
> >> > -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
> >> > +            switch ( b )
> >> > +            {
> >> > +            case 0x7e:
> >> > +                ea.bytes = 4;
> >> > +                break;
> >> > +            case 0xd6:
> >> > +                ea.bytes = 8;
> >> > +                break;
> >> > +            default:
> >> > +                ea.bytes = 16 << vex.l;
> >> > +            }
> >> >          }
> >> >          if ( ea.type == OP_MEM )
> >> >          {
> >> > @@ -4468,6 +4492,14 @@ x86_emulate(
> >> >              vex.b = 1;
> >> >              buf[4] &= 0x38;
> >> >          }
> >> > +        else if ( b == 0x7e )
> >> > +        {
> >> > +            /* convert the GPR destination to (%rAX) */
> >> > +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> >> > +            rex_prefix &= ~REX_B;
> >> > +            vex.b = 1;
> >> > +            buf[4] &= 0x38;
> >> > +        }    
> >> 
> >> Thankyou for doing this.  However, looking at it, it has some code in
> >> common with the "ea.type == OP_MEM" clause.
> >> 
> >> Would this work?
> >> 
> >> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
> >> b/xen/arch/x86/x86_emulate/x86_emulate.c
> >> index fe594ba..90db067 100644
> >> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> >> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> >> @@ -4453,16 +4453,25 @@ x86_emulate(
> >>              get_fpu(X86EMUL_FPU_ymm, &fic);
> >>              ea.bytes = 16 << vex.l;
> >>          }
> >> -        if ( ea.type == OP_MEM )
> >> +        if ( ea.type == OP_MEM || ea.type == OP_REG )
> >>          {
> >> -            /* XXX enable once there is ops->ea() or equivalent
> >> -            generate_exception_if((vex.pfx == vex_66) &&
> >> -                                  (ops->ea(ea.mem.seg, ea.mem.off)
> >> -                                   & (ea.bytes - 1)), EXC_GP, 0); */
> >> -            if ( b == 0x6f )
> >> -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> >> -                               ea.bytes, ctxt);
> >>              /* convert memory operand to (%rAX) */
> >> +
> >> +            if ( ea.type == OP_MEM)
> >> +            {
> >> +                /* XXX enable once there is ops->ea() or equivalent
> >> +                   generate_exception_if((vex.pfx == vex_66) &&
> >> +                   (ops->ea(ea.mem.seg, ea.mem.off)
> >> +                   & (ea.bytes - 1)), EXC_GP, 0); */
> >> +                if ( b == 0x6f )
> >> +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
> >> +                                   ea.bytes, ctxt);
> >> +            }
> >> +            else if ( ea.type == OP_REG )
> >> +            {
> >> +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
> >> +            }
> >> +
> >>              rex_prefix &= ~REX_B;
> >>              vex.b = 1;
> >>              buf[4] &= 0x38;
> >> 
> >> 
> >> This is untested, but avoids duplicating this bit of state maniupulation.  
> > 
> > Your suggestion makes sense, but I'm starting to doubt my initial
> > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> > GPR-handling route and I can't seem to be able to easily prevent it
> > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> > to take a harder look at how that class of instructions is coded.  
> 
> You obviously need to distinguish the two kinds of register sources/
> destinations: GPRs need suitable re-writing of the instruction (without
> having looked at the most recent version of the patch yet I btw doubt
> converting register to memory operands is the most efficient model),
> while MMs, XMMs, and YMMs can retain their register encoding.

Regarding efficiency, I'm not married with the approach I've proposed.
If you can give me a few more hints, I can give it a try.

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
  2016-08-01  9:52   ` Andrew Cooper
@ 2016-08-01 13:38   ` Jan Beulich
  1 sibling, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 13:38 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, Zhi Wang, xen-devel

>>> On 01.08.16 at 04:52, <mdontu@bitdefender.com> wrote:
> Found that Windows driver was using a SSE2 instruction MOVD.

Nevertheless the title shouldn't say "SSE2", as you also add AVX ones.
Nor are you limiting things to MM register sources. Hence I think the
title needs changing.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
>      /* 0x60 - 0x6F */
>      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>      /* 0x70 - 0x7F */
> -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
> +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, ImplicitOps|ModRM,

What about the moves in the other direction (opcode 6E)?

> @@ -4409,6 +4409,10 @@ x86_emulate(
>      case 0x6f: /* movq mm/m64,mm */
>                 /* {,v}movdq{a,u} xmm/m128,xmm */
>                 /* vmovdq{a,u} ymm/m256,ymm */
> +    case 0x7e: /* movd mm,r/m32 */
> +               /* movq mm,r/m64 */
> +               /* {,v}movd xmm,r/m32 */
> +               /* {,v}movq xmm,r/m64 */

Same problem as in the other patch - 7E prefixed by F3 has yet
another meaning ({,v}movq xmm,xmm/m64).

> @@ -4432,7 +4436,17 @@ x86_emulate(
>                  host_and_vcpu_must_have(sse2);
>                  buf[0] = 0x66; /* SSE */
>                  get_fpu(X86EMUL_FPU_xmm, &fic);
> -                ea.bytes = (b == 0xd6 ? 8 : 16);
> +                switch ( b )
> +                {
> +                case 0x7e:
> +                    ea.bytes = 4;
> +                    break;
> +                case 0xd6:
> +                    ea.bytes = 8;
> +                    break;
> +                default:
> +                    ea.bytes = 16;
> +                }

Please don't omit the final break statement (also below).

> @@ -4452,7 +4466,17 @@ x86_emulate(
>                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>              host_and_vcpu_must_have(avx);
>              get_fpu(X86EMUL_FPU_ymm, &fic);
> -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
> +            switch ( b )
> +            {
> +            case 0x7e:
> +                ea.bytes = 4;
> +                break;
> +            case 0xd6:
> +                ea.bytes = 8;
> +                break;
> +            default:
> +                ea.bytes = 16 << vex.l;
> +            }
>          }
>          if ( ea.type == OP_MEM )
>          {
> @@ -4468,6 +4492,14 @@ x86_emulate(
>              vex.b = 1;
>              buf[4] &= 0x38;
>          }
> +        else if ( b == 0x7e )
> +        {
> +            /* convert the GPR destination to (%rAX) */
> +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;

Hmm, you write the original register value to the memory location
you want to use, but the instruction means to _write_ to the
register. Nor would this mechanism, even if used properly, clear the
high half of the GPR for a 32-bit destination.

Also - comment style (admittedly the pre-existing nearby one is
wrong too).

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 13:28         ` Mihai Donțu
@ 2016-08-01 13:43           ` Jan Beulich
  2016-08-01 14:48             ` Mihai Donțu
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 13:43 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, Zhi Wang, xen-devel

>>> On 01.08.16 at 15:28, <mdontu@bitdefender.com> wrote:
> On Monday 01 August 2016 06:59:08 Jan Beulich wrote:
>> >>> On 01.08.16 at 14:53, <mdontu@bitdefender.com> wrote:  
>> > On Monday 01 August 2016 10:52:12 Andrew Cooper wrote:  
>> >> On 01/08/16 03:52, Mihai Donțu wrote:  
>> >> > Found that Windows driver was using a SSE2 instruction MOVD.
>> >> >
>> >> > Signed-off-by: Zhi Wang <zhi.a.wang@intel.com>
>> >> > Signed-off-by: Mihai Donțu <mdontu@bitdefender.com>
>> >> > ---
>> >> > Picked from the XenServer 7 patch queue, as suggested by Andrew Cooper
>> >> >
>> >> > Changed since v2:
>> >> >  * handle the case where the destination is a GPR
>> >> > ---
>> >> >  xen/arch/x86/x86_emulate/x86_emulate.c | 38   
>> > +++++++++++++++++++++++++++++++---  
>> >> >  1 file changed, 35 insertions(+), 3 deletions(-)
>> >> >
>> >> > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c   
>> > b/xen/arch/x86/x86_emulate/x86_emulate.c  
>> >> > index 44de3b6..9f89ada 100644
>> >> > --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> > @@ -204,7 +204,7 @@ static uint8_t twobyte_table[256] = {
>> >> >      /* 0x60 - 0x6F */
>> >> >      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>> >> >      /* 0x70 - 0x7F */
>> >> > -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,
>> >> > +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM,   
>> > ImplicitOps|ModRM,  
>> >> >      /* 0x80 - 0x87 */
>> >> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>> >> >      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
>> >> > @@ -4409,6 +4409,10 @@ x86_emulate(
>> >> >      case 0x6f: /* movq mm/m64,mm */
>> >> >                 /* {,v}movdq{a,u} xmm/m128,xmm */
>> >> >                 /* vmovdq{a,u} ymm/m256,ymm */
>> >> > +    case 0x7e: /* movd mm,r/m32 */
>> >> > +               /* movq mm,r/m64 */
>> >> > +               /* {,v}movd xmm,r/m32 */
>> >> > +               /* {,v}movq xmm,r/m64 */
>> >> >      case 0x7f: /* movq mm,mm/m64 */
>> >> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>> >> >                 /* vmovdq{a,u} ymm,ymm/m256 */
>> >> > @@ -4432,7 +4436,17 @@ x86_emulate(
>> >> >                  host_and_vcpu_must_have(sse2);
>> >> >                  buf[0] = 0x66; /* SSE */
>> >> >                  get_fpu(X86EMUL_FPU_xmm, &fic);
>> >> > -                ea.bytes = (b == 0xd6 ? 8 : 16);
>> >> > +                switch ( b )
>> >> > +                {
>> >> > +                case 0x7e:
>> >> > +                    ea.bytes = 4;
>> >> > +                    break;
>> >> > +                case 0xd6:
>> >> > +                    ea.bytes = 8;
>> >> > +                    break;
>> >> > +                default:
>> >> > +                    ea.bytes = 16;
>> >> > +                }
>> >> >                  break;
>> >> >              case vex_none:
>> >> >                  if ( b != 0xe7 )
>> >> > @@ -4452,7 +4466,17 @@ x86_emulate(
>> >> >                      ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
>> >> >              host_and_vcpu_must_have(avx);
>> >> >              get_fpu(X86EMUL_FPU_ymm, &fic);
>> >> > -            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
>> >> > +            switch ( b )
>> >> > +            {
>> >> > +            case 0x7e:
>> >> > +                ea.bytes = 4;
>> >> > +                break;
>> >> > +            case 0xd6:
>> >> > +                ea.bytes = 8;
>> >> > +                break;
>> >> > +            default:
>> >> > +                ea.bytes = 16 << vex.l;
>> >> > +            }
>> >> >          }
>> >> >          if ( ea.type == OP_MEM )
>> >> >          {
>> >> > @@ -4468,6 +4492,14 @@ x86_emulate(
>> >> >              vex.b = 1;
>> >> >              buf[4] &= 0x38;
>> >> >          }
>> >> > +        else if ( b == 0x7e )
>> >> > +        {
>> >> > +            /* convert the GPR destination to (%rAX) */
>> >> > +            *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>> >> > +            rex_prefix &= ~REX_B;
>> >> > +            vex.b = 1;
>> >> > +            buf[4] &= 0x38;
>> >> > +        }    
>> >> 
>> >> Thankyou for doing this.  However, looking at it, it has some code in
>> >> common with the "ea.type == OP_MEM" clause.
>> >> 
>> >> Would this work?
>> >> 
>> >> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> b/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> index fe594ba..90db067 100644
>> >> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> >> @@ -4453,16 +4453,25 @@ x86_emulate(
>> >>              get_fpu(X86EMUL_FPU_ymm, &fic);
>> >>              ea.bytes = 16 << vex.l;
>> >>          }
>> >> -        if ( ea.type == OP_MEM )
>> >> +        if ( ea.type == OP_MEM || ea.type == OP_REG )
>> >>          {
>> >> -            /* XXX enable once there is ops->ea() or equivalent
>> >> -            generate_exception_if((vex.pfx == vex_66) &&
>> >> -                                  (ops->ea(ea.mem.seg, ea.mem.off)
>> >> -                                   & (ea.bytes - 1)), EXC_GP, 0); */
>> >> -            if ( b == 0x6f )
>> >> -                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> >> -                               ea.bytes, ctxt);
>> >>              /* convert memory operand to (%rAX) */
>> >> +
>> >> +            if ( ea.type == OP_MEM)
>> >> +            {
>> >> +                /* XXX enable once there is ops->ea() or equivalent
>> >> +                   generate_exception_if((vex.pfx == vex_66) &&
>> >> +                   (ops->ea(ea.mem.seg, ea.mem.off)
>> >> +                   & (ea.bytes - 1)), EXC_GP, 0); */
>> >> +                if ( b == 0x6f )
>> >> +                    rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
>> >> +                                   ea.bytes, ctxt);
>> >> +            }
>> >> +            else if ( ea.type == OP_REG )
>> >> +            {
>> >> +                *((unsigned long *)&mmvalp) = (unsigned long)ea.reg;
>> >> +            }
>> >> +
>> >>              rex_prefix &= ~REX_B;
>> >>              vex.b = 1;
>> >>              buf[4] &= 0x38;
>> >> 
>> >> 
>> >> This is untested, but avoids duplicating this bit of state maniupulation.  
>> > 
>> > Your suggestion makes sense, but I'm starting to doubt my initial
>> > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
>> > GPR-handling route and I can't seem to be able to easily prevent it
>> > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
>> > to take a harder look at how that class of instructions is coded.  
>> 
>> You obviously need to distinguish the two kinds of register sources/
>> destinations: GPRs need suitable re-writing of the instruction (without
>> having looked at the most recent version of the patch yet I btw doubt
>> converting register to memory operands is the most efficient model),
>> while MMs, XMMs, and YMMs can retain their register encoding.
> 
> Regarding efficiency, I'm not married with the approach I've proposed.
> If you can give me a few more hints, I can give it a try.

I'd rather pick a fixed register and update the regs->... field from that
after the stub was executed. E.g. using rAX and treating it just like a
return value of the "call". But maybe I'm imagining this easier than it
really is; as an alternative I'd then suggest really following what Andrew
said - use a pointer into regs->, not mmvalp. But (as said in the review
mail) you'd then have the problem of the missing zero-extension for
writes to 32-bit GPRs

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 13:43           ` Jan Beulich
@ 2016-08-01 14:48             ` Mihai Donțu
  2016-08-01 14:53               ` Andrew Cooper
                                 ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 14:48 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Zhi Wang, xen-devel

On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
> > > > Your suggestion makes sense, but I'm starting to doubt my initial
> > > > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> > > > GPR-handling route and I can't seem to be able to easily prevent it
> > > > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> > > > to take a harder look at how that class of instructions is coded.    
> > > 
> > > You obviously need to distinguish the two kinds of register sources/
> > > destinations: GPRs need suitable re-writing of the instruction (without
> > > having looked at the most recent version of the patch yet I btw doubt
> > > converting register to memory operands is the most efficient model),
> > > while MMs, XMMs, and YMMs can retain their register encoding.  
> > 
> > Regarding efficiency, I'm not married with the approach I've proposed.
> > If you can give me a few more hints, I can give it a try.  
> 
> I'd rather pick a fixed register and update the regs->... field from that
> after the stub was executed. E.g. using rAX and treating it just like a
> return value of the "call". But maybe I'm imagining this easier than it
> really is; as an alternative I'd then suggest really following what Andrew
> said - use a pointer into regs->, not mmvalp. But (as said in the review
> mail) you'd then have the problem of the missing zero-extension for
> writes to 32-bit GPRs

I thought that by re-using (hijacking, really) mmvalp, the patch will
look less intrusive and thus not add too much to an already complex
code.

Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
idea to just zero-out the 64bit register before that? It does not
appear to be any instructions that write just the low dword. Or am I
misunderstanding the zero-extension concept?

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:48             ` Mihai Donțu
@ 2016-08-01 14:53               ` Andrew Cooper
  2016-08-01 15:10                 ` Mihai Donțu
  2016-08-01 14:55               ` Mihai Donțu
  2016-08-01 14:56               ` Jan Beulich
  2 siblings, 1 reply; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01 14:53 UTC (permalink / raw)
  To: Mihai Donțu, Jan Beulich; +Cc: Zhi Wang, xen-devel

On 01/08/16 15:48, Mihai Donțu wrote:
> On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
>>>>> Your suggestion makes sense, but I'm starting to doubt my initial
>>>>> patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
>>>>> GPR-handling route and I can't seem to be able to easily prevent it
>>>>> with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
>>>>> to take a harder look at how that class of instructions is coded.    
>>>> You obviously need to distinguish the two kinds of register sources/
>>>> destinations: GPRs need suitable re-writing of the instruction (without
>>>> having looked at the most recent version of the patch yet I btw doubt
>>>> converting register to memory operands is the most efficient model),
>>>> while MMs, XMMs, and YMMs can retain their register encoding.  
>>> Regarding efficiency, I'm not married with the approach I've proposed.
>>> If you can give me a few more hints, I can give it a try.  
>> I'd rather pick a fixed register and update the regs->... field from that
>> after the stub was executed. E.g. using rAX and treating it just like a
>> return value of the "call". But maybe I'm imagining this easier than it
>> really is; as an alternative I'd then suggest really following what Andrew
>> said - use a pointer into regs->, not mmvalp. But (as said in the review
>> mail) you'd then have the problem of the missing zero-extension for
>> writes to 32-bit GPRs
> I thought that by re-using (hijacking, really) mmvalp, the patch will
> look less intrusive and thus not add too much to an already complex
> code.
>
> Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
> idea to just zero-out the 64bit register before that? It does not
> appear to be any instructions that write just the low dword. Or am I
> misunderstanding the zero-extension concept?

Any write to a 32bit GPR zero extends it to 64 bits.  This is specified
by AMD64 and only applies to 32bit writes.  8 and 16 bit writes leave
the upper bits alone.

Therefore movd %mm, %eax will move 32bits from %mm to eax, then zero
extend the upper 32 bits of %rax.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:48             ` Mihai Donțu
  2016-08-01 14:53               ` Andrew Cooper
@ 2016-08-01 14:55               ` Mihai Donțu
  2016-08-01 14:59                 ` Jan Beulich
  2016-08-01 14:56               ` Jan Beulich
  2 siblings, 1 reply; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 14:55 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Zhi Wang, xen-devel

On Monday 01 August 2016 17:48:33 Mihai Donțu wrote:
> On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
> > > > > Your suggestion makes sense, but I'm starting to doubt my initial
> > > > > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
> > > > > GPR-handling route and I can't seem to be able to easily prevent it
> > > > > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
> > > > > to take a harder look at how that class of instructions is coded.      
> > > > 
> > > > You obviously need to distinguish the two kinds of register sources/
> > > > destinations: GPRs need suitable re-writing of the instruction (without
> > > > having looked at the most recent version of the patch yet I btw doubt
> > > > converting register to memory operands is the most efficient model),
> > > > while MMs, XMMs, and YMMs can retain their register encoding.    
> > > 
> > > Regarding efficiency, I'm not married with the approach I've proposed.
> > > If you can give me a few more hints, I can give it a try.    
> > 
> > I'd rather pick a fixed register and update the regs->... field from that
> > after the stub was executed. E.g. using rAX and treating it just like a
> > return value of the "call". But maybe I'm imagining this easier than it
> > really is; as an alternative I'd then suggest really following what Andrew
> > said - use a pointer into regs->, not mmvalp. But (as said in the review
> > mail) you'd then have the problem of the missing zero-extension for
> > writes to 32-bit GPRs  
> 
> I thought that by re-using (hijacking, really) mmvalp, the patch will
> look less intrusive and thus not add too much to an already complex
> code.
> 
> Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
> idea to just zero-out the 64bit register before that? It does not
> appear to be any instructions that write just the low dword. Or am I
> misunderstanding the zero-extension concept?
> 

Just to be sure I'm making myself understood, ea.reg contains the
output of decode_register() which, in turn, returns a pointer in regs.

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:48             ` Mihai Donțu
  2016-08-01 14:53               ` Andrew Cooper
  2016-08-01 14:55               ` Mihai Donțu
@ 2016-08-01 14:56               ` Jan Beulich
  2 siblings, 0 replies; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 14:56 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, Zhi Wang, xen-devel

>>> On 01.08.16 at 16:48, <mdontu@bitdefender.com> wrote:
> On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
>> > > > Your suggestion makes sense, but I'm starting to doubt my initial
>> > > > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
>> > > > GPR-handling route and I can't seem to be able to easily prevent it
>> > > > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
>> > > > to take a harder look at how that class of instructions is coded.    
>> > > 
>> > > You obviously need to distinguish the two kinds of register sources/
>> > > destinations: GPRs need suitable re-writing of the instruction (without
>> > > having looked at the most recent version of the patch yet I btw doubt
>> > > converting register to memory operands is the most efficient model),
>> > > while MMs, XMMs, and YMMs can retain their register encoding.  
>> > 
>> > Regarding efficiency, I'm not married with the approach I've proposed.
>> > If you can give me a few more hints, I can give it a try.  
>> 
>> I'd rather pick a fixed register and update the regs->... field from that
>> after the stub was executed. E.g. using rAX and treating it just like a
>> return value of the "call". But maybe I'm imagining this easier than it
>> really is; as an alternative I'd then suggest really following what Andrew
>> said - use a pointer into regs->, not mmvalp. But (as said in the review
>> mail) you'd then have the problem of the missing zero-extension for
>> writes to 32-bit GPRs
> 
> I thought that by re-using (hijacking, really) mmvalp, the patch will
> look less intrusive and thus not add too much to an already complex
> code.

Well, if that was really all it takes, I would agree. But as said in the
other sub-thread, I don't think it's as simple.

> Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
> idea to just zero-out the 64bit register before that? It does not
> appear to be any instructions that write just the low dword. Or am I
> misunderstanding the zero-extension concept?

That's an option, as long as there's no fault delivery possible past
the point where you do that zeroing. Yet I'm afraid there might be
such a path; you'll need to check carefully.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:55               ` Mihai Donțu
@ 2016-08-01 14:59                 ` Jan Beulich
  2016-08-01 15:01                   ` Andrew Cooper
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2016-08-01 14:59 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, Zhi Wang, xen-devel

>>> On 01.08.16 at 16:55, <mdontu@bitdefender.com> wrote:
> On Monday 01 August 2016 17:48:33 Mihai Donțu wrote:
>> On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
>> > > > > Your suggestion makes sense, but I'm starting to doubt my initial
>> > > > > patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
>> > > > > GPR-handling route and I can't seem to be able to easily prevent it
>> > > > > with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
>> > > > > to take a harder look at how that class of instructions is coded.      
>> > > > 
>> > > > You obviously need to distinguish the two kinds of register sources/
>> > > > destinations: GPRs need suitable re-writing of the instruction (without
>> > > > having looked at the most recent version of the patch yet I btw doubt
>> > > > converting register to memory operands is the most efficient model),
>> > > > while MMs, XMMs, and YMMs can retain their register encoding.    
>> > > 
>> > > Regarding efficiency, I'm not married with the approach I've proposed.
>> > > If you can give me a few more hints, I can give it a try.    
>> > 
>> > I'd rather pick a fixed register and update the regs->... field from that
>> > after the stub was executed. E.g. using rAX and treating it just like a
>> > return value of the "call". But maybe I'm imagining this easier than it
>> > really is; as an alternative I'd then suggest really following what Andrew
>> > said - use a pointer into regs->, not mmvalp. But (as said in the review
>> > mail) you'd then have the problem of the missing zero-extension for
>> > writes to 32-bit GPRs  
>> 
>> I thought that by re-using (hijacking, really) mmvalp, the patch will
>> look less intrusive and thus not add too much to an already complex
>> code.
>> 
>> Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
>> idea to just zero-out the 64bit register before that? It does not
>> appear to be any instructions that write just the low dword. Or am I
>> misunderstanding the zero-extension concept?
> 
> Just to be sure I'm making myself understood, ea.reg contains the
> output of decode_register() which, in turn, returns a pointer in regs.

Hmm, now that you say that maybe I got confused be the expression
you used: If, rather than doing a cast on the address of mmvalp, you
could get away with a simply assignment (and maybe a cast on the
rvalue), then I suppose your code might not be as wrong as it seemed
to me.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:59                 ` Jan Beulich
@ 2016-08-01 15:01                   ` Andrew Cooper
  0 siblings, 0 replies; 26+ messages in thread
From: Andrew Cooper @ 2016-08-01 15:01 UTC (permalink / raw)
  To: Jan Beulich, Mihai Donțu; +Cc: Zhi Wang, xen-devel

On 01/08/16 15:59, Jan Beulich wrote:
>>>> On 01.08.16 at 16:55, <mdontu@bitdefender.com> wrote:
>> On Monday 01 August 2016 17:48:33 Mihai Donțu wrote:
>>> On Monday 01 August 2016 07:43:20 Jan Beulich wrote:
>>>>>>> Your suggestion makes sense, but I'm starting to doubt my initial
>>>>>>> patch. :-) I'm testing "movq xmm1, xmm1" and noticing that it takes the
>>>>>>> GPR-handling route and I can't seem to be able to easily prevent it
>>>>>>> with !(rex_prefix & REX_B), as rex_prefix == 0 and vex.b == 1. I need
>>>>>>> to take a harder look at how that class of instructions is coded.      
>>>>>> You obviously need to distinguish the two kinds of register sources/
>>>>>> destinations: GPRs need suitable re-writing of the instruction (without
>>>>>> having looked at the most recent version of the patch yet I btw doubt
>>>>>> converting register to memory operands is the most efficient model),
>>>>>> while MMs, XMMs, and YMMs can retain their register encoding.    
>>>>> Regarding efficiency, I'm not married with the approach I've proposed.
>>>>> If you can give me a few more hints, I can give it a try.    
>>>> I'd rather pick a fixed register and update the regs->... field from that
>>>> after the stub was executed. E.g. using rAX and treating it just like a
>>>> return value of the "call". But maybe I'm imagining this easier than it
>>>> really is; as an alternative I'd then suggest really following what Andrew
>>>> said - use a pointer into regs->, not mmvalp. But (as said in the review
>>>> mail) you'd then have the problem of the missing zero-extension for
>>>> writes to 32-bit GPRs  
>>> I thought that by re-using (hijacking, really) mmvalp, the patch will
>>> look less intrusive and thus not add too much to an already complex
>>> code.
>>>
>>> Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
>>> idea to just zero-out the 64bit register before that? It does not
>>> appear to be any instructions that write just the low dword. Or am I
>>> misunderstanding the zero-extension concept?
>> Just to be sure I'm making myself understood, ea.reg contains the
>> output of decode_register() which, in turn, returns a pointer in regs.
> Hmm, now that you say that maybe I got confused be the expression
> you used: If, rather than doing a cast on the address of mmvalp, you
> could get away with a simply assignment (and maybe a cast on the
> rvalue), then I suppose your code might not be as wrong as it seemed
> to me.

I had to spend quite a long time convincing myself that it was right.

The code would be easier if mmvalp was a plain void* rather than const
mmval_t *.

~Andrew

>
> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64
  2016-08-01 14:53               ` Andrew Cooper
@ 2016-08-01 15:10                 ` Mihai Donțu
  0 siblings, 0 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 15:10 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Zhi Wang, Jan Beulich, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 1580 bytes --]

On Monday 01 August 2016 15:53:56 Andrew Cooper wrote:
> On 01/08/16 15:48, Mihai Donțu wrote:
> > > I'd rather pick a fixed register and update the regs->... field from that
> > > after the stub was executed. E.g. using rAX and treating it just like a
> > > return value of the "call". But maybe I'm imagining this easier than it
> > > really is; as an alternative I'd then suggest really following what Andrew
> > > said - use a pointer into regs->, not mmvalp. But (as said in the review
> > > mail) you'd then have the problem of the missing zero-extension for
> > > writes to 32-bit GPRs
> > 
> > I thought that by re-using (hijacking, really) mmvalp, the patch will
> > look less intrusive and thus not add too much to an already complex
> > code.
> >
> > Assuming I'll just pass to the stub "a"(ea.reg), would it be a good
> > idea to just zero-out the 64bit register before that? It does not
> > appear to be any instructions that write just the low dword. Or am I
> > misunderstanding the zero-extension concept?  
> 
> Any write to a 32bit GPR zero extends it to 64 bits.  This is specified
> by AMD64 and only applies to 32bit writes.  8 and 16 bit writes leave
> the upper bits alone.
> 
> Therefore movd %mm, %eax will move 32bits from %mm to eax, then zero
> extend the upper 32 bits of %rax.

... and given that the stub has (%rAX) as destination, the
zero-extension is not implicit and I have to do it myself. Which also
means two of my tests in a previous patch are wrong (0xbdbdbdbdffffffff
checks). Darn! :-)

-- 
Mihai DONȚU

[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-01 13:19 ` Jan Beulich
  2016-08-01 13:25   ` Mihai Donțu
@ 2016-08-01 23:19   ` Mihai Donțu
  2016-08-02  6:19     ` Jan Beulich
  1 sibling, 1 reply; 26+ messages in thread
From: Mihai Donțu @ 2016-08-01 23:19 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel

On Monday 01 August 2016 07:19:51 Jan Beulich wrote:
> >>> On 01.08.16 at 04:52, <mdontu@bitdefender.com> wrote:  
> > @@ -4412,6 +4412,7 @@ x86_emulate(
> >      case 0x7f: /* movq mm,mm/m64 */
> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> >                 /* vmovdq{a,u} ymm,ymm/m256 */
> > +    case 0xd6: /* {,v}movq xmm,xmm/m64 */
> >      {
> >          uint8_t *buf = get_stub(stub);
> >          struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
> > @@ -4429,9 +4430,9 @@ x86_emulate(
> >              case vex_66:
> >              case vex_f3:
> >                  host_and_vcpu_must_have(sse2);
> > -                buf[0] = 0x66; /* movdqa */
> > +                buf[0] = 0x66; /* SSE */  
> 
> The comment change here indicates a problem: So far it was indicating
> that despite the possible F3 prefix (movdqu) we encode a 66 one
> (movdqa). Opcode D6 prefixed with F3, however, is movq2dq, which
> you then either don't emulate correctly, or if it happens to be
> emulated correctly you should include in the comment accompanying
> the case label. And its AVX counterpart should then produce #UD.

I fiddled with this for a while and the attached patch (adjusted)
appears to be doing the right thing: ie. movq2dq gets emulated
correctly too. copy_REX_VEX() does not work OK with movq2dq, but it
looked easy to single out this case.

All tests pass, including for {,v}movq xmm/m64 and movq2dq. There does
not appear to be an AVX variant for the latter, or I'm not reading the
Intel SDM right (or binutils' as is lying to me).

diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index fe594ba..d6c199b 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -245,7 +245,7 @@ static uint8_t twobyte_table[256] = {
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xDF */
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0xE0 - 0xEF */
     0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0xF0 - 0xFF */
@@ -4412,6 +4412,8 @@ x86_emulate(
     case 0x7f: /* movq mm,mm/m64 */
                /* {,v}movdq{a,u} xmm,xmm/m128 */
                /* vmovdq{a,u} ymm,ymm/m256 */
+    case 0xd6: /* {,v}movq xmm,xmm/m64 */
+               /* movq2dq mm,xmm */
     {
         uint8_t *buf = get_stub(stub);
         struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
@@ -4431,7 +4433,7 @@ x86_emulate(
                 host_and_vcpu_must_have(sse2);
                 buf[0] = 0x66; /* movdqa */
                 get_fpu(X86EMUL_FPU_xmm, &fic);
-                ea.bytes = 16;
+                ea.bytes = (b == 0xd6 ? 8 : 16);
                 break;
             case vex_none:
                 if ( b != 0xe7 )
@@ -4451,7 +4453,7 @@ x86_emulate(
                     ((vex.pfx != vex_66) && (vex.pfx != vex_f3)));
             host_and_vcpu_must_have(avx);
             get_fpu(X86EMUL_FPU_ymm, &fic);
-            ea.bytes = 16 << vex.l;
+            ea.bytes = (b == 0xd6 ? 8 : (16 << vex.l));
         }
         if ( ea.type == OP_MEM )
         {
@@ -4469,7 +4471,11 @@ x86_emulate(
         }
         if ( !rc )
         {
-           copy_REX_VEX(buf, rex_prefix, vex);
+           /* try to preserve the mandatory prefix for movq2dq */
+           if ( !rex_prefix && vex.opcx == vex_none && vex.pfx == vex_f3 )
+               buf[0] = 0xf3;
+           else
+               copy_REX_VEX(buf, rex_prefix, vex);
            asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
                                      : "memory" );
         }

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-01 23:19   ` Mihai Donțu
@ 2016-08-02  6:19     ` Jan Beulich
  2016-08-02  8:13       ` Mihai Donțu
  0 siblings, 1 reply; 26+ messages in thread
From: Jan Beulich @ 2016-08-02  6:19 UTC (permalink / raw)
  To: Mihai Donțu; +Cc: Andrew Cooper, xen-devel

>>> On 02.08.16 at 01:19, <mdontu@bitdefender.com> wrote:
> On Monday 01 August 2016 07:19:51 Jan Beulich wrote:
>> >>> On 01.08.16 at 04:52, <mdontu@bitdefender.com> wrote:  
>> > @@ -4412,6 +4412,7 @@ x86_emulate(
>> >      case 0x7f: /* movq mm,mm/m64 */
>> >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
>> >                 /* vmovdq{a,u} ymm,ymm/m256 */
>> > +    case 0xd6: /* {,v}movq xmm,xmm/m64 */
>> >      {
>> >          uint8_t *buf = get_stub(stub);
>> >          struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
>> > @@ -4429,9 +4430,9 @@ x86_emulate(
>> >              case vex_66:
>> >              case vex_f3:
>> >                  host_and_vcpu_must_have(sse2);
>> > -                buf[0] = 0x66; /* movdqa */
>> > +                buf[0] = 0x66; /* SSE */  
>> 
>> The comment change here indicates a problem: So far it was indicating
>> that despite the possible F3 prefix (movdqu) we encode a 66 one
>> (movdqa). Opcode D6 prefixed with F3, however, is movq2dq, which
>> you then either don't emulate correctly, or if it happens to be
>> emulated correctly you should include in the comment accompanying
>> the case label. And its AVX counterpart should then produce #UD.
> 
> I fiddled with this for a while and the attached patch (adjusted)
> appears to be doing the right thing: ie. movq2dq gets emulated
> correctly too. copy_REX_VEX() does not work OK with movq2dq, but it
> looked easy to single out this case.

Except that you can't really avoid it (see below). Without you being
more explicit I also can't tell what it is that doesn't work right in that
case.

> All tests pass, including for {,v}movq xmm/m64 and movq2dq. There does
> not appear to be an AVX variant for the latter, or I'm not reading the
> Intel SDM right (or binutils' as is lying to me).

Well, that's what I said earlier on (still visible above), and what you
still fail to deal with.

> @@ -4469,7 +4471,11 @@ x86_emulate(
>          }
>          if ( !rc )
>          {
> -           copy_REX_VEX(buf, rex_prefix, vex);
> +           /* try to preserve the mandatory prefix for movq2dq */
> +           if ( !rex_prefix && vex.opcx == vex_none && vex.pfx == vex_f3 )
> +               buf[0] = 0xf3;
> +           else
> +               copy_REX_VEX(buf, rex_prefix, vex);

So what about a movq2dq having a REX prefix to encode XMM8..15
for one or both of its operands? The writing of the F3 prefix really
needs to go elsewhere - probably the best place is where the 66
one gets written unconditionally right now. And afaict then
copy_REX_VEX() will work fine here too.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64
  2016-08-02  6:19     ` Jan Beulich
@ 2016-08-02  8:13       ` Mihai Donțu
  0 siblings, 0 replies; 26+ messages in thread
From: Mihai Donțu @ 2016-08-02  8:13 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, xen-devel

On Tuesday 02 August 2016 00:19:22 Jan Beulich wrote:
> > > > On 02.08.16 at 01:19, <mdontu@bitdefender.com> wrote:  
> > > > @@ -4412,6 +4412,7 @@ x86_emulate(
> > > >      case 0x7f: /* movq mm,mm/m64 */
> > > >                 /* {,v}movdq{a,u} xmm,xmm/m128 */
> > > >                 /* vmovdq{a,u} ymm,ymm/m256 */
> > > > +    case 0xd6: /* {,v}movq xmm,xmm/m64 */
> > > >      {
> > > >          uint8_t *buf = get_stub(stub);
> > > >          struct fpu_insn_ctxt fic = { .insn_bytes = 5 };
> > > > @@ -4429,9 +4430,9 @@ x86_emulate(
> > > >              case vex_66:
> > > >              case vex_f3:
> > > >                  host_and_vcpu_must_have(sse2);
> > > > -                buf[0] = 0x66; /* movdqa */
> > > > +                buf[0] = 0x66; /* SSE */    
> > > 
> > > The comment change here indicates a problem: So far it was indicating
> > > that despite the possible F3 prefix (movdqu) we encode a 66 one
> > > (movdqa). Opcode D6 prefixed with F3, however, is movq2dq, which
> > > you then either don't emulate correctly, or if it happens to be
> > > emulated correctly you should include in the comment accompanying
> > > the case label. And its AVX counterpart should then produce #UD.  
> > 
> > I fiddled with this for a while and the attached patch (adjusted)
> > appears to be doing the right thing: ie. movq2dq gets emulated
> > correctly too. copy_REX_VEX() does not work OK with movq2dq, but it
> > looked easy to single out this case.  
> 
> Except that you can't really avoid it (see below). Without you being
> more explicit I also can't tell what it is that doesn't work right in that
> case.

Sorry about that. In x86_emulate(), after buf[0] == 0x66 and
copy_REX_VEX():

   f3 0f d6 d1     movq2dq %mm1,%xmm2

becomes:

   66 40 0f d6 d1  rex movq %xmm2,%xmm1

Now that I slept over it, I can see it's not really a problem with
copy_REX_VEX().

> > All tests pass, including for {,v}movq xmm/m64 and movq2dq. There does
> > not appear to be an AVX variant for the latter, or I'm not reading the
> > Intel SDM right (or binutils' as is lying to me).  
> 
> Well, that's what I said earlier on (still visible above), and what you
> still fail to deal with.
> 
> > @@ -4469,7 +4471,11 @@ x86_emulate(
> >          }
> >          if ( !rc )
> >          {
> > -           copy_REX_VEX(buf, rex_prefix, vex);
> > +           /* try to preserve the mandatory prefix for movq2dq */
> > +           if ( !rex_prefix && vex.opcx == vex_none && vex.pfx == vex_f3 )
> > +               buf[0] = 0xf3;
> > +           else
> > +               copy_REX_VEX(buf, rex_prefix, vex);  
> 
> So what about a movq2dq having a REX prefix to encode XMM8..15
> for one or both of its operands? The writing of the F3 prefix really
> needs to go elsewhere - probably the best place is where the 66
> one gets written unconditionally right now. And afaict then
> copy_REX_VEX() will work fine here too.

-- 
Mihai DONȚU

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2016-08-02  8:13 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-01  2:52 [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Mihai Donțu
2016-08-01  2:52 ` [PATCH v3 2/3] x86/emulate: add support of emulating SSE2 instruction {, v}movd mm, r32/m32 and {, v}movq mm, r64 Mihai Donțu
2016-08-01  9:52   ` Andrew Cooper
2016-08-01 12:53     ` Mihai Donțu
2016-08-01 12:56       ` Mihai Donțu
2016-08-01 12:57       ` Andrew Cooper
2016-08-01 12:59       ` Jan Beulich
2016-08-01 13:28         ` Mihai Donțu
2016-08-01 13:43           ` Jan Beulich
2016-08-01 14:48             ` Mihai Donțu
2016-08-01 14:53               ` Andrew Cooper
2016-08-01 15:10                 ` Mihai Donțu
2016-08-01 14:55               ` Mihai Donțu
2016-08-01 14:59                 ` Jan Beulich
2016-08-01 15:01                   ` Andrew Cooper
2016-08-01 14:56               ` Jan Beulich
2016-08-01 13:38   ` Jan Beulich
2016-08-01  2:52 ` [PATCH v3 3/3] x86/emulate: added tests for {, v}movd mm, r32/m32 and {, v}movq xmm, r64/m64 Mihai Donțu
2016-08-01  9:54   ` Andrew Cooper
2016-08-01 12:46     ` Mihai Donțu
2016-08-01  9:18 ` [PATCH v3 1/3] x86/emulate: add support for {, v}movq xmm, xmm/m64 Andrew Cooper
2016-08-01 13:19 ` Jan Beulich
2016-08-01 13:25   ` Mihai Donțu
2016-08-01 23:19   ` Mihai Donțu
2016-08-02  6:19     ` Jan Beulich
2016-08-02  8:13       ` Mihai Donțu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.