All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH v4 12/17] x86emul: support SSE4.1 insns
Date: Tue, 28 Feb 2017 05:56:29 -0700	[thread overview]
Message-ID: <58B5818D020000780013E2F1@prv-mh.provo.novell.com> (raw)
In-Reply-To: <58B57E43020000780013E26B@prv-mh.provo.novell.com>

[-- Attachment #1: Type: text/plain, Size: 18294 bytes --]

... and their AVX equivalents.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Or in ByteOp for {,v}pinsrb instead of assigning it (in
    x86_decode_0f3a()). Correct case label for ptest. Add missing
    copy_REX_VEX() to {,v}ptest handling. Add missing immediate bytes
    to {,v}pextr* etc handling. dppd requires vex.l clear. Use
    consistent approach for stub setup in code paths shared between
    VEX- and non-VEX-encoded insns.
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -219,6 +219,13 @@ enum simd_opsize {
      */
     simd_single_fp,
 
+    /*
+     * Scalar floating point:
+     * - 32 bits with low opcode bit clear (scalar single)
+     * - 64 bits with low opcode bit set (scalar double)
+     */
+    simd_scalar_fp,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -349,21 +356,45 @@ static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x10] = { .simd_size = simd_packed_int },
+    [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+    [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+    [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2b] = { .simd_size = simd_packed_int },
+    [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+    [0x38 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x40] = { .simd_size = simd_packed_int },
+    [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = { .two_op = 1 },
     [0xf1] = { .to_memory = 1, .two_op = 1 },
     [0xf2 ... 0xf3] = {},
     [0xf5 ... 0xf7] = {},
 };
 
+/* Shift values between src and dst sizes of pmov{s,z}x{b,w,d}{w,d,q}. */
+static const uint8_t pmov_convert_delta[] = { 1, 2, 3, 1, 2, 1 };
+
 static const struct {
     uint8_t simd_size:5;
     uint8_t to_memory:1;
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
-    [0x0f] = { .simd_size = simd_packed_int },
+    [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
+    [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
+    [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
+    [0x14 ... 0x17] = { .simd_size = simd_none, .to_memory = 1, .two_op = 1 },
+    [0x20] = { .simd_size = simd_none },
+    [0x21] = { .simd_size = simd_other },
+    [0x22] = { .simd_size = simd_none },
+    [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
+    [0x42] = { .simd_size = simd_packed_int },
+    [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0xf0] = {},
 };
 
@@ -2314,6 +2345,33 @@ x86_decode_0f38(
 }
 
 static int
+x86_decode_0f3a(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    if ( !vex.opcx )
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case X86EMUL_OPC_66(0, 0x20):     /* pinsrb */
+    case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+        state->desc = DstImplicit | SrcMem;
+        if ( modrm_mod != 3 )
+            state->desc |= ByteOp;
+        break;
+
+    case X86EMUL_OPC_66(0, 0x22):     /* pinsr{d,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+        state->desc = DstImplicit | SrcMem;
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -2801,8 +2859,7 @@ x86_decode(
             imm1 &= 0x7f;
         state->desc = d;
         state->simd_size = ext0f3a_table[b].simd_size;
-        if ( !vex.opcx )
-            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        rc = x86_decode_0f3a(state, ctxt, ops);
         break;
 
     case ext_8f08:
@@ -2866,6 +2923,10 @@ x86_decode(
         }
         break;
 
+    case simd_scalar_fp:
+        op_bytes = 4 << (ctxt->opcode & 1);
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -5935,6 +5996,18 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1c): /* vpabsb {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1d): /* vpabsw {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1e): /* vpabsd {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x28): /* vpmuldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x29): /* vpcmpeqq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2b): /* vpackusdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x38): /* vpminsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x39): /* vpminsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3a): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3b): /* vpminud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3c): /* vpmaxsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3d): /* vpmaxsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3e): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3f): /* vpmaxud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( !vex.l )
                 goto simd_0f_avx;
             host_and_vcpu_must_have(avx2);
@@ -5947,6 +6020,10 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx, &fic);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        goto simd_0f_avx;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -6030,11 +6107,20 @@ x86_emulate(
     simd_0f_int_imm8:
         if ( vex.opcx != vex_none )
         {
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0e): /* vpblendw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( vex.l )
                 host_and_vcpu_must_have(avx2);
             else
             {
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x09): /* vroundpd $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0a): /* vroundss $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0b): /* vroundsd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0c): /* vblendps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0d): /* vblendpd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x40): /* vdpps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
@@ -6931,7 +7017,10 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f38, 0x1e): /* pabsd xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
+        {
+    simd_0f38_common:
             get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
         else
         {
             host_and_vcpu_must_have(mmx);
@@ -6951,6 +7040,97 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+        op_bytes = 16 >> pmov_convert_delta[b & 7];
+        /* fall through */
+    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
+        if ( vex.opcx == vex_none )
+        {
+            host_and_vcpu_must_have(sse4_1);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            generate_exception_if(vex.reg != 0xf, EXC_UD);
+            host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+
+        opc = init_prefixes(stub);
+        if ( vex.opcx == vex_none )
+            opc[0] = 0x38;
+        opc[vex.opcx == vex_none] = b;
+        opc[1 + (vex.opcx == vex_none)] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 16 << vex.l, ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
+
+            /* Convert memory operand to (%rAX). */
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            opc[1 + (vex.opcx == vex_none)] &= 0x38;
+        }
+        fic.insn_bytes = PFX_BYTES + 2 + (vex.opcx == vex_none);
+        opc[2 + (vex.opcx == vex_none)] = 0xc3;
+
+        copy_REX_VEX(opc, rex_prefix, vex);
+        emulate_stub("+m" (*mmvalp), "a" (mmvalp));
+
+        put_stub(stub);
+        put_fpu(&fic);
+
+        state->simd_size = simd_none;
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
+        op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
+        goto simd_0f_int;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7136,7 +7316,10 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
+        {
+    simd_0f3a_common:
             get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
         else
         {
             host_and_vcpu_must_have(mmx);
@@ -7157,6 +7340,103 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 4;
         break;
 
+    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x15): /* pextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
+        host_and_vcpu_must_have(sse4_1);
+        get_fpu(X86EMUL_FPU_xmm, &fic);
+
+        opc = init_prefixes(stub);
+        opc[0] = 0x3a;
+    pextr:
+        opc[vex.opcx == vex_none] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1 + (vex.opcx == vex_none)] = modrm & 0x38;
+        opc[2 + (vex.opcx == vex_none)] = imm1;
+        fic.insn_bytes = PFX_BYTES + 3 + (vex.opcx == vex_none);
+        opc[3 + (vex.opcx == vex_none)] = 0xc3;
+
+        copy_REX_VEX(opc, rex_prefix, vex);
+        invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
+
+        put_stub(stub);
+        put_fpu(&fic);
+
+        dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
+        if ( b == 0x16 && (rex_prefix & REX_W) )
+            dst.bytes = 8;
+        break;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x14): /* vpextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x15): /* vpextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x16): /* vpextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+        opc = init_prefixes(stub);
+        goto pextr;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        get_fpu(X86EMUL_FPU_xmm, &fic);
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto simd_0f_int_imm8;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x21): /* insertps $imm8,xmm/m32,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        op_bytes = 4;
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+        op_bytes = 4;
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.l, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_int_imm8;
+
     case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
         vcpu_must_have(bmi2);
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -40,6 +40,7 @@
 #define cpu_has_mmx		boot_cpu_has(X86_FEATURE_MMX)
 #define cpu_has_sse3		boot_cpu_has(X86_FEATURE_SSE3)
 #define cpu_has_ssse3		boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_sse4_1		boot_cpu_has(X86_FEATURE_SSE4_1)
 #define cpu_has_sse4_2		boot_cpu_has(X86_FEATURE_SSE4_2)
 #define cpu_has_popcnt		boot_cpu_has(X86_FEATURE_POPCNT)
 #define cpu_has_htt		boot_cpu_has(X86_FEATURE_HTT)



[-- Attachment #2: x86emul-SSE41.patch --]
[-- Type: text/plain, Size: 18323 bytes --]

x86emul: support SSE4.1 insns

... and their AVX equivalents.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Or in ByteOp for {,v}pinsrb instead of assigning it (in
    x86_decode_0f3a()). Correct case label for ptest. Add missing
    copy_REX_VEX() to {,v}ptest handling. Add missing immediate bytes
    to {,v}pextr* etc handling. dppd requires vex.l clear. Use
    consistent approach for stub setup in code paths shared between
    VEX- and non-VEX-encoded insns.
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -219,6 +219,13 @@ enum simd_opsize {
      */
     simd_single_fp,
 
+    /*
+     * Scalar floating point:
+     * - 32 bits with low opcode bit clear (scalar single)
+     * - 64 bits with low opcode bit set (scalar double)
+     */
+    simd_scalar_fp,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -349,21 +356,45 @@ static const struct {
     uint8_t vsib:1;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x10] = { .simd_size = simd_packed_int },
+    [0x14 ... 0x15] = { .simd_size = simd_packed_fp },
+    [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+    [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2b] = { .simd_size = simd_packed_int },
+    [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+    [0x38 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x40] = { .simd_size = simd_packed_int },
+    [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = { .two_op = 1 },
     [0xf1] = { .to_memory = 1, .two_op = 1 },
     [0xf2 ... 0xf3] = {},
     [0xf5 ... 0xf7] = {},
 };
 
+/* Shift values between src and dst sizes of pmov{s,z}x{b,w,d}{w,d,q}. */
+static const uint8_t pmov_convert_delta[] = { 1, 2, 3, 1, 2, 1 };
+
 static const struct {
     uint8_t simd_size:5;
     uint8_t to_memory:1;
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext0f3a_table[256] = {
-    [0x0f] = { .simd_size = simd_packed_int },
+    [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
+    [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
+    [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
+    [0x14 ... 0x17] = { .simd_size = simd_none, .to_memory = 1, .two_op = 1 },
+    [0x20] = { .simd_size = simd_none },
+    [0x21] = { .simd_size = simd_other },
+    [0x22] = { .simd_size = simd_none },
+    [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
+    [0x42] = { .simd_size = simd_packed_int },
+    [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
+    [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0xf0] = {},
 };
 
@@ -2314,6 +2345,33 @@ x86_decode_0f38(
 }
 
 static int
+x86_decode_0f3a(
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt,
+    const struct x86_emulate_ops *ops)
+{
+    if ( !vex.opcx )
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+
+    switch ( ctxt->opcode & X86EMUL_OPC_MASK )
+    {
+    case X86EMUL_OPC_66(0, 0x20):     /* pinsrb */
+    case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+        state->desc = DstImplicit | SrcMem;
+        if ( modrm_mod != 3 )
+            state->desc |= ByteOp;
+        break;
+
+    case X86EMUL_OPC_66(0, 0x22):     /* pinsr{d,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+        state->desc = DstImplicit | SrcMem;
+        break;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+static int
 x86_decode(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt,
@@ -2801,8 +2859,7 @@ x86_decode(
             imm1 &= 0x7f;
         state->desc = d;
         state->simd_size = ext0f3a_table[b].simd_size;
-        if ( !vex.opcx )
-            ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        rc = x86_decode_0f3a(state, ctxt, ops);
         break;
 
     case ext_8f08:
@@ -2866,6 +2923,10 @@ x86_decode(
         }
         break;
 
+    case simd_scalar_fp:
+        op_bytes = 4 << (ctxt->opcode & 1);
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -5935,6 +5996,18 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1c): /* vpabsb {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1d): /* vpabsw {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1e): /* vpabsd {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x28): /* vpmuldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x29): /* vpcmpeqq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x2b): /* vpackusdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x38): /* vpminsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x39): /* vpminsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3a): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3b): /* vpminud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3c): /* vpmaxsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3d): /* vpmaxsd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3e): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x3f): /* vpmaxud {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x40): /* vpmulld {x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( !vex.l )
                 goto simd_0f_avx;
             host_and_vcpu_must_have(avx2);
@@ -5947,6 +6020,10 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx, &fic);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x41): /* vphminposuw xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        goto simd_0f_avx;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -6030,11 +6107,20 @@ x86_emulate(
     simd_0f_int_imm8:
         if ( vex.opcx != vex_none )
         {
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0e): /* vpblendw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
             if ( vex.l )
                 host_and_vcpu_must_have(avx2);
             else
             {
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x09): /* vroundpd $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0a): /* vroundss $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0b): /* vroundsd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0c): /* vblendps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x0d): /* vblendpd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x40): /* vdpps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
@@ -6931,7 +7017,10 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f38, 0x1e): /* pabsd xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
+        {
+    simd_0f38_common:
             get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
         else
         {
             host_and_vcpu_must_have(mmx);
@@ -6951,6 +7040,97 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x23): /* pmovsxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x24): /* pmovsxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x25): /* pmovsxdq xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x30): /* pmovzxbw xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x31): /* pmovzxbd xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x32): /* pmovzxbq xmm/m16,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x33): /* pmovzxwd xmm/m64,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x34): /* pmovzxwq xmm/m32,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x35): /* pmovzxdq xmm/m64,xmm */
+        op_bytes = 16 >> pmov_convert_delta[b & 7];
+        /* fall through */
+    case X86EMUL_OPC_66(0x0f38, 0x10): /* pblendvb XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x14): /* blendvps XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x15): /* blendvpd XMM0,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x28): /* pmuldq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x29): /* pcmpeqq xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x2b): /* packusdw xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x38): /* pminsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x39): /* pminsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3a): /* pminub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3b): /* pminud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3c): /* pmaxsb xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3d): /* pmaxsd xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3e): /* pmaxub xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x3f): /* pmaxud xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x40): /* pmulld xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f38, 0x41): /* phminposuw xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f38_common;
+
+    case X86EMUL_OPC_66(0x0f38, 0x17):     /* ptest xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x17): /* vptest {x,y}mm/mem,{x,y}mm */
+        if ( vex.opcx == vex_none )
+        {
+            host_and_vcpu_must_have(sse4_1);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            generate_exception_if(vex.reg != 0xf, EXC_UD);
+            host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+
+        opc = init_prefixes(stub);
+        if ( vex.opcx == vex_none )
+            opc[0] = 0x38;
+        opc[vex.opcx == vex_none] = b;
+        opc[1 + (vex.opcx == vex_none)] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 16 << vex.l, ctxt);
+            if ( rc != X86EMUL_OKAY )
+                goto done;
+
+            /* Convert memory operand to (%rAX). */
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            opc[1 + (vex.opcx == vex_none)] &= 0x38;
+        }
+        fic.insn_bytes = PFX_BYTES + 2 + (vex.opcx == vex_none);
+        opc[2 + (vex.opcx == vex_none)] = 0xc3;
+
+        copy_REX_VEX(opc, rex_prefix, vex);
+        emulate_stub("+m" (*mmvalp), "a" (mmvalp));
+
+        put_stub(stub);
+        put_fpu(&fic);
+
+        state->simd_size = simd_none;
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
+        op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
+        goto simd_0f_int;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7136,7 +7316,10 @@ x86_emulate(
     case X86EMUL_OPC_66(0x0f3a, 0x0f): /* palignr $imm8,xmm/m128,xmm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
+        {
+    simd_0f3a_common:
             get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
         else
         {
             host_and_vcpu_must_have(mmx);
@@ -7157,6 +7340,103 @@ x86_emulate(
         fic.insn_bytes = PFX_BYTES + 4;
         break;
 
+    case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0b): /* roundsd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0c): /* blendps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0d): /* blendpd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x0e): /* pblendw $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x40): /* dpps $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x41): /* dppd $imm8,xmm/m128,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x42): /* mpsadbw $imm8,xmm/m128,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x14): /* pextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x15): /* pextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x16): /* pextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_66(0x0f3a, 0x17): /* extractps $imm8,xmm,r/m */
+        host_and_vcpu_must_have(sse4_1);
+        get_fpu(X86EMUL_FPU_xmm, &fic);
+
+        opc = init_prefixes(stub);
+        opc[0] = 0x3a;
+    pextr:
+        opc[vex.opcx == vex_none] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        rex_prefix &= ~REX_B;
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1 + (vex.opcx == vex_none)] = modrm & 0x38;
+        opc[2 + (vex.opcx == vex_none)] = imm1;
+        fic.insn_bytes = PFX_BYTES + 3 + (vex.opcx == vex_none);
+        opc[3 + (vex.opcx == vex_none)] = 0xc3;
+
+        copy_REX_VEX(opc, rex_prefix, vex);
+        invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
+
+        put_stub(stub);
+        put_fpu(&fic);
+
+        dst.bytes = dst.type == OP_REG || b == 0x17 ? 4 : 1 << (b & 3);
+        if ( b == 0x16 && (rex_prefix & REX_W) )
+            dst.bytes = 8;
+        break;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x14): /* vpextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x15): /* vpextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x16): /* vpextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        host_and_vcpu_must_have(avx);
+        get_fpu(X86EMUL_FPU_ymm, &fic);
+        opc = init_prefixes(stub);
+        goto pextr;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
+    case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        get_fpu(X86EMUL_FPU_xmm, &fic);
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto simd_0f_int_imm8;
+
+    case X86EMUL_OPC_66(0x0f3a, 0x21): /* insertps $imm8,xmm/m32,xmm */
+        host_and_vcpu_must_have(sse4_1);
+        op_bytes = 4;
+        goto simd_0f3a_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+        op_bytes = 4;
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.l, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_imm8_avx;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_int_imm8;
+
     case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
         vcpu_must_have(bmi2);
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -40,6 +40,7 @@
 #define cpu_has_mmx		boot_cpu_has(X86_FEATURE_MMX)
 #define cpu_has_sse3		boot_cpu_has(X86_FEATURE_SSE3)
 #define cpu_has_ssse3		boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_sse4_1		boot_cpu_has(X86_FEATURE_SSE4_1)
 #define cpu_has_sse4_2		boot_cpu_has(X86_FEATURE_SSE4_2)
 #define cpu_has_popcnt		boot_cpu_has(X86_FEATURE_POPCNT)
 #define cpu_has_htt		boot_cpu_has(X86_FEATURE_HTT)

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

  parent reply	other threads:[~2017-02-28 12:56 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-02-28 12:42 [PATCH v4 00/17] x86emul: MMX/SSEn support Jan Beulich
2017-02-28 12:49 ` [PATCH v4 01/17] x86emul: support most memory accessing MMX/SSE{, 2, 3} insns Jan Beulich
2017-03-01 13:17   ` Andrew Cooper
2017-03-01 13:50     ` Jan Beulich
2017-03-01 18:08       ` Andrew Cooper
2017-02-28 12:50 ` [PATCH v4 02/17] x86emul: support MMX/SSE{,2,3} moves Jan Beulich
2017-03-01 13:59   ` [PATCH v4 02/17] x86emul: support MMX/SSE{, 2, 3} moves Andrew Cooper
2017-03-01 14:19     ` Jan Beulich
2017-03-01 19:56       ` Andrew Cooper
2017-03-02  8:07         ` Jan Beulich
2017-02-28 12:51 ` [PATCH v4 03/17] x86emul: support MMX/SSE/SSE2 converts Jan Beulich
2017-03-01 14:09   ` Andrew Cooper
2017-02-28 12:51 ` [PATCH v4 04/17] x86emul: support {,V}{,U}COMIS{S,D} Jan Beulich
2017-03-01 14:16   ` [PATCH v4 04/17] x86emul: support {, V}{, U}COMIS{S, D} Andrew Cooper
2017-03-01 14:26     ` Jan Beulich
2017-03-01 14:31       ` Andrew Cooper
2017-02-28 12:52 ` [PATCH v4 05/17] x86emul: support MMX/SSE{, 2, 4a} insns with only register operands Jan Beulich
2017-03-01 14:36   ` Andrew Cooper
2017-03-01 14:43     ` Jan Beulich
2017-03-01 20:01       ` Andrew Cooper
2017-02-28 12:52 ` [PATCH v4 06/17] x86emul: support {,V}{LD,ST}MXCSR Jan Beulich
2017-03-01 14:57   ` Andrew Cooper
2017-02-28 12:53 ` [PATCH v4 07/17] x86emul: support {,V}MOVNTDQA Jan Beulich
2017-03-01 14:58   ` Andrew Cooper
2017-02-28 12:53 ` [PATCH v4 08/17] x86emul: test coverage for SSE/SSE2 insns Jan Beulich
2017-02-28 12:54 ` [PATCH v4 09/17] x86emul: honor MMXEXT feature flag Jan Beulich
2017-02-28 12:54 ` [PATCH v4 10/17] x86emul: add tables for 0f38 and 0f3a extension space Jan Beulich
2017-03-01 15:49   ` Andrew Cooper
2017-03-01 16:11     ` Jan Beulich
2017-03-01 20:35       ` Andrew Cooper
2017-03-02  8:15         ` Jan Beulich
2017-02-28 12:55 ` [PATCH v4 11/17] x86emul: support SSSE3 insns Jan Beulich
2017-03-01 16:06   ` Andrew Cooper
2017-02-28 12:56 ` Jan Beulich [this message]
2017-03-01 16:58   ` [PATCH v4 12/17] x86emul: support SSE4.1 insns Andrew Cooper
2017-03-02  8:26     ` Jan Beulich
2017-02-28 12:56 ` [PATCH v4 13/17] x86emul: support SSE4.2 insns Jan Beulich
2017-03-01 17:21   ` Andrew Cooper
2017-02-28 12:57 ` [PATCH v4 14/17] x86emul: test coverage for SSE3/SSSE3/SSE4* insns Jan Beulich
2017-03-01 17:22   ` Andrew Cooper
2017-02-28 12:58 ` [PATCH v4 15/17] x86emul: support PCLMULQDQ Jan Beulich
2017-03-01 17:44   ` Andrew Cooper
2017-03-02  8:30     ` Jan Beulich
2017-02-28 12:58 ` [PATCH v4 16/17] x86emul: support AESNI insns Jan Beulich
2017-02-28 12:59 ` [PATCH v4 17/17] x86emul: support SHA insns Jan Beulich
2017-03-01 17:51   ` Andrew Cooper

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=58B5818D020000780013E2F1@prv-mh.provo.novell.com \
    --to=jbeulich@suse.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.