All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi
@ 2014-01-31 14:43 Richard Henderson
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX Richard Henderson
                   ` (5 more replies)
  0 siblings, 6 replies; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

There are three separate architecture extensions for logical operations:
BMI, BMI2, and TBM.  The first two are supported on Intel Haswell and
AMD Excavator, while slightly earlier AMD CPUs support only BMI and TBM.

The following adds support for the interesting BMI and BMI2 instructions,
where it is easy to do so.  Most of the rest of the new instructions are
irrelevant to TCG.

When I added support for the ANDC opcode, I noticed some optimization
regressions when looking at ppc64 guest dumps.  I will address these
in a separate patch set.


r~


Richard Henderson (5):
  disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
  tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
  tcg/i386: Add tcg_out_vex_modrm
  tcg/i386: Use ANDN instruction
  tcg/i386: Use SHLX/SHRX/SARX instructions

 disas/i386.c          | 146 +++++++++++++++++++++++++++++++++++++++++-----
 tcg/i386/tcg-target.c | 156 ++++++++++++++++++++++++++++++++++++++++++--------
 tcg/i386/tcg-target.h |   9 ++-
 3 files changed, 268 insertions(+), 43 deletions(-)

-- 
1.8.5.3

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
@ 2014-01-31 14:43 ` Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c Richard Henderson
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 132 insertions(+), 14 deletions(-)

diff --git a/disas/i386.c b/disas/i386.c
index 044e02c..104524f 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis
 static void print_displacement (char *, bfd_vma);
 static void OP_E (int, int);
 static void OP_G (int, int);
+static void OP_vvvv (int, int);
 static bfd_vma get64 (void);
 static bfd_signed_vma get32 (void);
 static bfd_signed_vma get32s (void);
@@ -264,6 +265,9 @@ static int rex_used;
    current instruction.  */
 static int used_prefixes;
 
+/* The VEX.vvvv register, unencoded.  */
+static int vex_reg;
+
 /* Flags stored in PREFIXES.  */
 #define PREFIX_REPZ 1
 #define PREFIX_REPNZ 2
@@ -278,6 +282,10 @@ static int used_prefixes;
 #define PREFIX_ADDR 0x400
 #define PREFIX_FWAIT 0x800
 
+#define PREFIX_VEX_0F    0x1000
+#define PREFIX_VEX_0F38  0x2000
+#define PREFIX_VEX_0F3A  0x4000
+
 /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive)
    to ADDR (exclusive) are valid.  Returns 1 for success, longjmps
    on error.  */
@@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 
 #define XX { NULL, 0 }
 
+#define Bv { OP_vvvv, v_mode }
 #define Eb { OP_E, b_mode }
 #define Ev { OP_E, v_mode }
 #define Ed { OP_E, d_mode }
@@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } }
 #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } }
 #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } }
-
+#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } }
+#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
 
 #define X86_64_0  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
 #define X86_64_1  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = {
     { "(bad)",	{ XX } },
   },
 
+  /* PREGRP105 */
+  {
+    { "andnS",	{ Gv, Bv, Ev } },
+    { "(bad)",	{ XX } },
+    { "(bad)",	{ XX } },
+    { "(bad)",	{ XX } },
+  },
+
+  /* PREGRP106 */
+  {
+    { "bextrS",	{ Gv, Ev, Bv } },
+    { "sarxS",	{ Gv, Ev, Bv } },
+    { "shlxS",	{ Gv, Ev, Bv } },
+    { "shrxS",	{ Gv, Ev, Bv } },
+  },
+
 };
 
 static const struct dis386 x86_64_table[][2] = {
@@ -3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = {
     /* f0 */
     { PREGRP87 },
     { PREGRP88 },
+    { PREGRP105 },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
-    { "(bad)", { XX } },
-    { "(bad)", { XX } },
+    { PREGRP106 },
     /* f8 */
     { "(bad)", { XX } },
     { "(bad)", { XX } },
@@ -3477,6 +3503,74 @@ ckprefix (void)
     }
 }
 
+static void
+ckvexprefix (void)
+{
+    int op, vex2, vex3, newrex = REX_OPCODE, newpfx = prefixes;
+
+    if (address_mode == mode_16bit) {
+        return;
+    }
+
+    fetch_data(the_info, codep + 1);
+    op = *codep;
+
+    if (op != 0xc4 && op != 0xc5) {
+        return;
+    }
+
+    fetch_data(the_info, codep + 2);
+    vex2 = codep[1];
+
+    if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) {
+        return;
+    }
+
+    if (op == 0xc4) {
+        /* Three byte VEX prefix.  */
+        fetch_data(the_info, codep + 3);
+        vex3 = codep[2];
+
+        newrex |= (vex2 & 0x80 ? 0 : REX_R);
+        newrex |= (vex2 & 0x40 ? 0 : REX_X);
+        newrex |= (vex2 & 0x20 ? 0 : REX_B);
+        newrex |= (vex3 & 0x80 ? REX_W : 0);
+        switch (vex2 & 0x1f) {      /* VEX.m-mmmm */
+        case 1:
+            newpfx |= PREFIX_VEX_0F;
+            break;
+        case 2:
+            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38;
+            break;
+        case 3:
+            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A;
+            break;
+        }
+        vex2 = vex3;
+        codep += 3;
+    } else {
+        /* Two byte VEX prefix.  */
+        newrex |= (vex2 & 0x80 ? 0 : REX_R);
+        codep += 2;
+    }
+
+    vex_reg = (~vex2 >> 3) & 15;     /* VEX.vvvv */
+    switch (vex2 & 3) {              /* VEX.pp */
+    case 1:
+        newpfx |= PREFIX_DATA;     /* 0x66 */
+        break;
+    case 2:
+        newpfx |= PREFIX_REPZ;     /* 0xf3 */
+        break;
+    case 3:
+        newpfx |= PREFIX_REPNZ;    /* 0xf2 */
+        break;
+    }
+
+    rex = newrex;
+    prefixes = newpfx;
+}
+
 /* Return the name of the prefix byte PREF, or NULL if PREF is not a
    prefix byte.  */
 
@@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
   const char *p;
   struct dis_private priv;
   unsigned char op;
+  unsigned char threebyte;
 
   if (info->mach == bfd_mach_x86_64_intel_syntax
       || info->mach == bfd_mach_x86_64)
@@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
 
   obufp = obuf;
   ckprefix ();
+  ckvexprefix ();
 
   insn_codep = codep;
   sizeflag = priv.orig_sizeflag;
@@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info)
     }
 
   op = 0;
+  if (prefixes & PREFIX_VEX_0F)
+    {
+      used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A;
+      if (prefixes & PREFIX_VEX_0F38)
+        threebyte = 0x38;
+      else if (prefixes & PREFIX_VEX_0F3A)
+        threebyte = 0x3a;
+      else
+        threebyte = *codep++;
+      goto vex_opcode;
+    }
   if (*codep == 0x0f)
     {
-      unsigned char threebyte;
       fetch_data(info, codep + 2);
-      threebyte = *++codep;
+      threebyte = codep[1];
+      codep += 2;
+    vex_opcode:
       dp = &dis386_twobyte[threebyte];
-      need_modrm = twobyte_has_modrm[*codep];
-      uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep];
-      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep];
-      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep];
-      uses_LOCK_prefix = (*codep & ~0x02) == 0x20;
-      codep++;
+      need_modrm = twobyte_has_modrm[threebyte];
+      uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte];
+      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte];
+      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte];
+      uses_LOCK_prefix = (threebyte & ~0x02) == 0x20;
       if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE)
 	{
           fetch_data(info, codep + 2);
@@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag)
     }
 }
 
+static void
+OP_vvvv (int bytemode, int sizeflags)
+{
+    USED_REX (REX_W);
+    if (rex & REX_W) {
+        oappend(names64[vex_reg]);
+    } else {
+        oappend(names32[vex_reg]);
+    }
+}
+
 static bfd_vma
 get64 (void)
 {
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX Richard Henderson
@ 2014-01-31 14:43 ` Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm Richard Henderson
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

These are not needed by users of tcg-target.h.  No need to recompile
when we adjust them.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c | 4 ++++
 tcg/i386/tcg-target.h | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 5d4cf93..7008b0e 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -88,6 +88,10 @@ static const int tcg_target_call_oarg_regs[] = {
 #endif
 };
 
+/* Constants we accept.  */
+#define TCG_CT_CONST_S32 0x100
+#define TCG_CT_CONST_U32 0x200
+
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
    i386. */
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 92c0fcd..747b797 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -64,9 +64,6 @@ typedef enum {
     TCG_REG_RDI = TCG_REG_EDI,
 } TCGReg;
 
-#define TCG_CT_CONST_S32 0x100
-#define TCG_CT_CONST_U32 0x200
-
 /* used for function call generation */
 #define TCG_REG_CALL_STACK TCG_REG_ESP 
 #define TCG_TARGET_STACK_ALIGN 16
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX Richard Henderson
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c Richard Henderson
@ 2014-01-31 14:43 ` Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction Richard Henderson
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Prepare for emitting BMI insns which require VEX encoding.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7008b0e..00dbc3b 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -402,9 +402,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 
     rex = 0;
     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
-    rex |= (r & 8) >> 1;		/* REX.R */
-    rex |= (x & 8) >> 2;		/* REX.X */
-    rex |= (rm & 8) >> 3;		/* REX.B */
+    rex |= (r & 8) >> 1;                /* REX.R */
+    rex |= (x & 8) >> 2;                /* REX.X */
+    rex |= (rm & 8) >> 3;               /* REX.B */
 
     /* P_REXB_{R,RM} indicates that the given register is the low byte.
        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
@@ -453,6 +453,41 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+    int tmp;
+
+    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+        /* Three byte VEX prefix.  */
+        tcg_out8(s, 0xc4);
+
+        /* VEX.m-mmmm */
+        if (opc & P_EXT38) {
+            tmp = 2;
+        } else if (opc & P_EXT) {
+            tmp = 1;
+        } else {
+            tcg_abort();
+        }
+        tmp |= 0x40;                       /* VEX.X */
+        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
+        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
+        tcg_out8(s, tmp);
+
+        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
+    } else {
+        /* Two byte VEX prefix.  */
+        tcg_out8(s, 0xc5);
+
+        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
+    }
+    tmp |= (opc & P_DATA16 ? 1 : 0);       /* VEX.pp */
+    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
+    tcg_out8(s, tmp);
+    tcg_out8(s, opc);
+    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
+
 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    We handle either RM and INDEX missing with a negative value.  In 64-bit
    mode for absolute addresses, ~RM is the size of the immediate operand
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
                   ` (2 preceding siblings ...)
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm Richard Henderson
@ 2014-01-31 14:43 ` Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
  2014-02-20 16:25   ` Peter Maydell
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions Richard Henderson
  2014-02-14 21:44 ` [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
  5 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
so we must handle constants in the implementation of andc.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c | 52 ++++++++++++++++++++++++++++++++++++++++-----------
 tcg/i386/tcg-target.h |  6 ++++--
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 00dbc3b..4f6b9c1 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -91,6 +91,7 @@ static const int tcg_target_call_oarg_regs[] = {
 /* Constants we accept.  */
 #define TCG_CT_CONST_S32 0x100
 #define TCG_CT_CONST_U32 0x200
+#define TCG_CT_CONST_I32 0x400
 
 /* Registers used with L constraint, which are the first argument 
    registers on x86_64, and two random call clobbered registers on
@@ -128,6 +129,10 @@ static bool have_movbe;
 # define have_movbe 0
 #endif
 
+/* We need this symbol in tcg-target.h, and we can't properly conditionalize
+   it there.  Therefore we always define the variable.  */
+bool have_bmi1;
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -224,6 +229,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     case 'Z':
         ct->ct |= TCG_CT_CONST_U32;
         break;
+    case 'I':
+        ct->ct |= TCG_CT_CONST_I32;
+        break;
 
     default:
         return -1;
@@ -247,6 +255,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
     if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
         return 1;
     }
+    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
+        return 1;
+    }
     return 0;
 }
 
@@ -276,6 +287,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
+#define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_CALL_Jz	(0xe8)
@@ -1813,6 +1825,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    OP_32_64(andc):
+        if (const_args[2]) {
+            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32,
+                        args[0], args[1]);
+            tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0);
+        } else {
+            tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]);
+        }
+        break;
+
     OP_32_64(mul):
         if (const_args[2]) {
             int32_t val;
@@ -2041,6 +2063,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_and_i32, { "r", "0", "ri" } },
     { INDEX_op_or_i32, { "r", "0", "ri" } },
     { INDEX_op_xor_i32, { "r", "0", "ri" } },
+    { INDEX_op_andc_i32, { "r", "r", "ri" } },
 
     { INDEX_op_shl_i32, { "r", "0", "ci" } },
     { INDEX_op_shr_i32, { "r", "0", "ci" } },
@@ -2098,6 +2121,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_and_i64, { "r", "0", "reZ" } },
     { INDEX_op_or_i64, { "r", "0", "re" } },
     { INDEX_op_xor_i64, { "r", "0", "re" } },
+    { INDEX_op_andc_i64, { "r", "r", "rI" } },
 
     { INDEX_op_shl_i64, { "r", "0", "ci" } },
     { INDEX_op_shr_i64, { "r", "0", "ci" } },
@@ -2235,25 +2259,31 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
 static void tcg_target_init(TCGContext *s)
 {
-#if !(defined(have_cmov) && defined(have_movbe))
-    {
-        unsigned a, b, c, d;
-        int ret = __get_cpuid(1, &a, &b, &c, &d);
+    unsigned a, b, c, d;
+    int max = __get_cpuid_max(0, 0);
 
-# ifndef have_cmov
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+#ifndef have_cmov
         /* For 32-bit, 99% certainty that we're running on hardware that
            supports cmov, but we still need to check.  In case cmov is not
            available, we'll use a small forward branch.  */
-        have_cmov = ret && (d & bit_CMOV);
-# endif
-
-# ifndef have_movbe
+        have_cmov = (d & bit_CMOV) != 0;
+#endif
+#ifndef have_movbe
         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
            need to probe for it.  */
-        have_movbe = ret && (c & bit_MOVBE);
-# endif
+        have_movbe = (c & bit_MOVBE) != 0;
+#endif
     }
+
+    if (max >= 7) {
+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
+        __cpuid_count(7, 0, a, b, c, d);
+#ifdef bit_BMI
+        have_bmi1 = (b & bit_BMI) != 0;
 #endif
+    }
 
     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 747b797..bdf2222 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -73,6 +73,8 @@ typedef enum {
 #define TCG_TARGET_CALL_STACK_OFFSET 0
 #endif
 
+extern bool have_bmi1;
+
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
 #define TCG_TARGET_HAS_rot_i32          1
@@ -84,7 +86,7 @@ typedef enum {
 #define TCG_TARGET_HAS_bswap32_i32      1
 #define TCG_TARGET_HAS_neg_i32          1
 #define TCG_TARGET_HAS_not_i32          1
-#define TCG_TARGET_HAS_andc_i32         0
+#define TCG_TARGET_HAS_andc_i32         have_bmi1
 #define TCG_TARGET_HAS_orc_i32          0
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
@@ -112,7 +114,7 @@ typedef enum {
 #define TCG_TARGET_HAS_bswap64_i64      1
 #define TCG_TARGET_HAS_neg_i64          1
 #define TCG_TARGET_HAS_not_i64          1
-#define TCG_TARGET_HAS_andc_i64         0
+#define TCG_TARGET_HAS_andc_i64         have_bmi1
 #define TCG_TARGET_HAS_orc_i64          0
 #define TCG_TARGET_HAS_eqv_i64          0
 #define TCG_TARGET_HAS_nand_i64         0
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
                   ` (3 preceding siblings ...)
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction Richard Henderson
@ 2014-01-31 14:43 ` Richard Henderson
  2014-02-16 14:21   ` Paolo Bonzini
  2014-02-16 18:12   ` Aurelien Jarno
  2014-02-14 21:44 ` [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
  5 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2014-01-31 14:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

These three-operand shift instructions do not require the shift count
to be placed into ECX.  This reduces the number of mov insns required,
with the mere addition of a new register constraint.

Don't attempt to get rid of the matching constraint, as that's impossible
to manipulate with just a new constraint.  In addition, constant shifts
still need the matching constraint.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c | 61 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4f6b9c1..fef1717 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -133,6 +133,12 @@ static bool have_movbe;
    it there.  Therefore we always define the variable.  */
 bool have_bmi1;
 
+#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
+static bool have_bmi2;
+#else
+# define have_bmi2 0
+#endif
+
 static uint8_t *tb_ret_addr;
 
 static void patch_reloc(uint8_t *code_ptr, int type,
@@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
         break;
     case 'c':
+    case_c:
         ct->ct |= TCG_CT_REG;
         tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
         break;
@@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         tcg_regset_set32(ct->u.regs, 0, 0xf);
         break;
     case 'r':
+    case_r:
         ct->ct |= TCG_CT_REG;
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_regset_set32(ct->u.regs, 0, 0xffff);
@@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
             tcg_regset_set32(ct->u.regs, 0, 0xff);
         }
         break;
+    case 'C':
+        /* With SHRX et al, we need not use ECX as shift count register.  */
+        if (have_bmi2) {
+            goto case_r;
+        } else {
+            goto case_c;
+        }
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_RM	0
 # define P_GS           0
 #endif
+#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
+#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
@@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
+#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
+#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
 #define OPC_XCHG_ax_r32	(0x90)
 
@@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 
         tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
     }
-    tmp |= (opc & P_DATA16 ? 1 : 0);       /* VEX.pp */
+    /* VEX.pp */
+    if (opc & P_DATA16) {
+        tmp |= 1;                          /* 0x66 */
+    } else if (opc & P_SIMDF3) {
+        tmp |= 2;                          /* 0xf3 */
+    } else if (opc & P_SIMDF2) {
+        tmp |= 3;                          /* 0xf2 */
+    }
     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
     tcg_out8(s, tmp);
     tcg_out8(s, opc);
@@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    int c, rexw = 0;
+    int c, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
@@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     OP_32_64(shl):
         c = SHIFT_SHL;
-        goto gen_shift;
+        vexop = OPC_SHLX;
+        goto gen_shift_maybe_vex;
     OP_32_64(shr):
         c = SHIFT_SHR;
-        goto gen_shift;
+        vexop = OPC_SHRX;
+        goto gen_shift_maybe_vex;
     OP_32_64(sar):
         c = SHIFT_SAR;
-        goto gen_shift;
+        vexop = OPC_SARX;
+        goto gen_shift_maybe_vex;
     OP_32_64(rotl):
         c = SHIFT_ROL;
         goto gen_shift;
     OP_32_64(rotr):
         c = SHIFT_ROR;
         goto gen_shift;
+    gen_shift_maybe_vex:
+        if (have_bmi2 && !const_args[2]) {
+            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
+            break;
+        }
+        /* FALLTHRU */
     gen_shift:
         if (const_args[2]) {
             tcg_out_shifti(s, c + rexw, args[0], args[2]);
@@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_xor_i32, { "r", "0", "ri" } },
     { INDEX_op_andc_i32, { "r", "r", "ri" } },
 
-    { INDEX_op_shl_i32, { "r", "0", "ci" } },
-    { INDEX_op_shr_i32, { "r", "0", "ci" } },
-    { INDEX_op_sar_i32, { "r", "0", "ci" } },
+    { INDEX_op_shl_i32, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i32, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
 
@@ -2123,9 +2159,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_xor_i64, { "r", "0", "re" } },
     { INDEX_op_andc_i64, { "r", "r", "rI" } },
 
-    { INDEX_op_shl_i64, { "r", "0", "ci" } },
-    { INDEX_op_shr_i64, { "r", "0", "ci" } },
-    { INDEX_op_sar_i64, { "r", "0", "ci" } },
+    { INDEX_op_shl_i64, { "r", "0", "Ci" } },
+    { INDEX_op_shr_i64, { "r", "0", "Ci" } },
+    { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
 
@@ -2283,6 +2319,9 @@ static void tcg_target_init(TCGContext *s)
 #ifdef bit_BMI
         have_bmi1 = (b & bit_BMI) != 0;
 #endif
+#ifndef have_bmi2
+        have_bmi2 = (b & bit_BMI2) != 0;
+#endif
     }
 
     if (TCG_TARGET_REG_BITS == 64) {
-- 
1.8.5.3

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi
  2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
                   ` (4 preceding siblings ...)
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions Richard Henderson
@ 2014-02-14 21:44 ` Richard Henderson
  2014-02-16 14:22   ` Paolo Bonzini
  5 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2014-02-14 21:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Ping.

On 01/31/2014 06:43 AM, Richard Henderson wrote:
> There are three separate architecture extensions for logical operations,
> BMI, BMI2, and TBM.  The first two are supported on Intel Haswell and
> AMD Excavator, while slightly earlier AMD support only BMI and TBM.
> 
> The following adds support for the interesting BMI and BMI2 instructions,
> where it is easy to do so.  Most of the rest of the new instructions are
> irrelevant to TCG.
> 
> When I added support for the ANDC opcode, I noticed some optimization
> regressions when looking at ppc64 guest dumps.  I will address these
> in a separate patch set.
> 
> 
> r~
> 
> 
> Richard Henderson (5):
>   disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
>   tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
>   tcg/i386: Add tcg_out_vex_modrm
>   tcg/i386: Use ANDN instruction
>   tcg/i386: Use SHLX/SHRX/SARX instructions
> 
>  disas/i386.c          | 146 +++++++++++++++++++++++++++++++++++++++++-----
>  tcg/i386/tcg-target.c | 156 ++++++++++++++++++++++++++++++++++++++++++--------
>  tcg/i386/tcg-target.h |   9 ++-
>  3 files changed, 268 insertions(+), 43 deletions(-)
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions Richard Henderson
@ 2014-02-16 14:21   ` Paolo Bonzini
  2014-02-16 17:57     ` Richard Henderson
  2014-02-17 16:01     ` Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
  1 sibling, 2 replies; 21+ messages in thread
From: Paolo Bonzini @ 2014-02-16 14:21 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: aurelien

Il 31/01/2014 15:43, Richard Henderson ha scritto:
> +    gen_shift_maybe_vex:
> +        if (have_bmi2 && !const_args[2]) {
> +            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
> +            break;
> +        }
> +        /* FALLTHRU */

What if args[2] happens to be ECX?

Apart from this,

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>

so feel free to post PATCH 6/5 and then squash it in the pull request.

Paolo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi
  2014-02-14 21:44 ` [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
@ 2014-02-16 14:22   ` Paolo Bonzini
  0 siblings, 0 replies; 21+ messages in thread
From: Paolo Bonzini @ 2014-02-16 14:22 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: aurelien

Il 14/02/2014 22:44, Richard Henderson ha scritto:
> Ping.
>
> On 01/31/2014 06:43 AM, Richard Henderson wrote:
>> There are three separate architecture extensions for logical operations,
>> BMI, BMI2, and TBM.  The first two are supported on Intel Haswell and
>> AMD Excavator, while slightly earlier AMD support only BMI and TBM.
>>
>> The following adds support for the interesting BMI and BMI2 instructions,
>> where it is easy to do so.  Most of the rest of the new instructions are
>> irrelevant to TCG.
>>
>> When I added support for the ANDC opcode, I noticed some optimization
>> regressions when looking at ppc64 guest dumps.  I will address these
>> in a separate patch set.

Just a small comment on patch 5, everything else looks fine.

Paolo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
  2014-02-16 14:21   ` Paolo Bonzini
@ 2014-02-16 17:57     ` Richard Henderson
  2014-02-17 16:01     ` Richard Henderson
  1 sibling, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2014-02-16 17:57 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel; +Cc: aurelien

On 02/16/2014 06:21 AM, Paolo Bonzini wrote:
> Il 31/01/2014 15:43, Richard Henderson ha scritto:
>> +    gen_shift_maybe_vex:
>> +        if (have_bmi2 && !const_args[2]) {
>> +            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
>> +            break;
>> +        }
>> +        /* FALLTHRU */
> 
> What if args[2] happens to be ECX?

shlx handles that just fine.  I don't think it's worth an extra check to
fall back to shl on the off-chance that ecx is used; it's pretty far down
on the register allocation order list, so it wouldn't happen often.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction Richard Henderson
@ 2014-02-16 18:12   ` Aurelien Jarno
  2014-02-17 16:18     ` Richard Henderson
  2014-02-20 16:25   ` Peter Maydell
  1 sibling, 1 reply; 21+ messages in thread
From: Aurelien Jarno @ 2014-02-16 18:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Jan 31, 2014 at 08:43:37AM -0600, Richard Henderson wrote:
> Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
> so we must handle constants in the implementation of andc.

I do wonder if it wouldn't actually be a better idea to add this
simplification to the optimizer instead of adding it to the backend.

The best way to do that would be to check with tcg_target_const_match to
see if ANDC would accept such a constraint and to convert it to AND
if not.

The same can probably be done for ORC.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c | 52 ++++++++++++++++++++++++++++++++++++++++-----------
>  tcg/i386/tcg-target.h |  6 ++++--
>  2 files changed, 45 insertions(+), 13 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 00dbc3b..4f6b9c1 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -91,6 +91,7 @@ static const int tcg_target_call_oarg_regs[] = {
>  /* Constants we accept.  */
>  #define TCG_CT_CONST_S32 0x100
>  #define TCG_CT_CONST_U32 0x200
> +#define TCG_CT_CONST_I32 0x400
>  
>  /* Registers used with L constraint, which are the first argument 
>     registers on x86_64, and two random call clobbered registers on
> @@ -128,6 +129,10 @@ static bool have_movbe;
>  # define have_movbe 0
>  #endif
>  
> +/* We need this symbol in tcg-target.h, and we can't properly conditionalize
> +   it there.  Therefore we always define the variable.  */
> +bool have_bmi1;
> +
>  static uint8_t *tb_ret_addr;
>  
>  static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -224,6 +229,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>      case 'Z':
>          ct->ct |= TCG_CT_CONST_U32;
>          break;
> +    case 'I':
> +        ct->ct |= TCG_CT_CONST_I32;
> +        break;
>  
>      default:
>          return -1;
> @@ -247,6 +255,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
>      if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
>          return 1;
>      }
> +    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
> +        return 1;
> +    }
>      return 0;
>  }
>  
> @@ -276,6 +287,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define OPC_ARITH_EvIz	(0x81)
>  #define OPC_ARITH_EvIb	(0x83)
>  #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
> +#define OPC_ANDN        (0xf2 | P_EXT38)
>  #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
>  #define OPC_BSWAP	(0xc8 | P_EXT)
>  #define OPC_CALL_Jz	(0xe8)
> @@ -1813,6 +1825,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>  
> +    OP_32_64(andc):
> +        if (const_args[2]) {
> +            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32,
> +                        args[0], args[1]);
> +            tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0);
> +        } else {
> +            tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]);
> +        }
> +        break;
> +
>      OP_32_64(mul):
>          if (const_args[2]) {
>              int32_t val;
> @@ -2041,6 +2063,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_and_i32, { "r", "0", "ri" } },
>      { INDEX_op_or_i32, { "r", "0", "ri" } },
>      { INDEX_op_xor_i32, { "r", "0", "ri" } },
> +    { INDEX_op_andc_i32, { "r", "r", "ri" } },
>  
>      { INDEX_op_shl_i32, { "r", "0", "ci" } },
>      { INDEX_op_shr_i32, { "r", "0", "ci" } },
> @@ -2098,6 +2121,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_and_i64, { "r", "0", "reZ" } },
>      { INDEX_op_or_i64, { "r", "0", "re" } },
>      { INDEX_op_xor_i64, { "r", "0", "re" } },
> +    { INDEX_op_andc_i64, { "r", "r", "rI" } },
>  
>      { INDEX_op_shl_i64, { "r", "0", "ci" } },
>      { INDEX_op_shr_i64, { "r", "0", "ci" } },
> @@ -2235,25 +2259,31 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>  
>  static void tcg_target_init(TCGContext *s)
>  {
> -#if !(defined(have_cmov) && defined(have_movbe))
> -    {
> -        unsigned a, b, c, d;
> -        int ret = __get_cpuid(1, &a, &b, &c, &d);
> +    unsigned a, b, c, d;
> +    int max = __get_cpuid_max(0, 0);
>  
> -# ifndef have_cmov
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +#ifndef have_cmov
>          /* For 32-bit, 99% certainty that we're running on hardware that
>             supports cmov, but we still need to check.  In case cmov is not
>             available, we'll use a small forward branch.  */
> -        have_cmov = ret && (d & bit_CMOV);
> -# endif
> -
> -# ifndef have_movbe
> +        have_cmov = (d & bit_CMOV) != 0;
> +#endif
> +#ifndef have_movbe
>          /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
>             need to probe for it.  */
> -        have_movbe = ret && (c & bit_MOVBE);
> -# endif
> +        have_movbe = (c & bit_MOVBE) != 0;
> +#endif
>      }
> +
> +    if (max >= 7) {
> +        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
> +        __cpuid_count(7, 0, a, b, c, d);
> +#ifdef bit_BMI
> +        have_bmi1 = (b & bit_BMI) != 0;
>  #endif
> +    }
>  
>      if (TCG_TARGET_REG_BITS == 64) {
>          tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 747b797..bdf2222 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -73,6 +73,8 @@ typedef enum {
>  #define TCG_TARGET_CALL_STACK_OFFSET 0
>  #endif
>  
> +extern bool have_bmi1;
> +
>  /* optional instructions */
>  #define TCG_TARGET_HAS_div2_i32         1
>  #define TCG_TARGET_HAS_rot_i32          1
> @@ -84,7 +86,7 @@ typedef enum {
>  #define TCG_TARGET_HAS_bswap32_i32      1
>  #define TCG_TARGET_HAS_neg_i32          1
>  #define TCG_TARGET_HAS_not_i32          1
> -#define TCG_TARGET_HAS_andc_i32         0
> +#define TCG_TARGET_HAS_andc_i32         have_bmi1
>  #define TCG_TARGET_HAS_orc_i32          0
>  #define TCG_TARGET_HAS_eqv_i32          0
>  #define TCG_TARGET_HAS_nand_i32         0
> @@ -112,7 +114,7 @@ typedef enum {
>  #define TCG_TARGET_HAS_bswap64_i64      1
>  #define TCG_TARGET_HAS_neg_i64          1
>  #define TCG_TARGET_HAS_not_i64          1
> -#define TCG_TARGET_HAS_andc_i64         0
> +#define TCG_TARGET_HAS_andc_i64         have_bmi1
>  #define TCG_TARGET_HAS_orc_i64          0
>  #define TCG_TARGET_HAS_eqv_i64          0
>  #define TCG_TARGET_HAS_nand_i64         0

Otherwise the patch looks good to me.


-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions Richard Henderson
  2014-02-16 14:21   ` Paolo Bonzini
@ 2014-02-16 18:12   ` Aurelien Jarno
  1 sibling, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2014-02-16 18:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Jan 31, 2014 at 08:43:38AM -0600, Richard Henderson wrote:
> These three-operand shift instructions do not require the shift count
> to be placed into ECX.  This reduces the number of mov insns required,
> with the mere addition of a new register constraint.
> 
> Don't attempt to get rid of the matching constraint, as that's impossible
> to manipulate with just a new constraint.  In addition, constant shifts
> still need the matching constraint.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c | 61 +++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 50 insertions(+), 11 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 4f6b9c1..fef1717 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -133,6 +133,12 @@ static bool have_movbe;
>     it there.  Therefore we always define the variable.  */
>  bool have_bmi1;
>  
> +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
> +static bool have_bmi2;
> +#else
> +# define have_bmi2 0
> +#endif
> +
>  static uint8_t *tb_ret_addr;
>  
>  static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
>          break;
>      case 'c':
> +    case_c:
>          ct->ct |= TCG_CT_REG;
>          tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
>          break;
> @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>          tcg_regset_set32(ct->u.regs, 0, 0xf);
>          break;
>      case 'r':
> +    case_r:
>          ct->ct |= TCG_CT_REG;
>          if (TCG_TARGET_REG_BITS == 64) {
>              tcg_regset_set32(ct->u.regs, 0, 0xffff);
> @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
>              tcg_regset_set32(ct->u.regs, 0, 0xff);
>          }
>          break;
> +    case 'C':
> +        /* With SHRX et al, we need not use ECX as shift count register.  */
> +        if (have_bmi2) {
> +            goto case_r;
> +        } else {
> +            goto case_c;
> +        }
>  
>          /* qemu_ld/st address constraint */
>      case 'L':
> @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  # define P_REXB_RM	0
>  # define P_GS           0
>  #endif
> +#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
> +#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
>  
>  #define OPC_ARITH_EvIz	(0x81)
>  #define OPC_ARITH_EvIb	(0x83)
> @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  #define OPC_SHIFT_1	(0xd1)
>  #define OPC_SHIFT_Ib	(0xc1)
>  #define OPC_SHIFT_cl	(0xd3)
> +#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
> +#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
> +#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
>  #define OPC_TESTL	(0x85)
>  #define OPC_XCHG_ax_r32	(0x90)
>  
> @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
>  
>          tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
>      }
> -    tmp |= (opc & P_DATA16 ? 1 : 0);       /* VEX.pp */
> +    /* VEX.pp */
> +    if (opc & P_DATA16) {
> +        tmp |= 1;                          /* 0x66 */
> +    } else if (opc & P_SIMDF3) {
> +        tmp |= 2;                          /* 0xf3 */
> +    } else if (opc & P_SIMDF2) {
> +        tmp |= 3;                          /* 0xf2 */
> +    }
>      tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
>      tcg_out8(s, tmp);
>      tcg_out8(s, opc);
> @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>                                const TCGArg *args, const int *const_args)
>  {
> -    int c, rexw = 0;
> +    int c, vexop, rexw = 0;
>  
>  #if TCG_TARGET_REG_BITS == 64
>  # define OP_32_64(x) \
> @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>  
>      OP_32_64(shl):
>          c = SHIFT_SHL;
> -        goto gen_shift;
> +        vexop = OPC_SHLX;
> +        goto gen_shift_maybe_vex;
>      OP_32_64(shr):
>          c = SHIFT_SHR;
> -        goto gen_shift;
> +        vexop = OPC_SHRX;
> +        goto gen_shift_maybe_vex;
>      OP_32_64(sar):
>          c = SHIFT_SAR;
> -        goto gen_shift;
> +        vexop = OPC_SARX;
> +        goto gen_shift_maybe_vex;
>      OP_32_64(rotl):
>          c = SHIFT_ROL;
>          goto gen_shift;
>      OP_32_64(rotr):
>          c = SHIFT_ROR;
>          goto gen_shift;
> +    gen_shift_maybe_vex:
> +        if (have_bmi2 && !const_args[2]) {
> +            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
> +            break;
> +        }
> +        /* FALLTHRU */
>      gen_shift:
>          if (const_args[2]) {
>              tcg_out_shifti(s, c + rexw, args[0], args[2]);
> @@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_xor_i32, { "r", "0", "ri" } },
>      { INDEX_op_andc_i32, { "r", "r", "ri" } },
>  
> -    { INDEX_op_shl_i32, { "r", "0", "ci" } },
> -    { INDEX_op_shr_i32, { "r", "0", "ci" } },
> -    { INDEX_op_sar_i32, { "r", "0", "ci" } },
> +    { INDEX_op_shl_i32, { "r", "0", "Ci" } },
> +    { INDEX_op_shr_i32, { "r", "0", "Ci" } },
> +    { INDEX_op_sar_i32, { "r", "0", "Ci" } },
>      { INDEX_op_rotl_i32, { "r", "0", "ci" } },
>      { INDEX_op_rotr_i32, { "r", "0", "ci" } },
>  
> @@ -2123,9 +2159,9 @@ static const TCGTargetOpDef x86_op_defs[] = {
>      { INDEX_op_xor_i64, { "r", "0", "re" } },
>      { INDEX_op_andc_i64, { "r", "r", "rI" } },
>  
> -    { INDEX_op_shl_i64, { "r", "0", "ci" } },
> -    { INDEX_op_shr_i64, { "r", "0", "ci" } },
> -    { INDEX_op_sar_i64, { "r", "0", "ci" } },
> +    { INDEX_op_shl_i64, { "r", "0", "Ci" } },
> +    { INDEX_op_shr_i64, { "r", "0", "Ci" } },
> +    { INDEX_op_sar_i64, { "r", "0", "Ci" } },
>      { INDEX_op_rotl_i64, { "r", "0", "ci" } },
>      { INDEX_op_rotr_i64, { "r", "0", "ci" } },
>  
> @@ -2283,6 +2319,9 @@ static void tcg_target_init(TCGContext *s)
>  #ifdef bit_BMI
>          have_bmi1 = (b & bit_BMI) != 0;
>  #endif
> +#ifndef have_bmi2
> +        have_bmi2 = (b & bit_BMI2) != 0;
> +#endif
>      }
>  
>      if (TCG_TARGET_REG_BITS == 64) {

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>


-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm Richard Henderson
@ 2014-02-16 18:12   ` Aurelien Jarno
  0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2014-02-16 18:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Jan 31, 2014 at 08:43:36AM -0600, Richard Henderson wrote:
> Prepare for emitting BMI insns which require VEX encoding.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c | 41 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 38 insertions(+), 3 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7008b0e..00dbc3b 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -402,9 +402,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
>  
>      rex = 0;
>      rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
> -    rex |= (r & 8) >> 1;		/* REX.R */
> -    rex |= (x & 8) >> 2;		/* REX.X */
> -    rex |= (rm & 8) >> 3;		/* REX.B */
> +    rex |= (r & 8) >> 1;                /* REX.R */
> +    rex |= (x & 8) >> 2;                /* REX.X */
> +    rex |= (rm & 8) >> 3;               /* REX.B */
>  
>      /* P_REXB_{R,RM} indicates that the given register is the low byte.
>         For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
> @@ -453,6 +453,41 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
>      tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
>  }
>  
> +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
> +{
> +    int tmp;
> +
> +    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
> +        /* Three byte VEX prefix.  */
> +        tcg_out8(s, 0xc4);
> +
> +        /* VEX.m-mmmm */
> +        if (opc & P_EXT38) {
> +            tmp = 2;
> +        } else if (opc & P_EXT) {
> +            tmp = 1;
> +        } else {
> +            tcg_abort();
> +        }
> +        tmp |= 0x40;                       /* VEX.X */
> +        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
> +        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
> +        tcg_out8(s, tmp);
> +
> +        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
> +    } else {
> +        /* Two byte VEX prefix.  */
> +        tcg_out8(s, 0xc5);
> +
> +        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
> +    }
> +    tmp |= (opc & P_DATA16 ? 1 : 0);       /* VEX.pp */
> +    tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
> +    tcg_out8(s, tmp);
> +    tcg_out8(s, opc);
> +    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
> +}
> +
>  /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
>     We handle either RM and INDEX missing with a negative value.  In 64-bit
>     mode for absolute addresses, ~RM is the size of the immediate operand

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>


-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c Richard Henderson
@ 2014-02-16 18:12   ` Aurelien Jarno
  0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2014-02-16 18:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Jan 31, 2014 at 08:43:35AM -0600, Richard Henderson wrote:
> These are not needed by users of tcg-target.h.  No need to recompile
> when we adjust them.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c | 4 ++++
>  tcg/i386/tcg-target.h | 3 ---
>  2 files changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 5d4cf93..7008b0e 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -88,6 +88,10 @@ static const int tcg_target_call_oarg_regs[] = {
>  #endif
>  };
>  
> +/* Constants we accept.  */
> +#define TCG_CT_CONST_S32 0x100
> +#define TCG_CT_CONST_U32 0x200
> +
>  /* Registers used with L constraint, which are the first argument 
>     registers on x86_64, and two random call clobbered registers on
>     i386. */
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 92c0fcd..747b797 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -64,9 +64,6 @@ typedef enum {
>      TCG_REG_RDI = TCG_REG_EDI,
>  } TCGReg;
>  
> -#define TCG_CT_CONST_S32 0x100
> -#define TCG_CT_CONST_U32 0x200
> -
>  /* used for function call generation */
>  #define TCG_REG_CALL_STACK TCG_REG_ESP 
>  #define TCG_TARGET_STACK_ALIGN 16

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>

-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX Richard Henderson
@ 2014-02-16 18:12   ` Aurelien Jarno
  0 siblings, 0 replies; 21+ messages in thread
From: Aurelien Jarno @ 2014-02-16 18:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Fri, Jan 31, 2014 at 08:43:34AM -0600, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  disas/i386.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 132 insertions(+), 14 deletions(-)
> 
> diff --git a/disas/i386.c b/disas/i386.c
> index 044e02c..104524f 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis
>  static void print_displacement (char *, bfd_vma);
>  static void OP_E (int, int);
>  static void OP_G (int, int);
> +static void OP_vvvv (int, int);
>  static bfd_vma get64 (void);
>  static bfd_signed_vma get32 (void);
>  static bfd_signed_vma get32s (void);
> @@ -264,6 +265,9 @@ static int rex_used;
>     current instruction.  */
>  static int used_prefixes;
>  
> +/* The VEX.vvvv register, unencoded.  */
> +static int vex_reg;
> +
>  /* Flags stored in PREFIXES.  */
>  #define PREFIX_REPZ 1
>  #define PREFIX_REPNZ 2
> @@ -278,6 +282,10 @@ static int used_prefixes;
>  #define PREFIX_ADDR 0x400
>  #define PREFIX_FWAIT 0x800
>  
> +#define PREFIX_VEX_0F    0x1000
> +#define PREFIX_VEX_0F38  0x2000
> +#define PREFIX_VEX_0F3A  0x4000
> +
>  /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive)
>     to ADDR (exclusive) are valid.  Returns 1 for success, longjmps
>     on error.  */
> @@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
>  
>  #define XX { NULL, 0 }
>  
> +#define Bv { OP_vvvv, v_mode }
>  #define Eb { OP_E, b_mode }
>  #define Ev { OP_E, v_mode }
>  #define Ed { OP_E, d_mode }
> @@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
>  #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } }
>  #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } }
>  #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } }
> -
> +#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } }
> +#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
>  
>  #define X86_64_0  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
>  #define X86_64_1  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
> @@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = {
>    /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
>    /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */
>    /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
> -  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
> +  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
>    /*       -------------------------------        */
>    /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
>  };
> @@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = {
>    /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
>    /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
>    /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
> -  /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
> +  /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
>    /*       -------------------------------        */
>    /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
>  };
> @@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = {
>    /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
>    /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
>    /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
> -  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
> +  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
>    /*       -------------------------------        */
>    /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
>  };
> @@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = {
>      { "(bad)",	{ XX } },
>    },
>  
> +  /* PREGRP105 */
> +  {
> +    { "andnS",	{ Gv, Bv, Ev } },
> +    { "(bad)",	{ XX } },
> +    { "(bad)",	{ XX } },
> +    { "(bad)",	{ XX } },
> +  },
> +
> +  /* PREGRP106 */
> +  {
> +    { "bextrS",	{ Gv, Ev, Bv } },
> +    { "sarxS",	{ Gv, Ev, Bv } },
> +    { "shlxS",	{ Gv, Ev, Bv } },
> +    { "shrxS",	{ Gv, Ev, Bv } },
> +  },
> +
>  };
>  
>  static const struct dis386 x86_64_table[][2] = {
> @@ -3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = {
>      /* f0 */
>      { PREGRP87 },
>      { PREGRP88 },
> +    { PREGRP105 },
>      { "(bad)", { XX } },
>      { "(bad)", { XX } },
>      { "(bad)", { XX } },
>      { "(bad)", { XX } },
> -    { "(bad)", { XX } },
> -    { "(bad)", { XX } },
> +    { PREGRP106 },
>      /* f8 */
>      { "(bad)", { XX } },
>      { "(bad)", { XX } },
> @@ -3477,6 +3503,74 @@ ckprefix (void)
>      }
>  }
>  
> +static void
> +ckvexprefix (void)
> +{
> +    int op, vex2, vex3, newrex = REX_OPCODE, newpfx = prefixes;
> +
> +    if (address_mode == mode_16bit) {
> +        return;
> +    }
> +
> +    fetch_data(the_info, codep + 1);
> +    op = *codep;
> +
> +    if (op != 0xc4 && op != 0xc5) {
> +        return;
> +    }
> +
> +    fetch_data(the_info, codep + 2);
> +    vex2 = codep[1];
> +
> +    if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) {
> +        return;
> +    }
> +
> +    if (op == 0xc4) {
> +        /* Three byte VEX prefix.  */
> +        fetch_data(the_info, codep + 3);
> +        vex3 = codep[2];
> +
> +        newrex |= (vex2 & 0x80 ? 0 : REX_R);
> +        newrex |= (vex2 & 0x40 ? 0 : REX_X);
> +        newrex |= (vex2 & 0x20 ? 0 : REX_B);
> +        newrex |= (vex3 & 0x80 ? REX_W : 0);
> +        switch (vex2 & 0x1f) {      /* VEX.m-mmmm */
> +        case 1:
> +            newpfx |= PREFIX_VEX_0F;
> +            break;
> +        case 2:
> +            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38;
> +            break;
> +        case 3:
> +            newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A;
> +            break;
> +        }
> +        vex2 = vex3;
> +        codep += 3;
> +    } else {
> +        /* Two byte VEX prefix.  */
> +        newrex |= (vex2 & 0x80 ? 0 : REX_R);
> +        codep += 2;
> +    }
> +
> +    vex_reg = (~vex2 >> 3) & 15;     /* VEX.vvvv */
> +    switch (vex2 & 3) {              /* VEX.pp */
> +    case 1:
> +        newpfx |= PREFIX_DATA;     /* 0x66 */
> +        break;
> +    case 2:
> +        newpfx |= PREFIX_REPZ;     /* 0xf3 */
> +        break;
> +    case 3:
> +        newpfx |= PREFIX_REPNZ;    /* 0xf2 */
> +        break;
> +    }
> +
> +    rex = newrex;
> +    prefixes = newpfx;
> +}
> +
>  /* Return the name of the prefix byte PREF, or NULL if PREF is not a
>     prefix byte.  */
>  
> @@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
>    const char *p;
>    struct dis_private priv;
>    unsigned char op;
> +  unsigned char threebyte;
>  
>    if (info->mach == bfd_mach_x86_64_intel_syntax
>        || info->mach == bfd_mach_x86_64)
> @@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info)
>  
>    obufp = obuf;
>    ckprefix ();
> +  ckvexprefix ();
>  
>    insn_codep = codep;
>    sizeflag = priv.orig_sizeflag;
> @@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info)
>      }
>  
>    op = 0;
> +  if (prefixes & PREFIX_VEX_0F)
> +    {
> +      used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A;
> +      if (prefixes & PREFIX_VEX_0F38)
> +        threebyte = 0x38;
> +      else if (prefixes & PREFIX_VEX_0F3A)
> +        threebyte = 0x3a;
> +      else
> +        threebyte = *codep++;
> +      goto vex_opcode;
> +    }
>    if (*codep == 0x0f)
>      {
> -      unsigned char threebyte;
>        fetch_data(info, codep + 2);
> -      threebyte = *++codep;
> +      threebyte = codep[1];
> +      codep += 2;
> +    vex_opcode:
>        dp = &dis386_twobyte[threebyte];
> -      need_modrm = twobyte_has_modrm[*codep];
> -      uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep];
> -      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep];
> -      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep];
> -      uses_LOCK_prefix = (*codep & ~0x02) == 0x20;
> -      codep++;
> +      need_modrm = twobyte_has_modrm[threebyte];
> +      uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte];
> +      uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte];
> +      uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte];
> +      uses_LOCK_prefix = (threebyte & ~0x02) == 0x20;
>        if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE)
>  	{
>            fetch_data(info, codep + 2);
> @@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag)
>      }
>  }
>  
> +static void
> +OP_vvvv (int bytemode, int sizeflags)
> +{
> +    USED_REX (REX_W);
> +    if (rex & REX_W) {
> +        oappend(names64[vex_reg]);
> +    } else {
> +        oappend(names32[vex_reg]);
> +    }
> +}
> +
>  static bfd_vma
>  get64 (void)
>  {

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>


-- 
Aurelien Jarno	                        GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions
  2014-02-16 14:21   ` Paolo Bonzini
  2014-02-16 17:57     ` Richard Henderson
@ 2014-02-17 16:01     ` Richard Henderson
  1 sibling, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2014-02-17 16:01 UTC (permalink / raw)
  To: Paolo Bonzini, qemu-devel; +Cc: aurelien

On 02/16/2014 08:21 AM, Paolo Bonzini wrote:
> Il 31/01/2014 15:43, Richard Henderson ha scritto:
>> +    gen_shift_maybe_vex:
>> +        if (have_bmi2 && !const_args[2]) {
>> +            tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]);
>> +            break;
>> +        }
>> +        /* FALLTHRU */
> 
> What if args[2] happens to be ECX?

I ran some measurements and as I expected this basically never happens.  For
64-bit, I never saw it occur.  For 32-bit, 1/800 of all shifts used ecx.

For 64-bit, the use of shlx et al is always a size win.  The mov and shift,
including their rex prefixes, are 3 bytes each, while the shlx is 5 bytes.

For 32-bit, things are more complicated.  The mov and shift are 2 bytes each,
so the use of shlx is by itself a 1 byte size penalty.  Except that sometimes
the avoidance of the mov results in fewer spills, and thus fewer bytes overall.
So overall I see the barest fraction (< 0.01%) size decrease across all TBs.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-02-16 18:12   ` Aurelien Jarno
@ 2014-02-17 16:18     ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2014-02-17 16:18 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: qemu-devel

On 02/16/2014 12:12 PM, Aurelien Jarno wrote:
> On Fri, Jan 31, 2014 at 08:43:37AM -0600, Richard Henderson wrote:
>> > Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
>> > so we must handle constants in the implementation of andc.
> I do wonder if it actually won't be a better idea to add this
> simplification to the optimizer instead of adding it to the backend.
> 
> The best to do that would be to check with tcg_target_const_match to
> see if ANDC would accept such a constraint and to convert it to AND
> if not.
> 
> The same can probably be done for ORC.
> 

I suppose we could.  There are plenty of pairs for which this could apply, even
add/sub, though so far we've forced the backend to take care of that.

Perhaps if I have time I'll work something up for tcg_reg_alloc_op and see how
much savings we can achieve across the backends.  That'll tell us if it's worth
bothering with.


r~

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-01-31 14:43 ` [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction Richard Henderson
  2014-02-16 18:12   ` Aurelien Jarno
@ 2014-02-20 16:25   ` Peter Maydell
  2014-02-20 16:42     ` Peter Maydell
  2014-02-20 16:43     ` Richard Henderson
  1 sibling, 2 replies; 21+ messages in thread
From: Peter Maydell @ 2014-02-20 16:25 UTC (permalink / raw)
  To: Richard Henderson; +Cc: QEMU Developers, Aurelien Jarno

On 31 January 2014 14:43, Richard Henderson <rth@twiddle.net> wrote:
> Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
> so we must handle constants in the implementation of andc.

Unfortunately I failed to notice before I applied this,
but this breaks the build for w32:

  LINK  aarch64-softmmu/qemu-system-aarch64.exe
tcg/tcg.o: In function `tcg_target_init':
/home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2263:
undefined reference to `___get_cpuid_max'
/home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2266:
undefined reference to `___cpuid'
/home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2282:
undefined reference to `___cpuid_count'
collect2: ld returned 1 exit status


Can you provide a reasonably quick fix, or should I
just revert commits 9d2eec20 and 6399ab33 for the moment?

thanks
-- PMM

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-02-20 16:25   ` Peter Maydell
@ 2014-02-20 16:42     ` Peter Maydell
  2014-02-20 16:43     ` Richard Henderson
  1 sibling, 0 replies; 21+ messages in thread
From: Peter Maydell @ 2014-02-20 16:42 UTC (permalink / raw)
  To: Richard Henderson; +Cc: QEMU Developers, Aurelien Jarno

On 20 February 2014 16:25, Peter Maydell <peter.maydell@linaro.org> wrote:
> On 31 January 2014 14:43, Richard Henderson <rth@twiddle.net> wrote:
>> Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
>> so we must handle constants in the implementation of andc.
>
> Unfortunately I failed to notice before I applied this,
> but this breaks the build for w32:
>
>   LINK  aarch64-softmmu/qemu-system-aarch64.exe
> tcg/tcg.o: In function `tcg_target_init':
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2263:
> undefined reference to `___get_cpuid_max'
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2266:
> undefined reference to `___cpuid'
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2282:
> undefined reference to `___cpuid_count'
> collect2: ld returned 1 exit status
>
>
> Can you provide a reasonably quick fix, or should I
> just revert commits 9d2eec20 and 6399ab33 for the moment?

Breaks clang builds too (on MacOSX and otherwise).

-- PMM

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-02-20 16:25   ` Peter Maydell
  2014-02-20 16:42     ` Peter Maydell
@ 2014-02-20 16:43     ` Richard Henderson
  2014-02-20 17:38       ` Peter Maydell
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2014-02-20 16:43 UTC (permalink / raw)
  To: Peter Maydell; +Cc: QEMU Developers, Aurelien Jarno

On 02/20/2014 10:25 AM, Peter Maydell wrote:
> On 31 January 2014 14:43, Richard Henderson <rth@twiddle.net> wrote:
>> Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C
>> so we must handle constants in the implementation of andc.
> 
> Unfortunately I failed to notice before I applied this,
> but this breaks the build for w32:
> 
>   LINK  aarch64-softmmu/qemu-system-aarch64.exe
> tcg/tcg.o: In function `tcg_target_init':
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2263:
> undefined reference to `___get_cpuid_max'
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2266:
> undefined reference to `___cpuid'
> /home/petmay01/linaro/qemu-for-merges/tcg/i386/tcg-target.c:2282:
> undefined reference to `___cpuid_count'
> collect2: ld returned 1 exit status
> 
> 
> Can you provide a reasonably quick fix, or should I
> just revert commits 9d2eec20 and 6399ab33 for the moment?

Can you try this?


r~


diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index fef1717..dc52e0d 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -2295,6 +2295,7 @@ static void tcg_target_qemu_prologue(TCGContext *s)

 static void tcg_target_init(TCGContext *s)
 {
+#ifdef CONFIG_CPUID_H
     unsigned a, b, c, d;
     int max = __get_cpuid_max(0, 0);

@@ -2323,6 +2324,7 @@ static void tcg_target_init(TCGContext *s)
         have_bmi2 = (b & bit_BMI2) != 0;
 #endif
     }
+#endif /* CONFIG_CPUID_H */

     if (TCG_TARGET_REG_BITS == 64) {
         tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction
  2014-02-20 16:43     ` Richard Henderson
@ 2014-02-20 17:38       ` Peter Maydell
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Maydell @ 2014-02-20 17:38 UTC (permalink / raw)
  To: Richard Henderson; +Cc: QEMU Developers, Aurelien Jarno

On 20 February 2014 16:43, Richard Henderson <rth@twiddle.net> wrote:
> Can you try this?

[summarising an irc discussion]

That fixes W32 but not MacOSX, because there clang has a
cpuid.h but it doesn't have __get_cpuid_max() or __cpuid().
I'm just testing a patch which enhances our configure test
to check for these and puts ifdefs in the right places.

thanks
-- PMM

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2014-02-20 17:38 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-01-31 14:43 [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
2014-01-31 14:43 ` [Qemu-devel] [PATCH 1/5] disas/i386: Disassemble ANDN/SHLX/SHRX/SHAX Richard Henderson
2014-02-16 18:12   ` Aurelien Jarno
2014-01-31 14:43 ` [Qemu-devel] [PATCH 2/5] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c Richard Henderson
2014-02-16 18:12   ` Aurelien Jarno
2014-01-31 14:43 ` [Qemu-devel] [PATCH 3/5] tcg/i386: Add tcg_out_vex_modrm Richard Henderson
2014-02-16 18:12   ` Aurelien Jarno
2014-01-31 14:43 ` [Qemu-devel] [PATCH 4/5] tcg/i386: Use ANDN instruction Richard Henderson
2014-02-16 18:12   ` Aurelien Jarno
2014-02-17 16:18     ` Richard Henderson
2014-02-20 16:25   ` Peter Maydell
2014-02-20 16:42     ` Peter Maydell
2014-02-20 16:43     ` Richard Henderson
2014-02-20 17:38       ` Peter Maydell
2014-01-31 14:43 ` [Qemu-devel] [PATCH 5/5] tcg/i386: Use SHLX/SHRX/SARX instructions Richard Henderson
2014-02-16 14:21   ` Paolo Bonzini
2014-02-16 17:57     ` Richard Henderson
2014-02-17 16:01     ` Richard Henderson
2014-02-16 18:12   ` Aurelien Jarno
2014-02-14 21:44 ` [Qemu-devel] [PATCH 0/5] tcg/i386 support for bmi Richard Henderson
2014-02-16 14:22   ` Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.