All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements
@ 2017-08-04  6:23 Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

We can implement extract and deposit fairly efficiently with
the bmi2 pext and pdep instructions.

I did wonder about using bextr and tcg_reg_alloc_new to
allocate the register required to hold the parameters.  But
pdep allows the mask to be stored in the constant pool, and
there's the chance that the mask could be re-used, either
by another extract or by a deposit.

I have not done any profiling on this yet.


r~


Richard Henderson (6):
  tcg: Add tcg_reg_alloc_new
  disas/i386: Fix disassembly of two-byte vex prefixes
  disas/i386: Add disassembly of vex.0f38.f5
  disas/i386: Add disassembly of rorx
  tcg/i386: Use pext for extract
  tcg/i386: Use pdep for deposit

 tcg/i386/tcg-target.h     |  10 ++-
 tcg/tcg.h                 |   1 +
 disas/i386.c              |  29 ++++--
 tcg/i386/tcg-target.inc.c | 223 +++++++++++++++++++++++++++++++++++-----------
 tcg/tcg.c                 |  58 +++++++++---
 5 files changed, 246 insertions(+), 75 deletions(-)

-- 
2.13.3

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes Richard Henderson
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

This allows the backend to allocate an otherwise unused register,
which lets it avoid having to reserve a full-time temporary
register.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/tcg.h |  1 +
 tcg/tcg.c | 58 +++++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index ac94133870..dd97095af5 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -655,6 +655,7 @@ struct TCGContext {
     uintptr_t *tb_jmp_insn_offset; /* tb->jmp_target_arg if direct_jump */
     uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_arg if !direct_jump */
 
+    TCGRegSet regs_in_use;
     TCGRegSet reserved_regs;
     intptr_t current_frame_offset;
     intptr_t frame_start;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index fd8a3dfe93..787c8ba0f7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -112,6 +112,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
 static void tcg_out_call(TCGContext *s, tcg_insn_unit *target);
 static int tcg_target_const_match(tcg_target_long val, TCGType type,
                                   const TCGArgConstraint *arg_ct);
+static TCGReg tcg_reg_alloc_new(TCGContext *s, TCGType t)
+    __attribute__((unused));
 #ifdef TCG_TARGET_NEED_LDST_LABELS
 static bool tcg_out_ldst_finalize(TCGContext *s);
 #endif
@@ -1947,16 +1949,19 @@ static void temp_sync(TCGContext *s, TCGTemp *ts,
             /* If we're going to free the temp immediately, then we won't
                require it later in a register, so attempt to store the
                constant to memory directly.  */
-            if (free_or_dead
-                && tcg_out_sti(s, ts->type, ts->val,
-                               ts->mem_base->reg, ts->mem_offset)) {
-                break;
+            if (free_or_dead) {
+                s->regs_in_use = -1;
+                if (tcg_out_sti(s, ts->type, ts->val,
+                                ts->mem_base->reg, ts->mem_offset)) {
+                    break;
+                }
             }
             temp_load(s, ts, tcg_target_available_regs[ts->type],
                       allocated_regs);
             /* fallthrough */
 
         case TEMP_VAL_REG:
+            s->regs_in_use = -1;
             tcg_out_st(s, ts->type, ts->reg,
                        ts->mem_base->reg, ts->mem_offset);
             break;
@@ -2015,6 +2020,14 @@ static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet desired_regs,
     tcg_abort();
 }
 
+static TCGReg tcg_reg_alloc_new(TCGContext *s, TCGType t)
+{
+    TCGReg r;
+    r = tcg_reg_alloc(s, tcg_target_available_regs[t], s->regs_in_use, 0);
+    tcg_regset_set_reg(s->regs_in_use, r);
+    return r;
+}
+
 /* Make sure the temporary is in a register.  If needed, allocate the register
    from DESIRED while avoiding ALLOCATED.  */
 static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
@@ -2027,11 +2040,13 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
         return;
     case TEMP_VAL_CONST:
         reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+        s->regs_in_use = allocated_regs;
         tcg_out_movi(s, ts->type, reg, ts->val);
         ts->mem_coherent = 0;
         break;
     case TEMP_VAL_MEM:
         reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+        s->regs_in_use = -1;
         tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset);
         ts->mem_coherent = 1;
         break;
@@ -2105,6 +2120,7 @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
 {
     if (ots->fixed_reg) {
         /* For fixed registers, we do not do any constant propagation.  */
+        s->regs_in_use = s->reserved_regs;
         tcg_out_movi(s, ots->type, ots->reg, val);
         return;
     }
@@ -2129,17 +2145,16 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
     TCGTemp *ots = &s->temps[args[0]];
     tcg_target_ulong val = args[1];
 
+    s->regs_in_use = s->reserved_regs;
     tcg_reg_alloc_do_movi(s, ots, val, arg_life);
 }
 
 static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
                               const TCGArg *args, TCGLifeData arg_life)
 {
-    TCGRegSet allocated_regs;
     TCGTemp *ts, *ots;
     TCGType otype, itype;
 
-    tcg_regset_set(allocated_regs, s->reserved_regs);
     ots = &s->temps[args[0]];
     ts = &s->temps[args[1]];
 
@@ -2153,6 +2168,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
         if (IS_DEAD_ARG(1)) {
             temp_dead(s, ts);
         }
+        s->regs_in_use = s->reserved_regs;
         tcg_reg_alloc_do_movi(s, ots, val, arg_life);
         return;
     }
@@ -2162,7 +2178,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
        the SOURCE value into its own register first, that way we
        don't have to reload SOURCE the next time it is used. */
     if (ts->val_type == TEMP_VAL_MEM) {
-        temp_load(s, ts, tcg_target_available_regs[itype], allocated_regs);
+        temp_load(s, ts, tcg_target_available_regs[itype], s->reserved_regs);
     }
 
     tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
@@ -2173,12 +2189,14 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
         if (!ots->mem_allocated) {
             temp_allocate_frame(s, args[0]);
         }
+        s->regs_in_use = -1;
         tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset);
         if (IS_DEAD_ARG(1)) {
             temp_dead(s, ts);
         }
         temp_dead(s, ots);
     } else {
+        TCGRegSet allocated_regs;
         if (IS_DEAD_ARG(1) && !ts->fixed_reg && !ots->fixed_reg) {
             /* the mov can be suppressed */
             if (ots->val_type == TEMP_VAL_REG) {
@@ -2188,19 +2206,21 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
             temp_dead(s, ts);
         } else {
             if (ots->val_type != TEMP_VAL_REG) {
-                /* When allocating a new register, make sure to not spill the
-                   input one. */
+                /* When allocating a new register, make sure to not
+                   spill the input one. */
+                allocated_regs = s->reserved_regs;
                 tcg_regset_set_reg(allocated_regs, ts->reg);
                 ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
                                          allocated_regs, ots->indirect_base);
             }
+            s->regs_in_use = -1;
             tcg_out_mov(s, otype, ots->reg, ts->reg);
         }
         ots->val_type = TEMP_VAL_REG;
         ots->mem_coherent = 0;
         s->reg_to_temp[ots->reg] = ots;
         if (NEED_SYNC_ARG(0)) {
-            temp_sync(s, ots, allocated_regs, 0);
+            temp_sync(s, ots, s->reserved_regs, 0);
         }
     }
 }
@@ -2281,6 +2301,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
                and move the temporary register into it */
             reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
                                 ts->indirect_base);
+            s->regs_in_use = -1;
             tcg_out_mov(s, ts->type, reg, ts->reg);
         }
         new_args[i] = reg;
@@ -2355,6 +2376,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
     }
 
     /* emit instruction */
+    s->regs_in_use = i_allocated_regs | o_allocated_regs;
     tcg_out_op(s, opc, new_args, const_args);
     
     /* move the outputs in the correct register if needed */
@@ -2362,6 +2384,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
         ts = &s->temps[args[i]];
         reg = new_args[i];
         if (ts->fixed_reg && ts->reg != reg) {
+            s->regs_in_use = -1;
             tcg_out_mov(s, ts->type, ts->reg, reg);
         }
         if (NEED_SYNC_ARG(i)) {
@@ -2420,6 +2443,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
             ts = &s->temps[arg];
             temp_load(s, ts, tcg_target_available_regs[ts->type],
                       s->reserved_regs);
+            s->regs_in_use = -1;
             tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset);
         }
 #ifndef TCG_TARGET_STACK_GROWSUP
@@ -2428,7 +2452,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
     }
     
     /* assign input registers */
-    tcg_regset_set(allocated_regs, s->reserved_regs);
+    allocated_regs = s->reserved_regs;
     for(i = 0; i < nb_regs; i++) {
         arg = args[nb_oargs + i];
         if (arg != TCG_CALL_DUMMY_ARG) {
@@ -2438,6 +2462,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
 
             if (ts->val_type == TEMP_VAL_REG) {
                 if (ts->reg != reg) {
+                    s->regs_in_use = -1;
                     tcg_out_mov(s, ts->type, reg, ts->reg);
                 }
             } else {
@@ -2458,7 +2483,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
             temp_dead(s, &s->temps[args[i]]);
         }
     }
-    
+
     /* clobber call registers */
     for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
         if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
@@ -2476,10 +2501,16 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
         save_globals(s, allocated_regs);
     }
 
+    s->regs_in_use = allocated_regs;
     tcg_out_call(s, func_addr);
 
     /* assign output registers and emit moves if needed */
-    for(i = 0; i < nb_oargs; i++) {
+    allocated_regs = s->reserved_regs;
+    for (i = 0; i < nb_oargs; i++) {
+        reg = tcg_target_call_oarg_regs[i];
+        tcg_regset_set_reg(allocated_regs, reg);
+    }
+    for (i = 0; i < nb_oargs; i++) {
         arg = args[i];
         ts = &s->temps[arg];
         reg = tcg_target_call_oarg_regs[i];
@@ -2487,6 +2518,7 @@ static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
 
         if (ts->fixed_reg) {
             if (ts->reg != reg) {
+                s->regs_in_use = -1;
                 tcg_out_mov(s, ts->type, ts->reg, reg);
             }
         } else {
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5 Richard Henderson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/disas/i386.c b/disas/i386.c
index f1e376ca4a..7a238b203b 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -3559,6 +3559,7 @@ ckvexprefix (void)
     } else {
         /* Two byte VEX prefix.  */
         newrex |= (vex2 & 0x80 ? 0 : REX_R);
+        newpfx |= PREFIX_VEX_0F;
         codep += 2;
     }
 
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx Richard Henderson
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

This opcode group includes pext, pdep and bzhi.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/disas/i386.c b/disas/i386.c
index 7a238b203b..7eaa378a10 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -683,6 +683,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 #define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } }
 #define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
 #define PREGRP107 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 107 } }
+#define PREGRP108 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 108 } }
 
 #define X86_64_0  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
 #define X86_64_1  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1484,7 +1485,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -1508,7 +1509,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -2808,6 +2809,14 @@ static const struct dis386 prefix_user_table[][4] = {
     { "bsfS",	{ Gv, Ev } },
     { "(bad)",	{ XX } },
   },
+
+  /* PREGRP108 */
+  {
+    { "bzhi",   { Gv, Ev, Bv } },
+    { "pext",   { Gv, Bv, Ev } },
+    { "(bad)",  { XX } },
+    { "pdep",   { Gv, Bv, Ev } },
+  },
 };
 
 static const struct dis386 x86_64_table[][2] = {
@@ -3108,7 +3117,7 @@ static const struct dis386 three_byte_table[][256] = {
     { PREGRP105 },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
-    { "(bad)", { XX } },
+    { PREGRP108 },
     { "(bad)", { XX } },
     { PREGRP106 },
     /* f8 */
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
                   ` (2 preceding siblings ...)
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5 Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 disas/i386.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/disas/i386.c b/disas/i386.c
index 7eaa378a10..a557e678ec 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -684,6 +684,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr)
 #define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } }
 #define PREGRP107 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 107 } }
 #define PREGRP108 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 108 } }
+#define PREGRP109 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 109 } }
 
 #define X86_64_0  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } }
 #define X86_64_1  NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } }
@@ -1557,7 +1558,7 @@ static const unsigned char threebyte_0x3a_uses_REPNZ_prefix[256] = {
   /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */
   /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */
   /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */
-  /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
+  /* f0 */ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */
   /*       -------------------------------        */
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
@@ -2817,6 +2818,14 @@ static const struct dis386 prefix_user_table[][4] = {
     { "(bad)",  { XX } },
     { "pdep",   { Gv, Bv, Ev } },
   },
+
+  /* PREGRP109 */
+  {
+    { "(bad)",  { XX } },
+    { "(bad)",  { XX } },
+    { "(bad)",  { XX } },
+    { "rorx",   { Gv, Ev, Ib } },
+  },
 };
 
 static const struct dis386 x86_64_table[][2] = {
@@ -3403,7 +3412,7 @@ static const struct dis386 three_byte_table[][256] = {
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     /* f0 */
-    { "(bad)", { XX } },
+    { PREGRP109 },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
     { "(bad)", { XX } },
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
                   ` (3 preceding siblings ...)
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.h     |   6 +-
 tcg/i386/tcg-target.inc.c | 147 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 109 insertions(+), 44 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..85b0ccd98c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -76,6 +76,7 @@ typedef enum {
 #endif
 
 extern bool have_bmi1;
+extern bool have_bmi2;
 extern bool have_popcnt;
 
 /* optional instructions */
@@ -153,9 +154,10 @@ extern bool have_popcnt;
 
 /* Check for the possibility of high-byte extraction and, for 64-bit,
    zero-extending 32-bit right-shift.  */
-#define TCG_TARGET_extract_i32_valid(ofs, len) ((ofs) == 8 && (len) == 8)
+#define TCG_TARGET_extract_i32_valid(ofs, len) \
+    (have_bmi2 || ((ofs) == 8 && (len) == 8))
 #define TCG_TARGET_extract_i64_valid(ofs, len) \
-    (((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
+    (have_bmi2 || ((ofs) == 8 && (len) == 8) || ((ofs) + (len)) == 32)
 
 #if TCG_TARGET_REG_BITS == 64
 # define TCG_AREG0 TCG_REG_R14
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5231056fd3..69587c82de 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -124,11 +124,11 @@ static bool have_cmov;
 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
    it there.  Therefore we always define the variable.  */
 bool have_bmi1;
+bool have_bmi2;
 bool have_popcnt;
 
 #ifdef CONFIG_CPUID_H
 static bool have_movbe;
-static bool have_bmi2;
 static bool have_lzcnt;
 #else
 # define have_movbe 0
@@ -275,13 +275,14 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 
 #define P_EXT		0x100		/* 0x0f opcode prefix */
 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
-#define P_DATA16        0x400           /* 0x66 opcode prefix */
+#define P_EXT3A         0x400           /* 0x0f 0x3a opcode prefix */
+#define P_DATA16        0x800           /* 0x66 opcode prefix */
 #if TCG_TARGET_REG_BITS == 64
-# define P_ADDR32       0x800           /* 0x67 opcode prefix */
-# define P_REXW         0x1000          /* Set REX.W = 1 */
-# define P_REXB_R       0x2000          /* REG field as byte register */
-# define P_REXB_RM      0x4000          /* R/M field as byte register */
-# define P_GS           0x8000          /* gs segment override */
+# define P_ADDR32       0x1000          /* 0x67 opcode prefix */
+# define P_REXW         0x2000          /* Set REX.W = 1 */
+# define P_REXB_R       0x4000          /* REG field as byte register */
+# define P_REXB_RM      0x8000          /* R/M field as byte register */
+# define P_GS           0x10000         /* gs segment override */
 #else
 # define P_ADDR32	0
 # define P_REXW		0
@@ -289,14 +290,15 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 # define P_REXB_RM	0
 # define P_GS           0
 #endif
-#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
-#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
+#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
+#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
 #define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BEXTR       (0xf7 | P_EXT38)
 #define OPC_BSF         (0xbc | P_EXT)
 #define OPC_BSR         (0xbd | P_EXT)
 #define OPC_BSWAP	(0xc8 | P_EXT)
@@ -327,12 +329,14 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_MOVSLQ	(0x63 | P_REXW)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
+#define OPC_PEXT        (0xf5 | P_EXT38 | P_SIMDF3)
 #define OPC_POP_r32	(0x58)
 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 #define OPC_PUSH_r32	(0x50)
 #define OPC_PUSH_Iv	(0x68)
 #define OPC_PUSH_Ib	(0x6a)
 #define OPC_RET		(0xc3)
+#define OPC_RORX        (0xf0 | P_EXT3A | P_SIMDF2)
 #define OPC_SETCC	(0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
@@ -455,6 +459,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
         tcg_out8(s, 0x0f);
         if (opc & P_EXT38) {
             tcg_out8(s, 0x38);
+        } else if (opc & P_EXT3A) {
+            tcg_out8(s, 0x3a);
         }
     }
 
@@ -475,6 +481,8 @@ static void tcg_out_opc(TCGContext *s, int opc)
         tcg_out8(s, 0x0f);
         if (opc & P_EXT38) {
             tcg_out8(s, 0x38);
+        } else if (opc & P_EXT3A) {
+            tcg_out8(s, 0x3a);
         }
     }
     tcg_out8(s, opc);
@@ -491,34 +499,29 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_pfx_opc(TCGContext *s, int opc, int r, int v, int rm)
 {
     int tmp;
 
-    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
-        /* Three byte VEX prefix.  */
-        tcg_out8(s, 0xc4);
-
-        /* VEX.m-mmmm */
-        if (opc & P_EXT38) {
-            tmp = 2;
-        } else if (opc & P_EXT) {
-            tmp = 1;
-        } else {
-            tcg_abort();
-        }
-        tmp |= 0x40;                       /* VEX.X */
-        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
-        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
-        tcg_out8(s, tmp);
+    /* Three byte VEX prefix.  */
+    tcg_out8(s, 0xc4);
 
-        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
+    /* VEX.m-mmmm */
+    if (opc & P_EXT3A) {
+        tmp = 3;
+    } else if (opc & P_EXT38) {
+        tmp = 2;
+    } else if (opc & P_EXT) {
+        tmp = 1;
     } else {
-        /* Two byte VEX prefix.  */
-        tcg_out8(s, 0xc5);
-
-        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
+        tcg_abort();
     }
+    tmp |= 0x40;                           /* VEX.X */
+    tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
+    tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
+    tcg_out8(s, tmp);
+
+    tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
     /* VEX.pp */
     if (opc & P_DATA16) {
         tmp |= 1;                          /* 0x66 */
@@ -530,9 +533,43 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
     tcg_out8(s, tmp);
     tcg_out8(s, opc);
+}
+
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+    tcg_out_vex_pfx_opc(s, opc, r, v, rm);
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
+static void tcg_out_sfx_pool_imm(TCGContext *s, int r, tcg_target_ulong data)
+{
+    /* modrm for 64-bit rip-relative, or 32-bit absolute addressing.  */
+    tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
+
+    if (TCG_TARGET_REG_BITS == 64) {
+        new_pool_label(s, data, R_386_PC32, s->code_ptr, -4);
+    } else {
+        new_pool_label(s, data, R_386_32, s->code_ptr, 0);
+    }
+    tcg_out32(s, 0);
+}
+
+#if 0
+static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r,
+                                 tcg_target_ulong data)
+{
+    tcg_out_opc(s, opc, r, 0, 0);
+    tcg_out_sfx_pool_imm(s, r, data);
+}
+#endif
+
+static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v,
+                                 tcg_target_ulong data)
+{
+    tcg_out_vex_pfx_opc(s, opc, r, v, 0);
+    tcg_out_sfx_pool_imm(s, r, data);
+}
+
 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    We handle either RM and INDEX missing with a negative value.  In 64-bit
    mode for absolute addresses, ~RM is the size of the immediate operand
@@ -877,6 +914,13 @@ static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
     }
 }
 
+static void tcg_out_rorx(TCGContext *s, int rexw,
+                         TCGReg dst, TCGReg src, int c)
+{
+    tcg_out_vex_modrm(s, OPC_RORX + rexw, dst, 0, src);
+    tcg_out8(s, c);
+}
+
 /* Use SMALL != 0 to force a short forward branch.  */
 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
 {
@@ -1858,7 +1902,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    TCGArg a0, a1, a2;
+    TCGArg a0, a1, a2, a3;
     int c, const_a2, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
@@ -2244,12 +2288,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         /* On the off-chance that we can use the high-byte registers.
            Otherwise we emit the same ext16 + shift pattern that we
            would have gotten from the normal tcg-op.c expansion.  */
-        tcg_debug_assert(a2 == 8 && args[3] == 8);
-        if (a1 < 4 && a0 < 8) {
-            tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
+        a3 = args[3];
+        if (a2 == 8 && a3 == 8) {
+            if (a1 < 4 && a0 < 8) {
+                tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
+            } else {
+                tcg_out_ext16u(s, a0, a1);
+                tcg_out_shifti(s, SHIFT_SHR, a0, 8);
+            }
         } else {
-            tcg_out_ext16u(s, a0, a1);
-            tcg_out_shifti(s, SHIFT_SHR, a0, 8);
+            tcg_debug_assert(have_bmi2);
+            tcg_out_vex_pool_imm(s, OPC_PEXT + (a2 + a3 > 32) * P_REXW,
+                                 a0, a1, deposit64(0, a2, a3, -1));
         }
         break;
 
@@ -2257,12 +2307,25 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         /* We don't implement sextract_i64, as we cannot sign-extend to
            64-bits without using the REX prefix that explicitly excludes
            access to the high-byte registers.  */
-        tcg_debug_assert(a2 == 8 && args[3] == 8);
-        if (a1 < 4 && a0 < 8) {
-            tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
+        a3 = args[3];
+        if (a2 == 8 && a3 == 8) {
+            if (a1 < 4 && a0 < 8) {
+                tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
+            } else {
+                tcg_out_ext16s(s, a0, a1, 0);
+                tcg_out_shifti(s, SHIFT_SAR, a0, 8);
+            }
         } else {
-            tcg_out_ext16s(s, a0, a1, 0);
-            tcg_out_shifti(s, SHIFT_SAR, a0, 8);
+            /* ??? We only have one extract_i32_valid macro.  But as it
+               happens we can perform a useful 3-operand shift.  */
+            tcg_debug_assert(have_bmi2);
+            if (a2 + a3 < 32) {
+                /* Rotate the field in A1 to the MSB of A0.  */
+                tcg_out_rorx(s, 0, a0, a1, a2 + a3);
+            } else {
+                tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
+            }
+            tcg_out_shifti(s, SHIFT_SAR, a0, 32 - a3);
         }
         break;
 
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit
  2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
                   ` (4 preceding siblings ...)
  2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract Richard Henderson
@ 2017-08-04  6:23 ` Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2017-08-04  6:23 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.h     |  4 ++-
 tcg/i386/tcg-target.inc.c | 82 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 85b0ccd98c..e512648c95 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -148,7 +148,9 @@ extern bool have_popcnt;
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
-    (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
+    (have_bmi2 ||                              \
+     ((ofs) == 0 && (len) == 8) ||             \
+     ((ofs) == 8 && (len) == 8) ||             \
      ((ofs) == 0 && (len) == 16))
 #define TCG_TARGET_deposit_i64_valid    TCG_TARGET_deposit_i32_valid
 
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 69587c82de..aeefb72aa0 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -329,6 +329,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_MOVSLQ	(0x63 | P_REXW)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
+#define OPC_PDEP        (0xf5 | P_EXT38 | P_SIMDF2)
 #define OPC_PEXT        (0xf5 | P_EXT38 | P_SIMDF3)
 #define OPC_POP_r32	(0x58)
 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
@@ -554,14 +555,12 @@ static void tcg_out_sfx_pool_imm(TCGContext *s, int r, tcg_target_ulong data)
     tcg_out32(s, 0);
 }
 
-#if 0
 static void tcg_out_opc_pool_imm(TCGContext *s, int opc, int r,
                                  tcg_target_ulong data)
 {
     tcg_out_opc(s, opc, r, 0, 0);
     tcg_out_sfx_pool_imm(s, r, data);
 }
-#endif
 
 static void tcg_out_vex_pool_imm(TCGContext *s, int opc, int r, int v,
                                  tcg_target_ulong data)
@@ -1902,7 +1901,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
-    TCGArg a0, a1, a2, a3;
+    TCGArg a0, a1, a2, a3, a4;
     int c, const_a2, vexop, rexw = 0;
 
 #if TCG_TARGET_REG_BITS == 64
@@ -2262,17 +2261,68 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 #endif
 
     OP_32_64(deposit):
-        if (args[3] == 0 && args[4] == 8) {
-            /* load bits 0..7 */
-            tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
-        } else if (args[3] == 8 && args[4] == 8) {
-            /* load bits 8..15 */
-            tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
-        } else if (args[3] == 0 && args[4] == 16) {
-            /* load bits 0..15 */
-            tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
-        } else {
-            tcg_abort();
+        a3 = args[3];
+        a4 = args[4];
+        {
+            tcg_target_ulong mask = deposit64(0, a3, a4, -1);
+
+            if (const_args[1]) {
+                tcg_debug_assert(have_bmi2);
+                if (a3 == 0 && a0 == a2) {
+                    if (a4 <= 32) {
+                        tgen_arithi(s, ARITH_AND, a0, mask, 0);
+                    } else {
+                        tcg_out_opc_pool_imm(s, OPC_ARITH_GvEv + P_REXW
+                                             + ARITH_AND * 8, a0, mask);
+                    }
+                } else {
+                    tcg_out_vex_pool_imm(s, OPC_PDEP
+                                         + (a3 + a4 > 32) * P_REXW,
+                                         a0, a2, mask);
+                }
+                a1 &= ~mask;
+                if (a1 != 0) {
+                    if (!rexw || a1 == (int)a1) {
+                        tgen_arithi(s, ARITH_OR + rexw, a0, a1, 0);
+                    } else {
+                        tcg_out_opc_pool_imm(s, OPC_ARITH_GvEv + P_REXW
+                                             + ARITH_OR * 8, a0, a1);
+                    }
+                }
+            } else if (a0 == a1 && a3 == 0 && a4 == 8) {
+                /* load bits 0..7 */
+                tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
+            } else if (a0 == a1 && a3 == 8 && a4 == 8 && a0 < 4 && a2 < 8) {
+                /* load bits 8..15 */
+                tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
+            } else if (a0 == a1 && a3 == 0 && a4 == 16) {
+                /* load bits 0..15 */
+                tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
+            } else {
+                TCGType type = rexw ? TCG_TYPE_I64 : TCG_TYPE_I32;
+                TCGReg t1 = tcg_reg_alloc_new(s, type);
+                TCGReg t2 = t1;
+
+                tcg_debug_assert(have_bmi2);
+                tcg_out_movi(s, type, t1, mask);
+                if (a0 == a2) {
+                    t2 = tcg_reg_alloc_new(s, type);
+                    tcg_out_vex_modrm(s, OPC_ANDN + rexw, t2, t1, a1);
+                    if (a3 == 0) {
+                        tgen_arithr(s, ARITH_AND + rexw, a0, t1);
+                    } else {
+                        tcg_out_vex_modrm(s, OPC_PDEP + rexw, a0, a2, t1);
+                    }
+                } else {
+                    tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, t1, a1);
+                    if (a3 == 0) {
+                        tgen_arithr(s, ARITH_AND + rexw, t1, a2);
+                    } else {
+                        tcg_out_vex_modrm(s, OPC_PDEP + rexw, t1, a2, t1);
+                    }
+                }
+                tgen_arithr(s, ARITH_OR + rexw, a0, t2);
+            }
         }
         break;
 
@@ -2480,7 +2530,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         {
             static const TCGTargetOpDef dep
                 = { .args_ct_str = { "Q", "0", "Q" } };
-            return &dep;
+            static const TCGTargetOpDef pdep
+                = { .args_ct_str = { "r", "ri", "r" } };
+            return have_bmi2 ? &pdep : &dep;
         }
     case INDEX_op_setcond_i32:
     case INDEX_op_setcond_i64:
-- 
2.13.3

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-08-04  6:23 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-04  6:23 [Qemu-devel] [PATCH for-2.11 0/6] tcg/i386 haswell improvements Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 1/6] tcg: Add tcg_reg_alloc_new Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 2/6] disas/i386: Fix disassembly of two-byte vex prefixes Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 3/6] disas/i386: Add disassembly of vex.0f38.f5 Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 4/6] disas/i386: Add disassembly of rorx Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 5/6] tcg/i386: Use pext for extract Richard Henderson
2017-08-04  6:23 ` [Qemu-devel] [PATCH for-2.11 6/6] tcg/i386: Use pdep for deposit Richard Henderson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.