All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend
@ 2020-12-11  1:14 Richard Henderson
  2020-12-11  1:14 ` [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Richard Henderson @ 2020-12-11  1:14 UTC (permalink / raw)
  To: qemu-devel

Eliminating these cleans up the backend a bit, allows the
code generator more freedom to properly place the inputs.


r~


Richard Henderson (2):
  tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP
  tcg: Introduce INDEX_op_qemu_st8_i32

 include/tcg/tcg-opc.h     |   5 ++
 tcg/aarch64/tcg-target.h  |   1 +
 tcg/arm/tcg-target.h      |   1 +
 tcg/i386/tcg-target.h     |   6 +-
 tcg/mips/tcg-target.h     |   1 +
 tcg/ppc/tcg-target.h      |   1 +
 tcg/riscv/tcg-target.h    |   1 +
 tcg/s390/tcg-target.h     |   1 +
 tcg/sparc/tcg-target.h    |   1 +
 tcg/tci/tcg-target.h      |   1 +
 tcg/optimize.c            |   1 +
 tcg/tcg-op.c              |   6 +-
 tcg/tcg.c                 |   4 ++
 tcg/README                |   5 ++
 tcg/i386/tcg-target.c.inc | 138 ++++++++++++++++----------------------
 15 files changed, 91 insertions(+), 82 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP
  2020-12-11  1:14 [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
@ 2020-12-11  1:14 ` Richard Henderson
  2021-01-05 13:42   ` Philippe Mathieu-Daudé
  2020-12-11  1:14 ` [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32 Richard Henderson
  2021-01-05  0:04 ` [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
  2 siblings, 1 reply; 6+ messages in thread
From: Richard Henderson @ 2020-12-11  1:14 UTC (permalink / raw)
  To: qemu-devel

Always true when movbe is available, otherwise leave
this to generic code.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |   3 +-
 tcg/i386/tcg-target.c.inc | 119 ++++++++++++++------------------------
 2 files changed, 47 insertions(+), 75 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index abd4ac7fc0..89700ab6af 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -101,6 +101,7 @@ extern bool have_bmi1;
 extern bool have_popcnt;
 extern bool have_avx1;
 extern bool have_avx2;
+extern bool have_movbe;
 
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
@@ -229,7 +230,7 @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
 
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
 
-#define TCG_TARGET_HAS_MEMORY_BSWAP  1
+#define TCG_TARGET_HAS_MEMORY_BSWAP  have_movbe
 
 #ifdef CONFIG_SOFTMMU
 #define TCG_TARGET_NEED_LDST_LABELS
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index d8797ed398..01588cdcb4 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -154,13 +154,12 @@ bool have_bmi1;
 bool have_popcnt;
 bool have_avx1;
 bool have_avx2;
+bool have_movbe;
 
 #ifdef CONFIG_CPUID_H
-static bool have_movbe;
 static bool have_bmi2;
 static bool have_lzcnt;
 #else
-# define have_movbe 0
 # define have_bmi2 0
 # define have_lzcnt 0
 #endif
@@ -1986,13 +1985,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, int index, intptr_t ofs,
                                    int seg, bool is64, MemOp memop)
 {
-    const MemOp real_bswap = memop & MO_BSWAP;
-    MemOp bswap = real_bswap;
+    bool use_movbe = false;
     int rexw = is64 * P_REXW;
     int movop = OPC_MOVL_GvEv;
 
-    if (have_movbe && real_bswap) {
-        bswap = 0;
+    /* Do big-endian loads with movbe.  */
+    if (memop & MO_BSWAP) {
+        tcg_debug_assert(have_movbe);
+        use_movbe = true;
         movop = OPC_MOVBE_GyMy;
     }
 
@@ -2006,23 +2006,28 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                  base, index, 0, ofs);
         break;
     case MO_UW:
-        tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
-                                 base, index, 0, ofs);
-        if (real_bswap) {
-            tcg_out_rolw_8(s, datalo);
-        }
-        break;
-    case MO_SW:
-        if (real_bswap) {
-            if (have_movbe) {
+        if (use_movbe) {
+            /* There is no extending movbe; only low 16-bits are modified.  */
+            if (datalo != base && datalo != index) {
+                /* XOR breaks dependency chains.  */
+                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                          datalo, base, index, 0, ofs);
             } else {
-                tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
-                                         base, index, 0, ofs);
-                tcg_out_rolw_8(s, datalo);
+                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                         datalo, base, index, 0, ofs);
+                tcg_out_ext16u(s, datalo, datalo);
             }
-            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
+        } else {
+            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
+                                     base, index, 0, ofs);
+        }
+        break;
+    case MO_SW:
+        if (use_movbe) {
+            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
+                                     datalo, base, index, 0, ofs);
+            tcg_out_ext16s(s, datalo, datalo, rexw);
         } else {
             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
                                      datalo, base, index, 0, ofs);
@@ -2030,18 +2035,12 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         break;
     case MO_UL:
         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
-        if (bswap) {
-            tcg_out_bswap32(s, datalo);
-        }
         break;
 #if TCG_TARGET_REG_BITS == 64
     case MO_SL:
-        if (real_bswap) {
-            tcg_out_modrm_sib_offset(s, movop + seg, datalo,
+        if (use_movbe) {
+            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
                                      base, index, 0, ofs);
-            if (bswap) {
-                tcg_out_bswap32(s, datalo);
-            }
             tcg_out_ext32s(s, datalo, datalo);
         } else {
             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
@@ -2053,12 +2052,9 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                      base, index, 0, ofs);
-            if (bswap) {
-                tcg_out_bswap64(s, datalo);
-            }
         } else {
-            if (real_bswap) {
-                int t = datalo;
+            if (use_movbe) {
+                TCGReg t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
@@ -2073,14 +2069,10 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
                                          base, index, 0, ofs);
             }
-            if (bswap) {
-                tcg_out_bswap32(s, datalo);
-                tcg_out_bswap32(s, datahi);
-            }
         }
         break;
     default:
-        tcg_abort();
+        g_assert_not_reached();
     }
 }
 
@@ -2128,24 +2120,27 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, int index, intptr_t ofs,
                                    int seg, MemOp memop)
 {
-    /* ??? Ideally we wouldn't need a scratch register.  For user-only,
-       we could perform the bswap twice to restore the original value
-       instead of moving to the scratch.  But as it is, the L constraint
-       means that TCG_REG_L0 is definitely free here.  */
     const TCGReg scratch = TCG_REG_L0;
-    const MemOp real_bswap = memop & MO_BSWAP;
-    MemOp bswap = real_bswap;
+    bool use_movbe = false;
     int movop = OPC_MOVL_EvGv;
 
-    if (have_movbe && real_bswap) {
-        bswap = 0;
+    /*
+     * Do big-endian stores with movbe or softmmu.
+     * User-only without movbe will have its swapping done generically.
+     */
+    if (memop & MO_BSWAP) {
+        tcg_debug_assert(have_movbe);
+        use_movbe = true;
         movop = OPC_MOVBE_MyGy;
     }
 
     switch (memop & MO_SIZE) {
     case MO_8:
-        /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
-           Use the scratch register if necessary.  */
+        /*
+         * In 32-bit mode, 8-bit stores can only happen from [abcd]x.
+         * TODO: Adjust constraints such that this is is forced,
+         * then we won't need a scratch at all for user-only.
+         */
         if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             datalo = scratch;
@@ -2154,43 +2149,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                  datalo, base, index, 0, ofs);
         break;
     case MO_16:
-        if (bswap) {
-            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
-            tcg_out_rolw_8(s, scratch);
-            datalo = scratch;
-        }
         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
                                  base, index, 0, ofs);
         break;
     case MO_32:
-        if (bswap) {
-            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
-            tcg_out_bswap32(s, scratch);
-            datalo = scratch;
-        }
         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
         break;
     case MO_64:
         if (TCG_TARGET_REG_BITS == 64) {
-            if (bswap) {
-                tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
-                tcg_out_bswap64(s, scratch);
-                datalo = scratch;
-            }
             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
                                      base, index, 0, ofs);
-        } else if (bswap) {
-            tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
-            tcg_out_bswap32(s, scratch);
-            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
-                                     base, index, 0, ofs);
-            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
-            tcg_out_bswap32(s, scratch);
-            tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
-                                     base, index, 0, ofs + 4);
         } else {
-            if (real_bswap) {
-                int t = datalo;
+            if (use_movbe) {
+                TCGReg t = datalo;
                 datalo = datahi;
                 datahi = t;
             }
@@ -2201,7 +2172,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         }
         break;
     default:
-        tcg_abort();
+        g_assert_not_reached();
     }
 }
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32
  2020-12-11  1:14 [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
  2020-12-11  1:14 ` [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
@ 2020-12-11  1:14 ` Richard Henderson
  2021-01-05 13:46   ` Philippe Mathieu-Daudé
  2021-01-05  0:04 ` [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
  2 siblings, 1 reply; 6+ messages in thread
From: Richard Henderson @ 2020-12-11  1:14 UTC (permalink / raw)
  To: qemu-devel

Enable this on i386 to restrict the set of input registers
for an 8-bit store, as required by the architecture.  This
removes the last use of scratch registers for user-only mode.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h     |  5 +++++
 tcg/aarch64/tcg-target.h  |  1 +
 tcg/arm/tcg-target.h      |  1 +
 tcg/i386/tcg-target.h     |  3 +++
 tcg/mips/tcg-target.h     |  1 +
 tcg/ppc/tcg-target.h      |  1 +
 tcg/riscv/tcg-target.h    |  1 +
 tcg/s390/tcg-target.h     |  1 +
 tcg/sparc/tcg-target.h    |  1 +
 tcg/tci/tcg-target.h      |  1 +
 tcg/optimize.c            |  1 +
 tcg/tcg-op.c              |  6 +++++-
 tcg/tcg.c                 |  4 ++++
 tcg/README                |  5 +++++
 tcg/i386/tcg-target.c.inc | 29 ++++++++++++++++++-----------
 15 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 67092e82c6..70a76646c4 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -211,6 +211,11 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
+/* Only used by i386 to cope with stupid register constraints. */
+DEF(qemu_st8_i32, 0, TLADDR_ARGS + 1, 1,
+    TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS |
+    IMPL(TCG_TARGET_HAS_qemu_st8_i32))
+
 /* Host vector support.  */
 
 #define IMPLVEC  TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 663dd0b95e..d1159d80c6 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -88,6 +88,7 @@ typedef enum {
 #define TCG_TARGET_HAS_extrl_i64_i32    0
 #define TCG_TARGET_HAS_extrh_i64_i32    0
 #define TCG_TARGET_HAS_goto_ptr         1
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 17e771374d..1e132afa75 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -126,6 +126,7 @@ extern bool use_idiv_instructions;
 #define TCG_TARGET_HAS_rem_i32          0
 #define TCG_TARGET_HAS_goto_ptr         1
 #define TCG_TARGET_HAS_direct_jump      0
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 enum {
     TCG_AREG0 = TCG_REG_R6,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 89700ab6af..abe8636f0d 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -172,6 +172,9 @@ extern bool have_movbe;
 #define TCG_TARGET_HAS_muls2_i64        1
 #define TCG_TARGET_HAS_muluh_i64        0
 #define TCG_TARGET_HAS_mulsh_i64        0
+#define TCG_TARGET_HAS_qemu_st8_i32     0
+#else
+#define TCG_TARGET_HAS_qemu_st8_i32     1
 #endif
 
 /* We do not support older SSE systems, only beginning with AVX1.  */
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index c6b091d849..b04c12d317 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -169,6 +169,7 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_clz_i32          use_mips32r2_instructions
 #define TCG_TARGET_HAS_ctz_i32          0
 #define TCG_TARGET_HAS_ctpop_i32        0
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_movcond_i64      use_movnz_instructions
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index be10363956..7ff46b3d6d 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -108,6 +108,7 @@ extern bool have_vsx;
 #define TCG_TARGET_HAS_mulsh_i32        1
 #define TCG_TARGET_HAS_goto_ptr         1
 #define TCG_TARGET_HAS_direct_jump      1
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32         0
diff --git a/tcg/riscv/tcg-target.h b/tcg/riscv/tcg-target.h
index 032439d806..ccda7b83c4 100644
--- a/tcg/riscv/tcg-target.h
+++ b/tcg/riscv/tcg-target.h
@@ -119,6 +119,7 @@ typedef enum {
 #define TCG_TARGET_HAS_direct_jump      0
 #define TCG_TARGET_HAS_brcond2          1
 #define TCG_TARGET_HAS_setcond2         1
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_movcond_i64      0
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 63c8797bd3..7e7396caf2 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -97,6 +97,7 @@ extern uint64_t s390_facilities;
 #define TCG_TARGET_HAS_extrh_i64_i32  0
 #define TCG_TARGET_HAS_goto_ptr       1
 #define TCG_TARGET_HAS_direct_jump    (s390_facilities & FACILITY_GEN_INST_EXT)
+#define TCG_TARGET_HAS_qemu_st8_i32   0
 
 #define TCG_TARGET_HAS_div2_i64       1
 #define TCG_TARGET_HAS_rot_i64        1
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 633841ebf2..f2989b3b45 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -126,6 +126,7 @@ extern bool use_vis3_instructions;
 #define TCG_TARGET_HAS_mulsh_i32        0
 #define TCG_TARGET_HAS_goto_ptr         1
 #define TCG_TARGET_HAS_direct_jump      1
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #define TCG_TARGET_HAS_extrl_i64_i32    1
 #define TCG_TARGET_HAS_extrh_i64_i32    1
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 8c1c1d265d..7192d5319e 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -88,6 +88,7 @@
 #define TCG_TARGET_HAS_mulsh_i32        0
 #define TCG_TARGET_HAS_goto_ptr         0
 #define TCG_TARGET_HAS_direct_jump      1
+#define TCG_TARGET_HAS_qemu_st8_i32     0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_extrl_i64_i32    0
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 220f4601d5..7f0eb6a9e7 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1537,6 +1537,7 @@ void tcg_optimize(TCGContext *s)
             case INDEX_op_qemu_ld_i32:
             case INDEX_op_qemu_ld_i64:
             case INDEX_op_qemu_st_i32:
+            case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_st_i64:
             case INDEX_op_call:
                 /* Opcodes that touch guest memory stop the optimization.  */
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 4b8a473fad..af7ce91ffa 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2883,7 +2883,11 @@ void tcg_gen_qemu_st_i32(TCGv_i32 val, TCGv addr, TCGArg idx, MemOp memop)
     }
 
     addr = plugin_prep_mem_callbacks(addr);
-    gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
+    if (TCG_TARGET_HAS_qemu_st8_i32 && (memop & MO_SIZE) == MO_8) {
+        gen_ldst_i32(INDEX_op_qemu_st8_i32, val, addr, memop, idx);
+    } else {
+        gen_ldst_i32(INDEX_op_qemu_st_i32, val, addr, memop, idx);
+    }
     plugin_gen_mem_callbacks(addr, info);
 
     if (swap) {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 43c6cf8f52..829d4296e0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1426,6 +1426,9 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_qemu_st_i64:
         return true;
 
+    case INDEX_op_qemu_st8_i32:
+        return TCG_TARGET_HAS_qemu_st8_i32;
+
     case INDEX_op_goto_ptr:
         return TCG_TARGET_HAS_goto_ptr;
 
@@ -2086,6 +2089,7 @@ static void tcg_dump_ops(TCGContext *s, bool have_prefs)
                 break;
             case INDEX_op_qemu_ld_i32:
             case INDEX_op_qemu_st_i32:
+            case INDEX_op_qemu_st8_i32:
             case INDEX_op_qemu_ld_i64:
             case INDEX_op_qemu_st_i64:
                 {
diff --git a/tcg/README b/tcg/README
index 2f051e5c97..0cf9e2727c 100644
--- a/tcg/README
+++ b/tcg/README
@@ -502,6 +502,7 @@ goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).
 
 * qemu_ld_i32/i64 t0, t1, flags, memidx
 * qemu_st_i32/i64 t0, t1, flags, memidx
+* qemu_st8_i32 t0, t1, flags, memidx
 
 Load data at the guest address t1 into t0, or store data in t0 at guest
 address t1.  The _i32/_i64 size applies to the size of the input/output
@@ -518,6 +519,10 @@ of the memory access.
 For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
 64-bit memory access specified in flags.
 
+For i386, qemu_st8_i32 is exactly like qemu_st_i32, except the size of
+the memory operation is known to be 8-bit.  This allows the backend to
+provide a different set of register constraints.
+
 ********* Host vector operations
 
 All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE.
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 01588cdcb4..f8e9a24e3b 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -245,11 +245,21 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         ct->regs |= ALL_VECTOR_REGS;
         break;
 
-        /* qemu_ld/st address constraint */
     case 'L':
+        /* qemu_ld/st data+address constraint */
         ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
+#ifdef CONFIG_SOFTMMU
         tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
         tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
+#endif
+        break;
+    case 's':
+        /* qemu_st8_i32 data constraint */
+        ct->regs = 0xf;
+#ifdef CONFIG_SOFTMMU
+        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
+        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
+#endif
         break;
 
     case 'e':
@@ -2120,7 +2130,6 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                    TCGReg base, int index, intptr_t ofs,
                                    int seg, MemOp memop)
 {
-    const TCGReg scratch = TCG_REG_L0;
     bool use_movbe = false;
     int movop = OPC_MOVL_EvGv;
 
@@ -2136,15 +2145,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
 
     switch (memop & MO_SIZE) {
     case MO_8:
-        /*
-         * In 32-bit mode, 8-bit stores can only happen from [abcd]x.
-         * TODO: Adjust constraints such that this is is forced,
-         * then we won't need a scratch at all for user-only.
-         */
-        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
-            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
-            datalo = scratch;
-        }
+        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
+        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
                                  datalo, base, index, 0, ofs);
         break;
@@ -2491,6 +2493,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_qemu_ld(s, args, 1);
         break;
     case INDEX_op_qemu_st_i32:
+    case INDEX_op_qemu_st8_i32:
         tcg_out_qemu_st(s, args, 0);
         break;
     case INDEX_op_qemu_st_i64:
@@ -2949,9 +2952,11 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
     static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
     static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
+    static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } };
     static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
     static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
     static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
+    static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } };
     static const TCGTargetOpDef r_r_L_L
         = { .args_ct_str = { "r", "r", "L", "L" } };
     static const TCGTargetOpDef L_L_L_L
@@ -3145,6 +3150,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
     case INDEX_op_qemu_st_i32:
         return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
+    case INDEX_op_qemu_st8_i32:
+        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &s_L : &s_L_L;
     case INDEX_op_qemu_ld_i64:
         return (TCG_TARGET_REG_BITS == 64 ? &r_L
                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend
  2020-12-11  1:14 [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
  2020-12-11  1:14 ` [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
  2020-12-11  1:14 ` [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32 Richard Henderson
@ 2021-01-05  0:04 ` Richard Henderson
  2 siblings, 0 replies; 6+ messages in thread
From: Richard Henderson @ 2021-01-05  0:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Alex Bennée, Philippe Mathieu-Daudé

Ping?

On 12/10/20 3:14 PM, Richard Henderson wrote:
> Eliminating these cleans up the backend a bit, allows the
> code generator more freedom to properly place the inputs.
> 
> 
> r~
> 
> 
> Richard Henderson (2):
>   tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP
>   tcg: Introduce INDEX_op_qemu_st8_i32
> 
>  include/tcg/tcg-opc.h     |   5 ++
>  tcg/aarch64/tcg-target.h  |   1 +
>  tcg/arm/tcg-target.h      |   1 +
>  tcg/i386/tcg-target.h     |   6 +-
>  tcg/mips/tcg-target.h     |   1 +
>  tcg/ppc/tcg-target.h      |   1 +
>  tcg/riscv/tcg-target.h    |   1 +
>  tcg/s390/tcg-target.h     |   1 +
>  tcg/sparc/tcg-target.h    |   1 +
>  tcg/tci/tcg-target.h      |   1 +
>  tcg/optimize.c            |   1 +
>  tcg/tcg-op.c              |   6 +-
>  tcg/tcg.c                 |   4 ++
>  tcg/README                |   5 ++
>  tcg/i386/tcg-target.c.inc | 138 ++++++++++++++++----------------------
>  15 files changed, 91 insertions(+), 82 deletions(-)
> 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP
  2020-12-11  1:14 ` [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
@ 2021-01-05 13:42   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 6+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-01-05 13:42 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/11/20 2:14 AM, Richard Henderson wrote:
> Always true when movbe is available, otherwise leave
> this to generic code.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.h     |   3 +-
>  tcg/i386/tcg-target.c.inc | 119 ++++++++++++++------------------------
>  2 files changed, 47 insertions(+), 75 deletions(-)

LGTM but I'd feel safer with some x86 developer reviewing it.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32
  2020-12-11  1:14 ` [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32 Richard Henderson
@ 2021-01-05 13:46   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 6+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-01-05 13:46 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/11/20 2:14 AM, Richard Henderson wrote:
> Enable this on i386 to restrict the set of input registers
> for an 8-bit store, as required by the architecture.  This
> removes the last use of scratch registers for user-only mode.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/tcg/tcg-opc.h     |  5 +++++
>  tcg/aarch64/tcg-target.h  |  1 +
>  tcg/arm/tcg-target.h      |  1 +
>  tcg/i386/tcg-target.h     |  3 +++
>  tcg/mips/tcg-target.h     |  1 +
>  tcg/ppc/tcg-target.h      |  1 +
>  tcg/riscv/tcg-target.h    |  1 +
>  tcg/s390/tcg-target.h     |  1 +
>  tcg/sparc/tcg-target.h    |  1 +
>  tcg/tci/tcg-target.h      |  1 +
>  tcg/optimize.c            |  1 +
>  tcg/tcg-op.c              |  6 +++++-
>  tcg/tcg.c                 |  4 ++++
>  tcg/README                |  5 +++++
>  tcg/i386/tcg-target.c.inc | 29 ++++++++++++++++++-----------
>  15 files changed, 49 insertions(+), 12 deletions(-)
...

> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 01588cdcb4..f8e9a24e3b 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -245,11 +245,21 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
>          ct->regs |= ALL_VECTOR_REGS;
>          break;
>  
> -        /* qemu_ld/st address constraint */
>      case 'L':
> +        /* qemu_ld/st data+address constraint */
>          ct->regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
> +#ifdef CONFIG_SOFTMMU
>          tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
>          tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
> +#endif
> +        break;
> +    case 's':
> +        /* qemu_st8_i32 data constraint */
> +        ct->regs = 0xf;
> +#ifdef CONFIG_SOFTMMU
> +        tcg_regset_reset_reg(ct->regs, TCG_REG_L0);
> +        tcg_regset_reset_reg(ct->regs, TCG_REG_L1);
> +#endif
>          break;
>  
>      case 'e':
> @@ -2120,7 +2130,6 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>                                     TCGReg base, int index, intptr_t ofs,
>                                     int seg, MemOp memop)
>  {
> -    const TCGReg scratch = TCG_REG_L0;
>      bool use_movbe = false;
>      int movop = OPC_MOVL_EvGv;
>  
> @@ -2136,15 +2145,8 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>  
>      switch (memop & MO_SIZE) {
>      case MO_8:
> -        /*
> -         * In 32-bit mode, 8-bit stores can only happen from [abcd]x.
> -         * TODO: Adjust constraints such that this is is forced,
> -         * then we won't need a scratch at all for user-only.
> -         */
> -        if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
> -            tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
> -            datalo = scratch;
> -        }
> +        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
> +        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
>          tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
>                                   datalo, base, index, 0, ofs);
>          break;
> @@ -2491,6 +2493,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          tcg_out_qemu_ld(s, args, 1);
>          break;
>      case INDEX_op_qemu_st_i32:
> +    case INDEX_op_qemu_st8_i32:
>          tcg_out_qemu_st(s, args, 0);
>          break;
>      case INDEX_op_qemu_st_i64:
> @@ -2949,9 +2952,11 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
>      static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
>      static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
> +    static const TCGTargetOpDef s_L = { .args_ct_str = { "s", "L" } };
>      static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
>      static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
>      static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
> +    static const TCGTargetOpDef s_L_L = { .args_ct_str = { "s", "L", "L" } };
>      static const TCGTargetOpDef r_r_L_L
>          = { .args_ct_str = { "r", "r", "L", "L" } };
>      static const TCGTargetOpDef L_L_L_L
> @@ -3145,6 +3150,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>          return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
>      case INDEX_op_qemu_st_i32:
>          return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
> +    case INDEX_op_qemu_st8_i32:
> +        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &s_L : &s_L_L;
>      case INDEX_op_qemu_ld_i64:
>          return (TCG_TARGET_REG_BITS == 64 ? &r_L
>                  : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
> 

Again, another review from a x86 developer welcomed.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-01-05 13:48 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-11  1:14 [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson
2020-12-11  1:14 ` [PATCH 1/2] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
2021-01-05 13:42   ` Philippe Mathieu-Daudé
2020-12-11  1:14 ` [PATCH 2/2] tcg: Introduce INDEX_op_qemu_st8_i32 Richard Henderson
2021-01-05 13:46   ` Philippe Mathieu-Daudé
2021-01-05  0:04 ` [PATCH 0/2] tcg: Eliminate scratch regs from i386 backend Richard Henderson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.