All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
@ 2018-11-28  5:38 Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 01/12] tcg: Add preferred_reg argument to tcg_reg_alloc Richard Henderson
                   ` (13 more replies)
  0 siblings, 14 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

The intent here is to remove several of the move insns that put
function arguments into their proper places.  I'm hoping that
this will solve the Skylake regression with spec2006, as
seen with the ool softmmu patch set.

Emilio, all of this is present on my tcg-next-for-4.0 branch.


r~


Richard Henderson (12):
  tcg: Add preferred_reg argument to tcg_reg_alloc
  tcg: Add preferred_reg argument to temp_load
  tcg: Add preferred_reg argument to temp_sync
  tcg: Add preferred_reg argument to tcg_reg_alloc_do_movi
  tcg: Add output_pref to TCGOp
  tcg: Improve register allocation for matching constraints
  tcg: Dump register preference info with liveness
  tcg: Reindent parts of liveness_pass_1
  tcg: Rename and adjust liveness_pass_1 helpers
  tcg: Split out more subroutines from liveness_pass_1
  tcg: Add TCG_OPF_BB_EXIT
  tcg: Record register preferences during liveness

 tcg/tcg-opc.h |   7 +-
 tcg/tcg.h     |  20 +-
 tcg/tcg.c     | 527 +++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 405 insertions(+), 149 deletions(-)

-- 
2.17.2

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 01/12] tcg: Add preferred_reg argument to tcg_reg_alloc
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 02/12] tcg: Add preferred_reg argument to temp_load Richard Henderson
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

This new argument will aid register allocation by indicating how
the temporary will be used in future.  If the preference cannot
be satisfied, fall back to the constraints of the current insn.

Short circuit the preference when it cannot be satisfied or if
it does not further constrain the operation.

With an eye toward optimizing function call sequences, special-case
a preferred_regs set that contains a single register.

For the moment, all users pass 0 for preference.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 103 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 81 insertions(+), 22 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8734389ba9..c596277fd0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1907,6 +1907,20 @@ static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = {
     [MO_ALIGN_64 >> MO_ASHIFT] = "al64+",
 };
 
+static inline bool tcg_regset_single(TCGRegSet d)
+{
+    return (d & (d - 1)) == 0;
+}
+
+static inline TCGReg tcg_regset_first(TCGRegSet d)
+{
+    if (TCG_TARGET_NB_REGS <= 32) {
+        return ctz32(d);
+    } else {
+        return ctz64(d);
+    }
+}
+
 void tcg_dump_ops(TCGContext *s)
 {
     char buf[128];
@@ -1922,6 +1936,7 @@ void tcg_dump_ops(TCGContext *s)
         def = &tcg_op_defs[c];
 
         if (c == INDEX_op_insn_start) {
+            nb_oargs = 0;
             col += qemu_log("\n ----");
 
             for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
@@ -2924,31 +2939,72 @@ static void tcg_reg_free(TCGContext *s, TCGReg reg, TCGRegSet allocated_regs)
     }
 }
 
-/* Allocate a register belonging to reg1 & ~reg2 */
-static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet desired_regs,
-                            TCGRegSet allocated_regs, bool rev)
+/**
+ * tcg_reg_alloc:
+ * @required_regs: Set of registers in which we must allocate.
+ * @allocated_regs: Set of registers which must be avoided.
+ * @preferred_regs: Set of registers we should prefer.
+ * @rev: True if we search the registers in "indirect" order.
+ *
+ * The allocated register must be in @required_regs & ~@allocated_regs,
+ * but if we can put it in @preferred_regs we may save a move later.
+ */
+static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet required_regs,
+                            TCGRegSet allocated_regs,
+                            TCGRegSet preferred_regs, bool rev)
 {
-    int i, n = ARRAY_SIZE(tcg_target_reg_alloc_order);
+    int i, j, f, n = ARRAY_SIZE(tcg_target_reg_alloc_order);
+    TCGRegSet reg_ct[2];
     const int *order;
-    TCGReg reg;
-    TCGRegSet reg_ct;
 
-    reg_ct = desired_regs & ~allocated_regs;
+    reg_ct[1] = required_regs & ~allocated_regs;
+    tcg_debug_assert(reg_ct[1] != 0);
+    reg_ct[0] = reg_ct[1] & preferred_regs;
+
+    /* Skip the preferred_regs option if it cannot be satisfied,
+       or if the preference made no difference.  */
+    f = reg_ct[0] == 0 || reg_ct[0] == reg_ct[1];
+
     order = rev ? indirect_reg_alloc_order : tcg_target_reg_alloc_order;
 
-    /* first try free registers */
-    for(i = 0; i < n; i++) {
-        reg = order[i];
-        if (tcg_regset_test_reg(reg_ct, reg) && s->reg_to_temp[reg] == NULL)
-            return reg;
+    /* Try free registers, preferences first.  */
+    for (j = f; j < 2; j++) {
+        TCGRegSet set = reg_ct[j];
+
+        if (tcg_regset_single(set)) {
+            /* One register in the set.  */
+            TCGReg reg = tcg_regset_first(set);
+            if (s->reg_to_temp[reg] == NULL) {
+                return reg;
+            }
+        } else {
+            for (i = 0; i < n; i++) {
+                TCGReg reg = order[i];
+                if (s->reg_to_temp[reg] == NULL &&
+                    tcg_regset_test_reg(set, reg)) {
+                    return reg;
+                }
+            }
+        }
     }
 
-    /* XXX: do better spill choice */
-    for(i = 0; i < n; i++) {
-        reg = order[i];
-        if (tcg_regset_test_reg(reg_ct, reg)) {
+    /* We must spill something.  */
+    for (j = f; j < 2; j++) {
+        TCGRegSet set = reg_ct[j];
+
+        if (tcg_regset_single(set)) {
+            /* One register in the set.  */
+            TCGReg reg = tcg_regset_first(set);
             tcg_reg_free(s, reg, allocated_regs);
             return reg;
+        } else {
+            for (i = 0; i < n; i++) {
+                TCGReg reg = order[i];
+                if (tcg_regset_test_reg(set, reg)) {
+                    tcg_reg_free(s, reg, allocated_regs);
+                    return reg;
+                }
+            }
         }
     }
 
@@ -2966,12 +3022,14 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
     case TEMP_VAL_REG:
         return;
     case TEMP_VAL_CONST:
-        reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+        reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
+                            0, ts->indirect_base);
         tcg_out_movi(s, ts->type, reg, ts->val);
         ts->mem_coherent = 0;
         break;
     case TEMP_VAL_MEM:
-        reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
+        reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
+                            0, ts->indirect_base);
         tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset);
         ts->mem_coherent = 1;
         break;
@@ -3131,7 +3189,8 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
                    input one. */
                 tcg_regset_set_reg(allocated_regs, ts->reg);
                 ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
-                                         allocated_regs, ots->indirect_base);
+                                         allocated_regs, 0,
+                                         ots->indirect_base);
             }
             tcg_out_mov(s, otype, ots->reg, ts->reg);
         }
@@ -3219,7 +3278,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             /* allocate a new register matching the constraint 
                and move the temporary register into it */
             reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
-                                ts->indirect_base);
+                                0, ts->indirect_base);
             tcg_out_mov(s, ts->type, reg, ts->reg);
         }
         new_args[i] = reg;
@@ -3264,7 +3323,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             } else if (arg_ct->ct & TCG_CT_NEWREG) {
                 reg = tcg_reg_alloc(s, arg_ct->u.regs,
                                     i_allocated_regs | o_allocated_regs,
-                                    ts->indirect_base);
+                                    0, ts->indirect_base);
             } else {
                 /* if fixed register, we try to use it */
                 reg = ts->reg;
@@ -3273,7 +3332,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                     goto oarg_end;
                 }
                 reg = tcg_reg_alloc(s, arg_ct->u.regs, o_allocated_regs,
-                                    ts->indirect_base);
+                                    0, ts->indirect_base);
             }
             tcg_regset_set_reg(o_allocated_regs, reg);
             /* if a fixed register is used, then a move will be done afterwards */
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 02/12] tcg: Add preferred_reg argument to temp_load
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 01/12] tcg: Add preferred_reg argument to tcg_reg_alloc Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 03/12] tcg: Add preferred_reg argument to temp_sync Richard Henderson
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Pass the new preferred_regs argument through to tcg_reg_alloc.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index c596277fd0..7f29a2045a 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2859,7 +2859,7 @@ static void temp_allocate_frame(TCGContext *s, TCGTemp *ts)
     s->current_frame_offset += sizeof(tcg_target_long);
 }
 
-static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet);
+static void temp_load(TCGContext *, TCGTemp *, TCGRegSet, TCGRegSet, TCGRegSet);
 
 /* Mark a temporary as free or dead.  If 'free_or_dead' is negative,
    mark it free; otherwise mark it dead.  */
@@ -2908,7 +2908,7 @@ static void temp_sync(TCGContext *s, TCGTemp *ts,
                 break;
             }
             temp_load(s, ts, tcg_target_available_regs[ts->type],
-                      allocated_regs);
+                      allocated_regs, 0);
             /* fallthrough */
 
         case TEMP_VAL_REG:
@@ -3014,7 +3014,7 @@ static TCGReg tcg_reg_alloc(TCGContext *s, TCGRegSet required_regs,
 /* Make sure the temporary is in a register.  If needed, allocate the register
    from DESIRED while avoiding ALLOCATED.  */
 static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
-                      TCGRegSet allocated_regs)
+                      TCGRegSet allocated_regs, TCGRegSet preferred_regs)
 {
     TCGReg reg;
 
@@ -3023,13 +3023,13 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
         return;
     case TEMP_VAL_CONST:
         reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
-                            0, ts->indirect_base);
+                            preferred_regs, ts->indirect_base);
         tcg_out_movi(s, ts->type, reg, ts->val);
         ts->mem_coherent = 0;
         break;
     case TEMP_VAL_MEM:
         reg = tcg_reg_alloc(s, desired_regs, allocated_regs,
-                            0, ts->indirect_base);
+                            preferred_regs, ts->indirect_base);
         tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset);
         ts->mem_coherent = 1;
         break;
@@ -3159,7 +3159,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
        the SOURCE value into its own register first, that way we
        don't have to reload SOURCE the next time it is used. */
     if (ts->val_type == TEMP_VAL_MEM) {
-        temp_load(s, ts, tcg_target_available_regs[itype], allocated_regs);
+        temp_load(s, ts, tcg_target_available_regs[itype], allocated_regs, 0);
     }
 
     tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
@@ -3243,7 +3243,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             goto iarg_end;
         }
 
-        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs);
+        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs, 0);
 
         if (arg_ct->ct & TCG_CT_IALIAS) {
             if (ts->fixed_reg) {
@@ -3424,7 +3424,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
         if (arg != TCG_CALL_DUMMY_ARG) {
             ts = arg_temp(arg);
             temp_load(s, ts, tcg_target_available_regs[ts->type],
-                      s->reserved_regs);
+                      s->reserved_regs, 0);
             tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset);
         }
 #ifndef TCG_TARGET_STACK_GROWSUP
@@ -3449,7 +3449,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
                 TCGRegSet arg_set = 0;
 
                 tcg_regset_set_reg(arg_set, reg);
-                temp_load(s, ts, arg_set, allocated_regs);
+                temp_load(s, ts, arg_set, allocated_regs, 0);
             }
 
             tcg_regset_set_reg(allocated_regs, reg);
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 03/12] tcg: Add preferred_reg argument to temp_sync
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 01/12] tcg: Add preferred_reg argument to tcg_reg_alloc Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 02/12] tcg: Add preferred_reg argument to temp_load Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 04/12] tcg: Add preferred_reg argument to tcg_reg_alloc_do_movi Richard Henderson
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Pass the new preferred_regs argument through temp_load to tcg_reg_alloc.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 7f29a2045a..509e5974bd 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2887,8 +2887,8 @@ static inline void temp_dead(TCGContext *s, TCGTemp *ts)
    registers needs to be allocated to store a constant.  If 'free_or_dead'
    is non-zero, subsequently release the temporary; if it is positive, the
    temp is dead; if it is negative, the temp is free.  */
-static void temp_sync(TCGContext *s, TCGTemp *ts,
-                      TCGRegSet allocated_regs, int free_or_dead)
+static void temp_sync(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs,
+                      TCGRegSet preferred_regs, int free_or_dead)
 {
     if (ts->fixed_reg) {
         return;
@@ -2908,7 +2908,7 @@ static void temp_sync(TCGContext *s, TCGTemp *ts,
                 break;
             }
             temp_load(s, ts, tcg_target_available_regs[ts->type],
-                      allocated_regs, 0);
+                      allocated_regs, preferred_regs);
             /* fallthrough */
 
         case TEMP_VAL_REG:
@@ -2935,7 +2935,7 @@ static void tcg_reg_free(TCGContext *s, TCGReg reg, TCGRegSet allocated_regs)
 {
     TCGTemp *ts = s->reg_to_temp[reg];
     if (ts != NULL) {
-        temp_sync(s, ts, allocated_regs, -1);
+        temp_sync(s, ts, allocated_regs, 0, -1);
     }
 }
 
@@ -3115,7 +3115,7 @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
     ots->val = val;
     ots->mem_coherent = 0;
     if (NEED_SYNC_ARG(0)) {
-        temp_sync(s, ots, s->reserved_regs, IS_DEAD_ARG(0));
+        temp_sync(s, ots, s->reserved_regs, 0, IS_DEAD_ARG(0));
     } else if (IS_DEAD_ARG(0)) {
         temp_dead(s, ots);
     }
@@ -3198,7 +3198,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
         ots->mem_coherent = 0;
         s->reg_to_temp[ots->reg] = ots;
         if (NEED_SYNC_ARG(0)) {
-            temp_sync(s, ots, allocated_regs, 0);
+            temp_sync(s, ots, allocated_regs, 0, 0);
         }
     }
 }
@@ -3368,7 +3368,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             tcg_out_mov(s, ts->type, ts->reg, reg);
         }
         if (NEED_SYNC_ARG(i)) {
-            temp_sync(s, ts, o_allocated_regs, IS_DEAD_ARG(i));
+            temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
         } else if (IS_DEAD_ARG(i)) {
             temp_dead(s, ts);
         }
@@ -3502,7 +3502,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
             ts->mem_coherent = 0;
             s->reg_to_temp[reg] = ts;
             if (NEED_SYNC_ARG(i)) {
-                temp_sync(s, ts, allocated_regs, IS_DEAD_ARG(i));
+                temp_sync(s, ts, allocated_regs, 0, IS_DEAD_ARG(i));
             } else if (IS_DEAD_ARG(i)) {
                 temp_dead(s, ts);
             }
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 04/12] tcg: Add preferred_reg argument to tcg_reg_alloc_do_movi
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (2 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 03/12] tcg: Add preferred_reg argument to temp_sync Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 05/12] tcg: Add output_pref to TCGOp Richard Henderson
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Pass the new preferred_regs argument through to temp_sync.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 509e5974bd..c83ca238aa 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3099,7 +3099,8 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 }
 
 static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
-                                  tcg_target_ulong val, TCGLifeData arg_life)
+                                  tcg_target_ulong val, TCGLifeData arg_life,
+                                  TCGRegSet preferred_regs)
 {
     if (ots->fixed_reg) {
         /* For fixed registers, we do not do any constant propagation.  */
@@ -3115,7 +3116,7 @@ static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
     ots->val = val;
     ots->mem_coherent = 0;
     if (NEED_SYNC_ARG(0)) {
-        temp_sync(s, ots, s->reserved_regs, 0, IS_DEAD_ARG(0));
+        temp_sync(s, ots, s->reserved_regs, preferred_regs, IS_DEAD_ARG(0));
     } else if (IS_DEAD_ARG(0)) {
         temp_dead(s, ots);
     }
@@ -3126,7 +3127,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
     TCGTemp *ots = arg_temp(op->args[0]);
     tcg_target_ulong val = op->args[1];
 
-    tcg_reg_alloc_do_movi(s, ots, val, op->life);
+    tcg_reg_alloc_do_movi(s, ots, val, op->life, 0);
 }
 
 static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
@@ -3150,7 +3151,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
         if (IS_DEAD_ARG(1)) {
             temp_dead(s, ts);
         }
-        tcg_reg_alloc_do_movi(s, ots, val, arg_life);
+        tcg_reg_alloc_do_movi(s, ots, val, arg_life, 0);
         return;
     }
 
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 05/12] tcg: Add output_pref to TCGOp
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (3 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 04/12] tcg: Add preferred_reg argument to tcg_reg_alloc_do_movi Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 06/12] tcg: Improve register allocation for matching constraints Richard Henderson
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Allocate storage for, but do not yet fill in, per-opcode
preferences for the output operands.  Pass it in to the
register allocation routines for output operands.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.h |  3 +++
 tcg/tcg.c | 18 +++++++++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index c6caeeb42b..b2e274b7af 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -621,6 +621,9 @@ typedef struct TCGOp {
 
     /* Arguments for the opcode.  */
     TCGArg args[MAX_OPC_PARAM];
+
+    /* Register preferences for the output(s).  */
+    TCGRegSet output_pref[2];
 } TCGOp;
 
 #define TCGOP_CALLI(X)    (X)->param1
diff --git a/tcg/tcg.c b/tcg/tcg.c
index c83ca238aa..f86415ce29 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2613,6 +2613,8 @@ static void liveness_pass_1(TCGContext *s)
             break;
         }
         op->life = arg_life;
+        op->output_pref[0] = 0;
+        op->output_pref[1] = 0;
     }
 }
 
@@ -3127,17 +3129,18 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
     TCGTemp *ots = arg_temp(op->args[0]);
     tcg_target_ulong val = op->args[1];
 
-    tcg_reg_alloc_do_movi(s, ots, val, op->life, 0);
+    tcg_reg_alloc_do_movi(s, ots, val, op->life, op->output_pref[0]);
 }
 
 static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
 {
     const TCGLifeData arg_life = op->life;
-    TCGRegSet allocated_regs;
+    TCGRegSet allocated_regs, preferred_regs;
     TCGTemp *ts, *ots;
     TCGType otype, itype;
 
     allocated_regs = s->reserved_regs;
+    preferred_regs = op->output_pref[0];
     ots = arg_temp(op->args[0]);
     ts = arg_temp(op->args[1]);
 
@@ -3151,7 +3154,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
         if (IS_DEAD_ARG(1)) {
             temp_dead(s, ts);
         }
-        tcg_reg_alloc_do_movi(s, ots, val, arg_life, 0);
+        tcg_reg_alloc_do_movi(s, ots, val, arg_life, preferred_regs);
         return;
     }
 
@@ -3160,7 +3163,8 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
        the SOURCE value into its own register first, that way we
        don't have to reload SOURCE the next time it is used. */
     if (ts->val_type == TEMP_VAL_MEM) {
-        temp_load(s, ts, tcg_target_available_regs[itype], allocated_regs, 0);
+        temp_load(s, ts, tcg_target_available_regs[itype],
+                  allocated_regs, preferred_regs);
     }
 
     tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
@@ -3190,7 +3194,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
                    input one. */
                 tcg_regset_set_reg(allocated_regs, ts->reg);
                 ots->reg = tcg_reg_alloc(s, tcg_target_available_regs[otype],
-                                         allocated_regs, 0,
+                                         allocated_regs, preferred_regs,
                                          ots->indirect_base);
             }
             tcg_out_mov(s, otype, ots->reg, ts->reg);
@@ -3324,7 +3328,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             } else if (arg_ct->ct & TCG_CT_NEWREG) {
                 reg = tcg_reg_alloc(s, arg_ct->u.regs,
                                     i_allocated_regs | o_allocated_regs,
-                                    0, ts->indirect_base);
+                                    op->output_pref[k], ts->indirect_base);
             } else {
                 /* if fixed register, we try to use it */
                 reg = ts->reg;
@@ -3333,7 +3337,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                     goto oarg_end;
                 }
                 reg = tcg_reg_alloc(s, arg_ct->u.regs, o_allocated_regs,
-                                    0, ts->indirect_base);
+                                    op->output_pref[k], ts->indirect_base);
             }
             tcg_regset_set_reg(o_allocated_regs, reg);
             /* if a fixed register is used, then a move will be done afterwards */
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 06/12] tcg: Improve register allocation for matching constraints
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (4 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 05/12] tcg: Add output_pref to TCGOp Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 07/12] tcg: Dump register preference info with liveness Richard Henderson
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Try harder to honor the output_pref.  When we're forced to allocate
a second register for the input, it does not need to use the input
constraint; that will be honored by the register we allocate for the
output and a move is already required.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index f86415ce29..adf6570c36 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3235,6 +3235,8 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
 
     /* satisfy input constraints */ 
     for (k = 0; k < nb_iargs; k++) {
+        TCGRegSet i_preferred_regs, o_preferred_regs;
+
         i = def->sorted_args[nb_oargs + k];
         arg = op->args[i];
         arg_ct = &def->args_ct[i];
@@ -3245,17 +3247,18 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
             /* constant is OK for instruction */
             const_args[i] = 1;
             new_args[i] = ts->val;
-            goto iarg_end;
+            continue;
         }
 
-        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs, 0);
-
+        i_preferred_regs = o_preferred_regs = 0;
         if (arg_ct->ct & TCG_CT_IALIAS) {
+            o_preferred_regs = op->output_pref[arg_ct->alias_index];
             if (ts->fixed_reg) {
                 /* if fixed register, we must allocate a new register
                    if the alias is not the same register */
-                if (arg != op->args[arg_ct->alias_index])
+                if (arg != op->args[arg_ct->alias_index]) {
                     goto allocate_in_reg;
+                }
             } else {
                 /* if the input is aliased to an output and if it is
                    not dead after the instruction, we must allocate
@@ -3263,33 +3266,42 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                 if (!IS_DEAD_ARG(i)) {
                     goto allocate_in_reg;
                 }
+
                 /* check if the current register has already been allocated
                    for another input aliased to an output */
-                int k2, i2;
-                for (k2 = 0 ; k2 < k ; k2++) {
-                    i2 = def->sorted_args[nb_oargs + k2];
-                    if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
-                        (new_args[i2] == ts->reg)) {
-                        goto allocate_in_reg;
+                if (ts->val_type == TEMP_VAL_REG) {
+                    int k2, i2;
+                    reg = ts->reg;
+                    for (k2 = 0 ; k2 < k ; k2++) {
+                        i2 = def->sorted_args[nb_oargs + k2];
+                        if ((def->args_ct[i2].ct & TCG_CT_IALIAS) &&
+                            reg == new_args[i2]) {
+                            goto allocate_in_reg;
+                        }
                     }
                 }
+                i_preferred_regs = o_preferred_regs;
             }
         }
+
+        temp_load(s, ts, arg_ct->u.regs, i_allocated_regs, i_preferred_regs);
         reg = ts->reg;
+
         if (tcg_regset_test_reg(arg_ct->u.regs, reg)) {
             /* nothing to do : the constraint is satisfied */
         } else {
         allocate_in_reg:
             /* allocate a new register matching the constraint 
                and move the temporary register into it */
+            temp_load(s, ts, tcg_target_available_regs[ts->type],
+                      i_allocated_regs, 0);
             reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
-                                0, ts->indirect_base);
+                                o_preferred_regs, ts->indirect_base);
             tcg_out_mov(s, ts->type, reg, ts->reg);
         }
         new_args[i] = reg;
         const_args[i] = 0;
         tcg_regset_set_reg(i_allocated_regs, reg);
-    iarg_end: ;
     }
     
     /* mark dead temporaries and free the associated registers */
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 07/12] tcg: Dump register preference info with liveness
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (5 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 06/12] tcg: Improve register allocation for matching constraints Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 08/12] tcg: Reindent parts of liveness_pass_1 Richard Henderson
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.h |  3 ---
 tcg/tcg.c | 44 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index b2e274b7af..ac5d01c223 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -1089,9 +1089,6 @@ TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
 
 void tcg_optimize(TCGContext *s);
 
-/* only used for debugging purposes */
-void tcg_dump_ops(TCGContext *s);
-
 TCGv_i32 tcg_const_i32(int32_t val);
 TCGv_i64 tcg_const_i64(int64_t val);
 TCGv_i32 tcg_const_local_i32(int32_t val);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index adf6570c36..27d081e11a 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1921,7 +1921,7 @@ static inline TCGReg tcg_regset_first(TCGRegSet d)
     }
 }
 
-void tcg_dump_ops(TCGContext *s)
+static void tcg_dump_ops(TCGContext *s, bool have_prefs)
 {
     char buf[128];
     TCGOp *op;
@@ -2056,12 +2056,15 @@ void tcg_dump_ops(TCGContext *s)
                 col += qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", op->args[k]);
             }
         }
-        if (op->life) {
-            unsigned life = op->life;
 
-            for (; col < 48; ++col) {
+        if (have_prefs || op->life) {
+            for (; col < 40; ++col) {
                 putc(' ', qemu_logfile);
             }
+        }
+
+        if (op->life) {
+            unsigned life = op->life;
 
             if (life & (SYNC_ARG * 3)) {
                 qemu_log("  sync:");
@@ -2081,6 +2084,33 @@ void tcg_dump_ops(TCGContext *s)
                 }
             }
         }
+
+        if (have_prefs) {
+            for (i = 0; i < nb_oargs; ++i) {
+                TCGRegSet set = op->output_pref[i];
+
+                if (i == 0) {
+                    qemu_log("  pref=");
+                } else {
+                    qemu_log(",");
+                }
+                if (set == 0) {
+                    qemu_log("none");
+                } else if (set == MAKE_64BIT_MASK(0, TCG_TARGET_NB_REGS)) {
+                    qemu_log("all");
+#ifdef CONFIG_DEBUG_TCG
+                } else if (tcg_regset_single(set)) {
+                    TCGReg reg = tcg_regset_first(set);
+                    qemu_log("%s", tcg_target_reg_names[reg]);
+#endif
+                } else if (TCG_TARGET_NB_REGS <= 32) {
+                    qemu_log("%#x", (uint32_t)set);
+                } else {
+                    qemu_log("%#" PRIx64, (uint64_t)set);
+                }
+            }
+        }
+
         qemu_log("\n");
     }
 }
@@ -3669,7 +3699,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
                  && qemu_log_in_addr_range(tb->pc))) {
         qemu_log_lock();
         qemu_log("OP:\n");
-        tcg_dump_ops(s);
+        tcg_dump_ops(s, false);
         qemu_log("\n");
         qemu_log_unlock();
     }
@@ -3697,7 +3727,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
                      && qemu_log_in_addr_range(tb->pc))) {
             qemu_log_lock();
             qemu_log("OP before indirect lowering:\n");
-            tcg_dump_ops(s);
+            tcg_dump_ops(s, false);
             qemu_log("\n");
             qemu_log_unlock();
         }
@@ -3718,7 +3748,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
                  && qemu_log_in_addr_range(tb->pc))) {
         qemu_log_lock();
         qemu_log("OP after optimization and liveness analysis:\n");
-        tcg_dump_ops(s);
+        tcg_dump_ops(s, true);
         qemu_log("\n");
         qemu_log_unlock();
     }
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 08/12] tcg: Reindent parts of liveness_pass_1
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (6 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 07/12] tcg: Dump register preference info with liveness Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 09/12] tcg: Rename and adjust liveness_pass_1 helpers Richard Henderson
                   ` (5 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

There are two blocks of the form

    if (foo) {
        stuff1;
        goto bar;
    } else {
    baz:
        stuff2;
    }

which have unnecessary and confusing indentation.
Remove the else and unindent stuff2.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 139 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 71 insertions(+), 68 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 27d081e11a..82f9a66d31 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2458,47 +2458,46 @@ static void liveness_pass_1(TCGContext *s)
                         }
                     }
                     goto do_remove;
-                } else {
-                do_not_remove_call:
+                }
+            do_not_remove_call:
 
-                    /* output args are dead */
-                    for (i = 0; i < nb_oargs; i++) {
-                        arg_ts = arg_temp(op->args[i]);
-                        if (arg_ts->state & TS_DEAD) {
-                            arg_life |= DEAD_ARG << i;
-                        }
-                        if (arg_ts->state & TS_MEM) {
-                            arg_life |= SYNC_ARG << i;
-                        }
-                        arg_ts->state = TS_DEAD;
+                /* output args are dead */
+                for (i = 0; i < nb_oargs; i++) {
+                    arg_ts = arg_temp(op->args[i]);
+                    if (arg_ts->state & TS_DEAD) {
+                        arg_life |= DEAD_ARG << i;
                     }
+                    if (arg_ts->state & TS_MEM) {
+                        arg_life |= SYNC_ARG << i;
+                    }
+                    arg_ts->state = TS_DEAD;
+                }
 
-                    if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
-                                        TCG_CALL_NO_READ_GLOBALS))) {
-                        /* globals should go back to memory */
-                        for (i = 0; i < nb_globals; i++) {
-                            s->temps[i].state = TS_DEAD | TS_MEM;
-                        }
-                    } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
-                        /* globals should be synced to memory */
-                        for (i = 0; i < nb_globals; i++) {
-                            s->temps[i].state |= TS_MEM;
-                        }
+                if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
+                                    TCG_CALL_NO_READ_GLOBALS))) {
+                    /* globals should go back to memory */
+                    for (i = 0; i < nb_globals; i++) {
+                        s->temps[i].state = TS_DEAD | TS_MEM;
                     }
+                } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
+                    /* globals should be synced to memory */
+                    for (i = 0; i < nb_globals; i++) {
+                        s->temps[i].state |= TS_MEM;
+                    }
+                }
 
-                    /* record arguments that die in this helper */
-                    for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
-                        arg_ts = arg_temp(op->args[i]);
-                        if (arg_ts && arg_ts->state & TS_DEAD) {
-                            arg_life |= DEAD_ARG << i;
-                        }
+                /* record arguments that die in this helper */
+                for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
+                    arg_ts = arg_temp(op->args[i]);
+                    if (arg_ts && arg_ts->state & TS_DEAD) {
+                        arg_life |= DEAD_ARG << i;
                     }
-                    /* input arguments are live for preceding opcodes */
-                    for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
-                        arg_ts = arg_temp(op->args[i]);
-                        if (arg_ts) {
-                            arg_ts->state &= ~TS_DEAD;
-                        }
+                }
+                /* input arguments are live for preceding opcodes */
+                for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
+                    arg_ts = arg_temp(op->args[i]);
+                    if (arg_ts) {
+                        arg_ts->state &= ~TS_DEAD;
                     }
                 }
             }
@@ -2602,43 +2601,47 @@ static void liveness_pass_1(TCGContext *s)
                         goto do_not_remove;
                     }
                 }
-            do_remove:
-                tcg_op_remove(s, op);
-            } else {
-            do_not_remove:
-                /* output args are dead */
-                for (i = 0; i < nb_oargs; i++) {
-                    arg_ts = arg_temp(op->args[i]);
-                    if (arg_ts->state & TS_DEAD) {
-                        arg_life |= DEAD_ARG << i;
-                    }
-                    if (arg_ts->state & TS_MEM) {
-                        arg_life |= SYNC_ARG << i;
-                    }
-                    arg_ts->state = TS_DEAD;
-                }
+                goto do_remove;
+            }
+            goto do_not_remove;
 
-                /* if end of basic block, update */
-                if (def->flags & TCG_OPF_BB_END) {
-                    tcg_la_bb_end(s);
-                } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
-                    /* globals should be synced to memory */
-                    for (i = 0; i < nb_globals; i++) {
-                        s->temps[i].state |= TS_MEM;
-                    }
-                }
+        do_remove:
+            tcg_op_remove(s, op);
+            break;
 
-                /* record arguments that die in this opcode */
-                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-                    arg_ts = arg_temp(op->args[i]);
-                    if (arg_ts->state & TS_DEAD) {
-                        arg_life |= DEAD_ARG << i;
-                    }
+        do_not_remove:
+            /* output args are dead */
+            for (i = 0; i < nb_oargs; i++) {
+                arg_ts = arg_temp(op->args[i]);
+                if (arg_ts->state & TS_DEAD) {
+                    arg_life |= DEAD_ARG << i;
                 }
-                /* input arguments are live for preceding opcodes */
-                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-                    arg_temp(op->args[i])->state &= ~TS_DEAD;
+                if (arg_ts->state & TS_MEM) {
+                    arg_life |= SYNC_ARG << i;
                 }
+                arg_ts->state = TS_DEAD;
+            }
+
+            /* if end of basic block, update */
+            if (def->flags & TCG_OPF_BB_END) {
+                tcg_la_bb_end(s);
+            } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
+                /* globals should be synced to memory */
+                for (i = 0; i < nb_globals; i++) {
+                    s->temps[i].state |= TS_MEM;
+                }
+            }
+
+            /* record arguments that die in this opcode */
+            for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                arg_ts = arg_temp(op->args[i]);
+                if (arg_ts->state & TS_DEAD) {
+                    arg_life |= DEAD_ARG << i;
+                }
+            }
+            /* input arguments are live for preceding opcodes */
+            for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                arg_temp(op->args[i])->state &= ~TS_DEAD;
             }
             break;
         }
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 09/12] tcg: Rename and adjust liveness_pass_1 helpers
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (7 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 08/12] tcg: Reindent parts of liveness_pass_1 Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 10/12] tcg: Split out more subroutines from liveness_pass_1 Richard Henderson
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

No need for a "tcg_" prefix for a static function; we already
have another "la_" prefix for indicating liveness analysis.
Pass in nb_globals and nb_temps, as we will already have them
in registers for other loops within the parent function.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 82f9a66d31..27814df882 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2389,10 +2389,8 @@ static void reachable_code_pass(TCGContext *s)
 
 /* liveness analysis: end of function: all temps are dead, and globals
    should be in memory. */
-static void tcg_la_func_end(TCGContext *s)
+static void la_func_end(TCGContext *s, int ng, int nt)
 {
-    int ng = s->nb_globals;
-    int nt = s->nb_temps;
     int i;
 
     for (i = 0; i < ng; ++i) {
@@ -2405,10 +2403,8 @@ static void tcg_la_func_end(TCGContext *s)
 
 /* liveness analysis: end of basic block: all temps are dead, globals
    and local temps should be in memory. */
-static void tcg_la_bb_end(TCGContext *s)
+static void la_bb_end(TCGContext *s, int ng, int nt)
 {
-    int ng = s->nb_globals;
-    int nt = s->nb_temps;
     int i;
 
     for (i = 0; i < ng; ++i) {
@@ -2427,9 +2423,10 @@ static void tcg_la_bb_end(TCGContext *s)
 static void liveness_pass_1(TCGContext *s)
 {
     int nb_globals = s->nb_globals;
+    int nb_temps = s->nb_temps;
     TCGOp *op, *op_prev;
 
-    tcg_la_func_end(s);
+    la_func_end(s, nb_globals, nb_temps);
 
     QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, TCGOpHead, link, op_prev) {
         int i, nb_iargs, nb_oargs;
@@ -2624,7 +2621,7 @@ static void liveness_pass_1(TCGContext *s)
 
             /* if end of basic block, update */
             if (def->flags & TCG_OPF_BB_END) {
-                tcg_la_bb_end(s);
+                la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
                 /* globals should be synced to memory */
                 for (i = 0; i < nb_globals; i++) {
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 10/12] tcg: Split out more subroutines from liveness_pass_1
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (8 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 09/12] tcg: Rename and adjust liveness_pass_1 helpers Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 11/12] tcg: Add TCG_OPF_BB_EXIT Richard Henderson
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 27814df882..21668831a1 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2417,6 +2417,26 @@ static void la_bb_end(TCGContext *s, int ng, int nt)
     }
 }
 
+/* liveness analysis: sync globals back to memory.  */
+static void la_global_sync(TCGContext *s, int ng)
+{
+    int i;
+
+    for (i = 0; i < ng; ++i) {
+        s->temps[i].state |= TS_MEM;
+    }
+}
+
+/* liveness analysis: sync globals back to memory and kill.  */
+static void la_global_kill(TCGContext *s, int ng)
+{
+    int i;
+
+    for (i = 0; i < ng; i++) {
+        s->temps[i].state = TS_DEAD | TS_MEM;
+    }
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
    given input arguments is dead. Instructions updating dead
    temporaries are removed. */
@@ -2472,15 +2492,9 @@ static void liveness_pass_1(TCGContext *s)
 
                 if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
                                     TCG_CALL_NO_READ_GLOBALS))) {
-                    /* globals should go back to memory */
-                    for (i = 0; i < nb_globals; i++) {
-                        s->temps[i].state = TS_DEAD | TS_MEM;
-                    }
+                    la_global_kill(s, nb_globals);
                 } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
-                    /* globals should be synced to memory */
-                    for (i = 0; i < nb_globals; i++) {
-                        s->temps[i].state |= TS_MEM;
-                    }
+                    la_global_sync(s, nb_globals);
                 }
 
                 /* record arguments that die in this helper */
@@ -2623,10 +2637,7 @@ static void liveness_pass_1(TCGContext *s)
             if (def->flags & TCG_OPF_BB_END) {
                 la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
-                /* globals should be synced to memory */
-                for (i = 0; i < nb_globals; i++) {
-                    s->temps[i].state |= TS_MEM;
-                }
+                la_global_sync(s, nb_globals);
             }
 
             /* record arguments that die in this opcode */
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 11/12] tcg: Add TCG_OPF_BB_EXIT
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (9 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 10/12] tcg: Split out more subroutines from liveness_pass_1 Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 12/12] tcg: Record register preferences during liveness Richard Henderson
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

Use this to notice the opcodes that exit the TB, which implies
that local temps are really dead and need not be synced.

Previously we marked only the true end of the TB this way (via
la_func_end), but that state was immediately overwritten by the
la_bb_end invoked for any TCG_OPF_BB_END opcode, such as exit_tb.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-opc.h |  7 ++++---
 tcg/tcg.h     | 14 ++++++++------
 tcg/tcg.c     |  5 ++++-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index e3a43aabb6..7a8a3edb5b 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -191,9 +191,10 @@ DEF(mulsh_i64, 1, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulsh_i64))
 /* QEMU specific */
 DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
     TCG_OPF_NOT_PRESENT)
-DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_END)
-DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_END)
-DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_END | IMPL(TCG_TARGET_HAS_goto_ptr))
+DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
+DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
+DEF(goto_ptr, 0, 1, 0,
+    TCG_OPF_BB_EXIT | TCG_OPF_BB_END | IMPL(TCG_TARGET_HAS_goto_ptr))
 
 DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index ac5d01c223..abbf9c836a 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -1037,20 +1037,22 @@ typedef struct TCGArgConstraint {
 
 /* Bits for TCGOpDef->flags, 8 bits available.  */
 enum {
+    /* Instruction exits the translation block.  */
+    TCG_OPF_BB_EXIT      = 0x01,
     /* Instruction defines the end of a basic block.  */
-    TCG_OPF_BB_END       = 0x01,
+    TCG_OPF_BB_END       = 0x02,
     /* Instruction clobbers call registers and potentially update globals.  */
-    TCG_OPF_CALL_CLOBBER = 0x02,
+    TCG_OPF_CALL_CLOBBER = 0x04,
     /* Instruction has side effects: it cannot be removed if its outputs
        are not used, and might trigger exceptions.  */
-    TCG_OPF_SIDE_EFFECTS = 0x04,
+    TCG_OPF_SIDE_EFFECTS = 0x08,
     /* Instruction operands are 64-bits (otherwise 32-bits).  */
-    TCG_OPF_64BIT        = 0x08,
+    TCG_OPF_64BIT        = 0x10,
     /* Instruction is optional and not implemented by the host, or insn
        is generic and should not be implemened by the host.  */
-    TCG_OPF_NOT_PRESENT  = 0x10,
+    TCG_OPF_NOT_PRESENT  = 0x20,
     /* Instruction operands are vectors.  */
-    TCG_OPF_VECTOR       = 0x20,
+    TCG_OPF_VECTOR       = 0x40,
 };
 
 typedef struct TCGOpDef {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 21668831a1..673aaf52a1 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2446,6 +2446,7 @@ static void liveness_pass_1(TCGContext *s)
     int nb_temps = s->nb_temps;
     TCGOp *op, *op_prev;
 
+    /* ??? Should be redundant with the exit_tb that ends the TB.  */
     la_func_end(s, nb_globals, nb_temps);
 
     QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, TCGOpHead, link, op_prev) {
@@ -2634,7 +2635,9 @@ static void liveness_pass_1(TCGContext *s)
             }
 
             /* if end of basic block, update */
-            if (def->flags & TCG_OPF_BB_END) {
+            if (def->flags & TCG_OPF_BB_EXIT) {
+                la_func_end(s, nb_globals, nb_temps);
+            } else if (def->flags & TCG_OPF_BB_END) {
                 la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
                 la_global_sync(s, nb_globals);
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [Qemu-devel] [PATCH 12/12] tcg: Record register preferences during liveness
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (10 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 11/12] tcg: Add TCG_OPF_BB_EXIT Richard Henderson
@ 2018-11-28  5:38 ` Richard Henderson
  2018-11-28 22:15 ` [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Emilio G. Cota
  2018-12-24 21:53 ` Emilio G. Cota
  13 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2018-11-28  5:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: cota

With these preferences, we can arrange for function call arguments to
be computed into the proper registers instead of requiring extra moves.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 197 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 165 insertions(+), 32 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 673aaf52a1..734a453fc8 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2387,6 +2387,21 @@ static void reachable_code_pass(TCGContext *s)
 #define IS_DEAD_ARG(n)   (arg_life & (DEAD_ARG << (n)))
 #define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n)))
 
+/* For liveness_pass_1, the register preferences for a given temp.  */
+static inline TCGRegSet *la_temp_pref(TCGTemp *ts)
+{
+    return ts->state_ptr;
+}
+
+/* For liveness_pass_1, reset the preferences for a given temp: empty if
+ * its state is exactly TS_DEAD, else the maximal regset for its type.
+ */
+static inline void la_reset_pref(TCGTemp *ts)
+{
+    *la_temp_pref(ts)
+        = (ts->state == TS_DEAD ? 0 : tcg_target_available_regs[ts->type]);
+}
+
 /* liveness analysis: end of function: all temps are dead, and globals
    should be in memory. */
 static void la_func_end(TCGContext *s, int ng, int nt)
@@ -2395,9 +2410,11 @@ static void la_func_end(TCGContext *s, int ng, int nt)
 
     for (i = 0; i < ng; ++i) {
         s->temps[i].state = TS_DEAD | TS_MEM;
+        la_reset_pref(&s->temps[i]);
     }
     for (i = ng; i < nt; ++i) {
         s->temps[i].state = TS_DEAD;
+        la_reset_pref(&s->temps[i]);
     }
 }
 
@@ -2409,11 +2426,13 @@ static void la_bb_end(TCGContext *s, int ng, int nt)
 
     for (i = 0; i < ng; ++i) {
         s->temps[i].state = TS_DEAD | TS_MEM;
+        la_reset_pref(&s->temps[i]);
     }
     for (i = ng; i < nt; ++i) {
         s->temps[i].state = (s->temps[i].temp_local
                              ? TS_DEAD | TS_MEM
                              : TS_DEAD);
+        la_reset_pref(&s->temps[i]);
     }
 }
 
@@ -2423,7 +2442,12 @@ static void la_global_sync(TCGContext *s, int ng)
     int i;
 
     for (i = 0; i < ng; ++i) {
-        s->temps[i].state |= TS_MEM;
+        int state = s->temps[i].state;
+        s->temps[i].state = state | TS_MEM;
+        if (state == TS_DEAD) {
+            /* If the global was previously dead, reset prefs.  */
+            la_reset_pref(&s->temps[i]);
+        }
     }
 }
 
@@ -2434,6 +2458,29 @@ static void la_global_kill(TCGContext *s, int ng)
 
     for (i = 0; i < ng; i++) {
         s->temps[i].state = TS_DEAD | TS_MEM;
+        la_reset_pref(&s->temps[i]);
+    }
+}
+
+/* liveness analysis: note live temps crossing calls.  */
+static void la_cross_call(TCGContext *s, int nt)
+{
+    TCGRegSet mask = ~tcg_target_call_clobber_regs;
+    int i;
+
+    for (i = 0; i < nt; i++) {
+        TCGTemp *ts = &s->temps[i];
+        if (!(ts->state & TS_DEAD)) {
+            TCGRegSet *pset = la_temp_pref(ts);
+            TCGRegSet set = *pset;
+
+            set &= mask;
+            /* If the combination is not possible, restart.  */
+            if (set == 0) {
+                set = tcg_target_available_regs[ts->type] & mask;
+            }
+            *pset = set;
+        }
     }
 }
 
@@ -2445,16 +2492,23 @@ static void liveness_pass_1(TCGContext *s)
     int nb_globals = s->nb_globals;
     int nb_temps = s->nb_temps;
     TCGOp *op, *op_prev;
+    TCGRegSet *prefs;
+    int i;
+
+    prefs = tcg_malloc(sizeof(TCGRegSet) * nb_temps);
+    for (i = 0; i < nb_temps; ++i) {
+        s->temps[i].state_ptr = prefs + i;
+    }
 
     /* ??? Should be redundant with the exit_tb that ends the TB.  */
     la_func_end(s, nb_globals, nb_temps);
 
     QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, TCGOpHead, link, op_prev) {
-        int i, nb_iargs, nb_oargs;
+        int nb_iargs, nb_oargs;
         TCGOpcode opc_new, opc_new2;
         bool have_opc_new2;
         TCGLifeData arg_life = 0;
-        TCGTemp *arg_ts;
+        TCGTemp *ts;
         TCGOpcode opc = op->opc;
         const TCGOpDef *def = &tcg_op_defs[opc];
 
@@ -2462,6 +2516,7 @@ static void liveness_pass_1(TCGContext *s)
         case INDEX_op_call:
             {
                 int call_flags;
+                int nb_call_regs;
 
                 nb_oargs = TCGOP_CALLO(op);
                 nb_iargs = TCGOP_CALLI(op);
@@ -2470,8 +2525,8 @@ static void liveness_pass_1(TCGContext *s)
                 /* pure functions can be removed if their result is unused */
                 if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
                     for (i = 0; i < nb_oargs; i++) {
-                        arg_ts = arg_temp(op->args[i]);
-                        if (arg_ts->state != TS_DEAD) {
+                        ts = arg_temp(op->args[i]);
+                        if (ts->state != TS_DEAD) {
                             goto do_not_remove_call;
                         }
                     }
@@ -2479,16 +2534,20 @@ static void liveness_pass_1(TCGContext *s)
                 }
             do_not_remove_call:
 
-                /* output args are dead */
+                /* Output args are dead.  */
                 for (i = 0; i < nb_oargs; i++) {
-                    arg_ts = arg_temp(op->args[i]);
-                    if (arg_ts->state & TS_DEAD) {
+                    ts = arg_temp(op->args[i]);
+                    if (ts->state & TS_DEAD) {
                         arg_life |= DEAD_ARG << i;
                     }
-                    if (arg_ts->state & TS_MEM) {
+                    if (ts->state & TS_MEM) {
                         arg_life |= SYNC_ARG << i;
                     }
-                    arg_ts->state = TS_DEAD;
+                    ts->state = TS_DEAD;
+                    la_reset_pref(ts);
+
+                    /* Not used -- it will be tcg_target_call_oarg_regs[i].  */
+                    op->output_pref[i] = 0;
                 }
 
                 if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
@@ -2498,18 +2557,42 @@ static void liveness_pass_1(TCGContext *s)
                     la_global_sync(s, nb_globals);
                 }
 
-                /* record arguments that die in this helper */
+                /* Record arguments that die in this helper.  */
                 for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
-                    arg_ts = arg_temp(op->args[i]);
-                    if (arg_ts && arg_ts->state & TS_DEAD) {
+                    ts = arg_temp(op->args[i]);
+                    if (ts && ts->state & TS_DEAD) {
                         arg_life |= DEAD_ARG << i;
                     }
                 }
-                /* input arguments are live for preceding opcodes */
-                for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
-                    arg_ts = arg_temp(op->args[i]);
-                    if (arg_ts) {
-                        arg_ts->state &= ~TS_DEAD;
+
+                /* For all live registers, remove call-clobbered prefs.  */
+                la_cross_call(s, nb_temps);
+
+                nb_call_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
+
+                /* Input arguments are live for preceding opcodes.  */
+                for (i = 0; i < nb_iargs; i++) {
+                    ts = arg_temp(op->args[i + nb_oargs]);
+                    if (ts && ts->state & TS_DEAD) {
+                        /* For those arguments that die, and will be allocated
+                         * in registers, clear the register set for that arg,
+                         * to be filled in below.  For args that will be on
+                         * the stack, reset to any available reg.
+                         */
+                        *la_temp_pref(ts)
+                            = (i < nb_call_regs ? 0 :
+                               tcg_target_available_regs[ts->type]);
+                        ts->state &= ~TS_DEAD;
+                    }
+                }
+
+                /* For each input argument, add its input register to prefs.
+                   If a temp is used once, this produces a single set bit.  */
+                for (i = 0; i < MIN(nb_call_regs, nb_iargs); i++) {
+                    ts = arg_temp(op->args[i + nb_oargs]);
+                    if (ts) {
+                        tcg_regset_set_reg(*la_temp_pref(ts),
+                                           tcg_target_call_iarg_regs[i]);
                     }
                 }
             }
@@ -2518,7 +2601,9 @@ static void liveness_pass_1(TCGContext *s)
             break;
         case INDEX_op_discard:
             /* mark the temporary as dead */
-            arg_temp(op->args[0])->state = TS_DEAD;
+            ts = arg_temp(op->args[0]);
+            ts->state = TS_DEAD;
+            la_reset_pref(ts);
             break;
 
         case INDEX_op_add2_i32:
@@ -2622,43 +2707,91 @@ static void liveness_pass_1(TCGContext *s)
             break;
 
         do_not_remove:
-            /* output args are dead */
             for (i = 0; i < nb_oargs; i++) {
-                arg_ts = arg_temp(op->args[i]);
-                if (arg_ts->state & TS_DEAD) {
+                ts = arg_temp(op->args[i]);
+
+                /* Remember the preference of the uses that followed.  */
+                op->output_pref[i] = *la_temp_pref(ts);
+
+                /* Output args are dead.  */
+                if (ts->state & TS_DEAD) {
                     arg_life |= DEAD_ARG << i;
                 }
-                if (arg_ts->state & TS_MEM) {
+                if (ts->state & TS_MEM) {
                     arg_life |= SYNC_ARG << i;
                 }
-                arg_ts->state = TS_DEAD;
+                ts->state = TS_DEAD;
+                la_reset_pref(ts);
             }
 
-            /* if end of basic block, update */
+            /* If end of basic block, update.  */
             if (def->flags & TCG_OPF_BB_EXIT) {
                 la_func_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_BB_END) {
                 la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
                 la_global_sync(s, nb_globals);
+                if (def->flags & TCG_OPF_CALL_CLOBBER) {
+                    la_cross_call(s, nb_temps);
+                }
             }
 
-            /* record arguments that die in this opcode */
+            /* Record arguments that die in this opcode.  */
             for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-                arg_ts = arg_temp(op->args[i]);
-                if (arg_ts->state & TS_DEAD) {
+                ts = arg_temp(op->args[i]);
+                if (ts->state & TS_DEAD) {
                     arg_life |= DEAD_ARG << i;
                 }
             }
-            /* input arguments are live for preceding opcodes */
+
+            /* Input arguments are live for preceding opcodes.  */
             for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
-                arg_temp(op->args[i])->state &= ~TS_DEAD;
+                ts = arg_temp(op->args[i]);
+                if (ts->state & TS_DEAD) {
+                    /* For operands that were dead, initially allow
+                       all regs for the type.  */
+                    *la_temp_pref(ts) = tcg_target_available_regs[ts->type];
+                    ts->state &= ~TS_DEAD;
+                }
+            }
+
+            /* Incorporate constraints for this operand.  */
+            switch (opc) {
+            case INDEX_op_mov_i32:
+            case INDEX_op_mov_i64:
+                /* Note that these are TCG_OPF_NOT_PRESENT and do not
+                   have proper constraints.  That said, special case
+                   moves to propagate preferences backward.  */
+                if (IS_DEAD_ARG(1)) {
+                    *la_temp_pref(arg_temp(op->args[0]))
+                        = *la_temp_pref(arg_temp(op->args[1]));
+                }
+                break;
+
+            default:
+                for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
+                    const TCGArgConstraint *ct = &def->args_ct[i];
+                    TCGRegSet set, *pset;
+
+                    ts = arg_temp(op->args[i]);
+                    pset = la_temp_pref(ts);
+                    set = *pset;
+
+                    set &= ct->u.regs;
+                    if (ct->ct & TCG_CT_IALIAS) {
+                        set &= op->output_pref[ct->alias_index];
+                    }
+                    /* If the combination is not possible, restart.  */
+                    if (set == 0) {
+                        set = ct->u.regs;
+                    }
+                    *pset = set;
+                }
+                break;
             }
             break;
         }
         op->life = arg_life;
-        op->output_pref[0] = 0;
-        op->output_pref[1] = 0;
     }
 }
 
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (11 preceding siblings ...)
  2018-11-28  5:38 ` [Qemu-devel] [PATCH 12/12] tcg: Record register preferences during liveness Richard Henderson
@ 2018-11-28 22:15 ` Emilio G. Cota
  2018-11-29 19:23   ` Richard Henderson
  2018-12-24 21:53 ` Emilio G. Cota
  13 siblings, 1 reply; 20+ messages in thread
From: Emilio G. Cota @ 2018-11-28 22:15 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Tue, Nov 27, 2018 at 21:38:22 -0800, Richard Henderson wrote:
> The intent here is to remove several move insns putting the
> function arguments into the proper place.  I'm hoping that
> this will solve the skylake regression with spec2006, as
> seen with the ool softmmu patch set.
> 
> Emilio, all of this is present on my tcg-next-for-4.0 branch.

Thanks for this.

Unfortunately, it doesn't seem to help, performance-wise.

I've benchmarked this on three different machines: Sandy
Bridge, Haswell and Skylake. The average slowdown vs.
the baseline is ~0%, ~5%, and ~10%, respectively.

So it seems the more modern the microarchitecture, the more
severe the slowdown (this is consistent with the assumption
that processors are getting better at caching over time).

Here are all the bar charts:

  https://imgur.com/a/k7vmjVd

- baseline: tcg-next-for-4.0's parent from master, i.e.
  4822f1e ("Merge remote-tracking branch
  'remotes/kraxel/tags/fixes-31-20181127-pull-request'
  into staging", 2018-11-27)

- ool: dc93c4a ("tcg/ppc: Use TCG_TARGET_NEED_LDST_OOL_LABELS",
  2018-11-27)

- ool-regs: a9bac58 ("tcg: Record register preferences during
  liveness", 2018-11-27)

I've also looked at hardware event counts on Skylake for
the above three commits. It seems that the indirection of
the (very) frequent ool calls/rets is what causes the large
reduction in IPC (results for bootup + hmmer):

- baseline:
   291,451,142,426      instructions              #    2.94  insn per cycle           (71.45%)
    99,050,829,190      cycles                                                        (71.49%)
     2,678,751,743      br_inst_retired.near_call                                     (71.43%)
     2,674,367,278      br_inst_retired.near_return                                   (71.42%)
    34,065,079,963      branches                                                      (57.09%)
       161,441,496      branch-misses             #    0.47% of all branches          (57.17%)
      29.916874137 seconds time elapsed

- ool:
   312,368,465,806      instructions              #    2.79  insn per cycle           (71.45%)
   111,863,014,212      cycles                                                        (71.31%)
    11,751,151,140      br_inst_retired.near_call                                     (71.30%)
    11,736,770,191      br_inst_retired.near_return                                   (71.41%)
        24,660,597      br_misp_retired.near_call                                     (71.49%)
    52,096,512,558      branches                                                      (57.28%)
       176,951,727      branch-misses             #    0.34% of all branches          (57.20%)
      33.285149773 seconds time elapsed

- ool-regs:
   309,253,149,588      instructions              #    2.71  insn per cycle           (71.47%)
   113,938,069,597      cycles                                                        (71.50%)
    11,735,199,530      br_inst_retired.near_call                                     (71.51%)
    11,725,686,909      br_inst_retired.near_return                                   (71.54%)
        24,885,204      br_misp_retired.near_call                                     (71.46%)
    52,768,150,694      branches                                                      (56.97%)
       184,421,824      branch-misses             #    0.35% of all branches          (57.03%)
      33.867122498 seconds time elapsed 

The additional branches are all from call/ret. I double-checked the generated
code and these are all well-matched (no jmp's instead of ret's), so
I don't think we can optimize anything there; it seems to me that this
is just a code size vs. speed trade-off.

ool-regs has even lower IPC, but it also uses fewer instructions, which
mitigates the slowdown due to lower IPC. The bottleneck in the ool
calls/rets remains, which explains why there isn't much to
be gained from the lower number of insns.

Let me know if you want me to do any other data collection.

Thanks,

		Emilio

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-28 22:15 ` [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Emilio G. Cota
@ 2018-11-29 19:23   ` Richard Henderson
  2018-11-30  0:39     ` Emilio G. Cota
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Henderson @ 2018-11-29 19:23 UTC (permalink / raw)
  To: Emilio G. Cota; +Cc: qemu-devel

On 11/28/18 2:15 PM, Emilio G. Cota wrote:
> Unfortunately, it doesn't seem to help, performance-wise.

That is really disappointing, considering the size gains are huge -- even more
dramatically for non-x86 hosts.  I will see about some more benchmarking on
this for other host/guest combinations.

Thanks!


r~

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-29 19:23   ` Richard Henderson
@ 2018-11-30  0:39     ` Emilio G. Cota
  2018-11-30  3:00       ` Emilio G. Cota
  0 siblings, 1 reply; 20+ messages in thread
From: Emilio G. Cota @ 2018-11-30  0:39 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Thu, Nov 29, 2018 at 11:23:09 -0800, Richard Henderson wrote:
> On 11/28/18 2:15 PM, Emilio G. Cota wrote:
> > Unfortunately, it doesn't seem to help, performance-wise.
> 
> That is really disappointing, considering the size gains are huge -- even more
> dramatically for non-x86 hosts.  I will see about some more benchmarking on
> this for other host/guest combinations.

A64 and POWER9 host numbers:

  https://imgur.com/a/m6Pss99

There's quite a bit of noise in the P9 measurements, but it's
a shared machine so I can't do much about that.

I'll update the A64 results with error bars later tonight,
when I get further results.

		E.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-30  0:39     ` Emilio G. Cota
@ 2018-11-30  3:00       ` Emilio G. Cota
  2018-11-30  7:15         ` Laurent Desnogues
  0 siblings, 1 reply; 20+ messages in thread
From: Emilio G. Cota @ 2018-11-30  3:00 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Thu, Nov 29, 2018 at 19:39:15 -0500, Emilio G. Cota wrote:
> A64 and POWER9 host numbers:
> 
>   https://imgur.com/a/m6Pss99
> 
> There's quite a bit of noise in the P9 measurements, but it's
> a shared machine so I can't do much about that.
> 
> I'll update the A64 results with error bars later tonight,
> when I get further results.

Here they are:

  https://imgur.com/a/EAAapSW

The second image is the same results, but zoomed in. I could
bring the confidence intervals down by running this many times,
but each run takes 2h and I only have access to the
machine for a few hours at a time.

Those confidence intervals are generated from only 2 runs per benchmark,
which explains why they're so large.

		E.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-30  3:00       ` Emilio G. Cota
@ 2018-11-30  7:15         ` Laurent Desnogues
  2018-11-30 15:56           ` Emilio G. Cota
  0 siblings, 1 reply; 20+ messages in thread
From: Laurent Desnogues @ 2018-11-30  7:15 UTC (permalink / raw)
  To: Emilio G. Cota; +Cc: Richard Henderson, qemu-devel

On Fri, Nov 30, 2018 at 4:00 AM Emilio G. Cota <cota@braap.org> wrote:
>
> On Thu, Nov 29, 2018 at 19:39:15 -0500, Emilio G. Cota wrote:
> > A64 and POWER9 host numbers:
> >
> >   https://imgur.com/a/m6Pss99
> >
> > There's quite a bit of noise in the P9 measurements, but it's
> > a shared machine so I can't do much about that.
> >
> > I'll update the A64 results with error bars later tonight,
> > when I get further results.
>
> Here they are:
>
>   https://imgur.com/a/EAAapSW

What is an X-Gene A57? It's either X-Gene or A57 :-)

Thanks,

Laurent

> The second image is the same results, but zoomed in. I could
> bring the confidence intervals down by running this many times,
> but each run takes 2h and I only have access to the
> machine for a few hours at a time.
>
> Those confidence intervals are generated from only 2 runs per benchmark,
> which explains why they're so large.
>
>                 E.
>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-30  7:15         ` Laurent Desnogues
@ 2018-11-30 15:56           ` Emilio G. Cota
  0 siblings, 0 replies; 20+ messages in thread
From: Emilio G. Cota @ 2018-11-30 15:56 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: Richard Henderson, qemu-devel

On Fri, Nov 30, 2018 at 08:15:56 +0100, Laurent Desnogues wrote:
> On Fri, Nov 30, 2018 at 4:00 AM Emilio G. Cota <cota@braap.org> wrote:
> >
> > On Thu, Nov 29, 2018 at 19:39:15 -0500, Emilio G. Cota wrote:
> > > A64 and POWER9 host numbers:
> > >
> > >   https://imgur.com/a/m6Pss99
> > >
> > > There's quite a bit of noise in the P9 measurements, but it's
> > > a shared machine so I can't do much about that.
> > >
> > > I'll update the A64 results with error bars later tonight,
> > > when I get further results.
> >
> > Here they are:
> >
> >   https://imgur.com/a/EAAapSW
> 
> What is an X-Gene A57? It's either X-Gene or A57 :-)

You're right -- this is an X-Gene (xgene 1).

The A57 reference came from here:

 https://www.cloudlab.us/hardware.php
 m400 nodes: 45 per chassis, 315 total
 Processor/Chipset: Applied Micro X-Gene system-on-chip
 Eight 64-bit ARMv8 (Atlas/A57) cores at 2.4 GHz
                           ^^^

I'm not familiar with ARMv8's commercial offerings, so I
just quoted the above--which turns out to be wrong,
since A57 is an ARM design and X-Gene is not.

Thanks,

		E.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls
  2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
                   ` (12 preceding siblings ...)
  2018-11-28 22:15 ` [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Emilio G. Cota
@ 2018-12-24 21:53 ` Emilio G. Cota
  13 siblings, 0 replies; 20+ messages in thread
From: Emilio G. Cota @ 2018-12-24 21:53 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Tue, Nov 27, 2018 at 21:38:22 -0800, Richard Henderson wrote:
> The intent here is to remove several move insns putting the
> function arguments into the proper place.  I'm hoping that
> this will solve the skylake regression with spec2006, as
> seen with the ool softmmu patch set.

Reviewed-by: Emilio G. Cota <cota@braap.org>

for the series.

Thanks,

		E.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2018-12-24 21:53 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-28  5:38 [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 01/12] tcg: Add preferred_reg argument to tcg_reg_alloc Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 02/12] tcg: Add preferred_reg argument to temp_load Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 03/12] tcg: Add preferred_reg argument to temp_sync Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 04/12] tcg: Add preferred_reg argument to tcg_reg_alloc_do_movi Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 05/12] tcg: Add output_pref to TCGOp Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 06/12] tcg: Improve register allocation for matching constraints Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 07/12] tcg: Dump register preference info with liveness Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 08/12] tcg: Reindent parts of liveness_pass_1 Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 09/12] tcg: Rename and adjust liveness_pass_1 helpers Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 10/12] tcg: Split out more subroutines from liveness_pass_1 Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 11/12] tcg: Add TCG_OPF_BB_EXIT Richard Henderson
2018-11-28  5:38 ` [Qemu-devel] [PATCH 12/12] tcg: Record register preferences during liveness Richard Henderson
2018-11-28 22:15 ` [Qemu-devel] [PATCH 00/12] tcg: Improve register allocation for calls Emilio G. Cota
2018-11-29 19:23   ` Richard Henderson
2018-11-30  0:39     ` Emilio G. Cota
2018-11-30  3:00       ` Emilio G. Cota
2018-11-30  7:15         ` Laurent Desnogues
2018-11-30 15:56           ` Emilio G. Cota
2018-12-24 21:53 ` Emilio G. Cota

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.