All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes
@ 2013-08-17 23:26 Richard Henderson
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 1/4] " Richard Henderson
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Richard Henderson @ 2013-08-17 23:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

We have -- or will have -- several targets which have a native
multiply-highpart instruction: ppc*, ia64, aarch64, alpha.

If we leave only the mul[us]2 opcode with which to expose this,
we have to handle the register allocation bits in the backends.
Better, IMO, to expose the two parts at the TCG opcode level,
simplifying the backends.

I've left tcg_gen_mul[us]2_i{32,64} as the "public" interface to
these opcodes at the translator level.  If the guest does not
need both results, they can just be ignored.  If the host has a
combined mult insn (i386, arm) then one output is garbage; if
the host has separate mult insns, then the optimizer can delete
the unused opcode.

Really only tested with x86_64 and ppc64.
The linux-user-test image for alpha sees:

IN: 
0x0000004000814148:  umulh      t5,t0,t0

OP:
 ld_i32 tmp0,env,$0xffffffffffffffa8
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,ne,$0x0
 ---- 0x4000814148
 mul_i64 tmp3,ir6,ir1
 muluh_i64 ir1,ir6,ir1
 mov_i64 tmp2,tmp3
 movi_i64 pc,$0x400081414c
 exit_tb $0x0
 set_label $0x0
 exit_tb $0x3fff8c244483

OP after optimization and liveness analysis:
 ld_i32 tmp0,env,$0xffffffffffffffa8
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,ne,$0x0
 ---- 0x4000814148
 nopn $0x3,$0xd,$0x3
 muluh_i64 ir1,ir1,ir6
 nopn $0x2,$0x2
 movi_i64 pc,$0x400081414c
 exit_tb $0x0
 set_label $0x0
 exit_tb $0x3fff8c244483
 end 

OUT: [size=76]
0x6011b0f0:  lwz     r14,-88(r27)
0x6011b0f4:  cmpwi   cr7,r14,0
0x6011b0f8:  bne-    cr7,0x6011b128
0x6011b0fc:  ld      r14,8(r27)
0x6011b100:  ld      r15,48(r27)
0x6011b104:  mulhdu  r14,r14,r15
0x6011b108:  std     r14,8(r27)
...



r~


Richard Henderson (4):
  tcg: Add muluh and mulsh opcodes
  tcg-mips: Implement mulsh, muluh
  tcg-ppc64: Implement muluh, mulsh
  tcg: Constant fold div, rem

 tcg/aarch64/tcg-target.h |  4 ++++
 tcg/arm/tcg-target.h     |  2 ++
 tcg/hppa/tcg-target.h    |  2 ++
 tcg/i386/tcg-target.h    |  4 ++++
 tcg/ia64/tcg-target.h    |  4 ++++
 tcg/mips/tcg-target.c    | 10 ++++++++++
 tcg/mips/tcg-target.h    |  2 ++
 tcg/optimize.c           | 43 +++++++++++++++++++++++++++++++++++++++++++
 tcg/ppc/tcg-target.h     |  2 ++
 tcg/ppc64/tcg-target.c   | 32 +++++++-------------------------
 tcg/ppc64/tcg-target.h   |  8 ++++++--
 tcg/s390/tcg-target.h    |  4 ++++
 tcg/sparc/tcg-target.h   |  4 ++++
 tcg/tcg-op.h             | 40 ++++++++++++++++++++++++++++++++++++----
 tcg/tcg-opc.h            |  4 ++++
 tcg/tcg.c                | 36 ++++++++++++++++++++++++++++++------
 tcg/tcg.h                |  2 ++
 tcg/tci/tcg-target.h     |  5 ++++-
 18 files changed, 170 insertions(+), 38 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 1/4] tcg: Add muluh and mulsh opcodes
  2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
@ 2013-08-17 23:26 ` Richard Henderson
  2013-08-28 20:59   ` Aurelien Jarno
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh Richard Henderson
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2013-08-17 23:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Use them in places where mulu2 and muls2 are used.
Optimize mulx2 with dead low part to mulxh.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/aarch64/tcg-target.h |  4 ++++
 tcg/arm/tcg-target.h     |  2 ++
 tcg/hppa/tcg-target.h    |  2 ++
 tcg/i386/tcg-target.h    |  4 ++++
 tcg/ia64/tcg-target.h    |  4 ++++
 tcg/mips/tcg-target.h    |  2 ++
 tcg/optimize.c           | 20 ++++++++++++++++++++
 tcg/ppc/tcg-target.h     |  2 ++
 tcg/ppc64/tcg-target.h   |  4 ++++
 tcg/s390/tcg-target.h    |  4 ++++
 tcg/sparc/tcg-target.h   |  4 ++++
 tcg/tcg-op.h             | 40 ++++++++++++++++++++++++++++++++++++----
 tcg/tcg-opc.h            |  4 ++++
 tcg/tcg.c                | 36 ++++++++++++++++++++++++++++++------
 tcg/tcg.h                |  2 ++
 tcg/tci/tcg-target.h     |  5 ++++-
 16 files changed, 128 insertions(+), 11 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 51e5092..26ee28b 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -61,6 +61,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i32         0
 #define TCG_TARGET_HAS_mulu2_i32        0
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #define TCG_TARGET_HAS_div_i64          0
 #define TCG_TARGET_HAS_rem_i64          0
@@ -87,6 +89,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i64         0
 #define TCG_TARGET_HAS_mulu2_i64        0
 #define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 
 enum {
     TCG_AREG0 = TCG_REG_X19,
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 5cd9d6a..ed48092 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -80,6 +80,8 @@ extern bool use_idiv_instructions;
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
 #define TCG_TARGET_HAS_muls2_i32        1
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 #define TCG_TARGET_HAS_div_i32          use_idiv_instructions
 #define TCG_TARGET_HAS_rem_i32          0
 
diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
index 25467bd..0f6f2ff 100644
--- a/tcg/hppa/tcg-target.h
+++ b/tcg/hppa/tcg-target.h
@@ -100,6 +100,8 @@ typedef enum {
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index e3f6bb9..b7d1a55 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -96,6 +96,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i32         1
 #define TCG_TARGET_HAS_mulu2_i32        1
 #define TCG_TARGET_HAS_muls2_i32        1
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div2_i64         1
@@ -122,6 +124,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i64         1
 #define TCG_TARGET_HAS_mulu2_i64        1
 #define TCG_TARGET_HAS_muls2_i64        1
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index f32d519..ee6b2c8 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -146,6 +146,10 @@ typedef enum {
 #define TCG_TARGET_HAS_mulu2_i64        0
 #define TCG_TARGET_HAS_muls2_i32        0
 #define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i32        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
 #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index a438950..6cb7c2f 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -89,6 +89,8 @@ typedef enum {
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_muls2_i32        1
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 /* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
 #if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
diff --git a/tcg/optimize.c b/tcg/optimize.c
index b35868a..e8dedf3 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -198,6 +198,8 @@ static TCGOpcode op_to_mov(TCGOpcode op)
 
 static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
 {
+    uint64_t l64, h64;
+
     switch (op) {
     CASE_OP_32_64(add):
         return x + y;
@@ -290,6 +292,18 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
     case INDEX_op_ext32u_i64:
         return (uint32_t)x;
 
+    case INDEX_op_muluh_i32:
+        return ((uint64_t)(uint32_t)x * (uint32_t)y) >> 32;
+    case INDEX_op_mulsh_i32:
+        return ((int64_t)(int32_t)x * (int32_t)y) >> 32;
+
+    case INDEX_op_muluh_i64:
+        mulu64(&l64, &h64, x, y);
+        return h64;
+    case INDEX_op_mulsh_i64:
+        muls64(&l64, &h64, x, y);
+        return h64;
+
     default:
         fprintf(stderr,
                 "Unrecognized operation %d in do_constant_folding.\n", op);
@@ -531,6 +545,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
+        CASE_OP_32_64(muluh):
+        CASE_OP_32_64(mulsh):
             swap_commutative(args[0], &args[1], &args[2]);
             break;
         CASE_OP_32_64(brcond):
@@ -771,6 +787,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         switch (op) {
         CASE_OP_32_64(and):
         CASE_OP_32_64(mul):
+        CASE_OP_32_64(muluh):
+        CASE_OP_32_64(mulsh):
             if ((temps[args[2]].state == TCG_TEMP_CONST
                 && temps[args[2]].val == 0)) {
                 s->gen_opc_buf[op_index] = op_to_movi(op);
@@ -882,6 +900,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(eqv):
         CASE_OP_32_64(nand):
         CASE_OP_32_64(nor):
+        CASE_OP_32_64(muluh):
+        CASE_OP_32_64(mulsh):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 s->gen_opc_buf[op_index] = op_to_movi(op);
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index b42d97c..613c5ff 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -96,6 +96,8 @@ typedef enum {
 #define TCG_TARGET_HAS_deposit_i32      1
 #define TCG_TARGET_HAS_movcond_i32      1
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 48fc6e2..0789daf 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -95,6 +95,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i32         0
 #define TCG_TARGET_HAS_mulu2_i32        0
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #define TCG_TARGET_HAS_div_i64          1
 #define TCG_TARGET_HAS_rem_i64          0
@@ -118,6 +120,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i64         1
 #define TCG_TARGET_HAS_mulu2_i64        1
 #define TCG_TARGET_HAS_muls2_i64        1
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 
 #define TCG_AREG0 TCG_REG_R27
 
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index 42ca36c..b02f170 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -69,6 +69,8 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_sub2_i32         1
 #define TCG_TARGET_HAS_mulu2_i32        0
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #define TCG_TARGET_HAS_div2_i64         1
 #define TCG_TARGET_HAS_rot_i64          1
@@ -94,6 +96,8 @@ typedef enum TCGReg {
 #define TCG_TARGET_HAS_sub2_i64         1
 #define TCG_TARGET_HAS_mulu2_i64        1
 #define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 
 extern bool tcg_target_deposit_valid(int ofs, int len);
 #define TCG_TARGET_deposit_i32_valid  tcg_target_deposit_valid
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index dab52d7..1a696bc 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -107,6 +107,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i32         1
 #define TCG_TARGET_HAS_mulu2_i32        1
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_div_i64          1
@@ -134,6 +136,8 @@ typedef enum {
 #define TCG_TARGET_HAS_sub2_i64         0
 #define TCG_TARGET_HAS_mulu2_i64        0
 #define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 #endif
 
 #define TCG_AREG0 TCG_REG_I0
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 364964d..3de7545 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1039,10 +1039,18 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
     t0 = tcg_temp_new_i64();
     t1 = tcg_temp_new_i32();
 
-    tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
-                    TCGV_LOW(arg1), TCGV_LOW(arg2));
-    /* Allow the optimizer room to replace mulu2 with two moves.  */
-    tcg_gen_op0(INDEX_op_nop);
+    if (TCG_TARGET_HAS_mulu2_i32) {
+        tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
+                        TCGV_LOW(arg1), TCGV_LOW(arg2));
+        /* Allow the optimizer room to replace mulu2 with two moves.  */
+        tcg_gen_op0(INDEX_op_nop);
+    } else {
+        tcg_debug_assert(TCG_TARGET_HAS_muluh_i32);
+        tcg_gen_op3_i32(INDEX_op_mul_i32, TCGV_LOW(t0),
+                        TCGV_LOW(arg1), TCGV_LOW(arg2));
+        tcg_gen_op3_i32(INDEX_op_muluh_i32, TCGV_HIGH(t0),
+                        TCGV_LOW(arg1), TCGV_LOW(arg2));
+    }
 
     tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
     tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
@@ -2401,6 +2409,12 @@ static inline void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh,
         tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
         /* Allow the optimizer room to replace mulu2 with two moves.  */
         tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_HAS_muluh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
     } else {
         TCGv_i64 t0 = tcg_temp_new_i64();
         TCGv_i64 t1 = tcg_temp_new_i64();
@@ -2420,6 +2434,12 @@ static inline void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh,
         tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
         /* Allow the optimizer room to replace muls2 with two moves.  */
         tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_HAS_mulsh_i32) {
+        TCGv_i32 t = tcg_temp_new_i32();
+        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
+        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
+        tcg_gen_mov_i32(rl, t);
+        tcg_temp_free_i32(t);
     } else if (TCG_TARGET_REG_BITS == 32 && TCG_TARGET_HAS_mulu2_i32) {
         TCGv_i32 t0 = tcg_temp_new_i32();
         TCGv_i32 t1 = tcg_temp_new_i32();
@@ -2499,6 +2519,12 @@ static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
         tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
         /* Allow the optimizer room to replace mulu2 with two moves.  */
         tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_HAS_muluh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
     } else if (TCG_TARGET_HAS_mulu2_i64) {
         TCGv_i64 t0 = tcg_temp_new_i64();
         TCGv_i64 t1 = tcg_temp_new_i64();
@@ -2540,6 +2566,12 @@ static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
         tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
         /* Allow the optimizer room to replace muls2 with two moves.  */
         tcg_gen_op0(INDEX_op_nop);
+    } else if (TCG_TARGET_HAS_mulsh_i64) {
+        TCGv_i64 t = tcg_temp_new_i64();
+        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
+        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
+        tcg_gen_mov_i64(rl, t);
+        tcg_temp_free_i64(t);
     } else {
         TCGv_i64 t0 = tcg_temp_new_i64();
         int sizemask = 0;
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index a8af5b9..a75c29d 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -91,6 +91,8 @@ DEF(add2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_add2_i32))
 DEF(sub2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_sub2_i32))
 DEF(mulu2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_mulu2_i32))
 DEF(muls2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_muls2_i32))
+DEF(muluh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_muluh_i32))
+DEF(mulsh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i32))
 DEF(brcond2_i32, 0, 4, 2, TCG_OPF_BB_END | IMPL(TCG_TARGET_REG_BITS == 32))
 DEF(setcond2_i32, 1, 4, 1, IMPL(TCG_TARGET_REG_BITS == 32))
 
@@ -167,6 +169,8 @@ DEF(add2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_add2_i64))
 DEF(sub2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_sub2_i64))
 DEF(mulu2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulu2_i64))
 DEF(muls2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_muls2_i64))
+DEF(muluh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_muluh_i64))
+DEF(mulsh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i64))
 
 /* QEMU specific */
 #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
diff --git a/tcg/tcg.c b/tcg/tcg.c
index dac8224..75034ca 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1243,12 +1243,13 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
 static void tcg_liveness_analysis(TCGContext *s)
 {
     int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops;
-    TCGOpcode op, op_new;
+    TCGOpcode op, op_new, op_new2;
     TCGArg *args;
     const TCGOpDef *def;
     uint8_t *dead_temps, *mem_temps;
     uint16_t dead_args;
     uint8_t sync_args;
+    bool have_op_new2;
     
     s->gen_opc_ptr++; /* skip end */
 
@@ -1385,29 +1386,52 @@ static void tcg_liveness_analysis(TCGContext *s)
             goto do_not_remove;
 
         case INDEX_op_mulu2_i32:
+            op_new = INDEX_op_mul_i32;
+            op_new2 = INDEX_op_muluh_i32;
+            have_op_new2 = TCG_TARGET_HAS_muluh_i32;
+            goto do_mul2;
         case INDEX_op_muls2_i32:
             op_new = INDEX_op_mul_i32;
+            op_new2 = INDEX_op_mulsh_i32;
+            have_op_new2 = TCG_TARGET_HAS_mulsh_i32;
             goto do_mul2;
         case INDEX_op_mulu2_i64:
+            op_new = INDEX_op_mul_i64;
+            op_new2 = INDEX_op_muluh_i64;
+            have_op_new2 = TCG_TARGET_HAS_muluh_i64;
+            goto do_mul2;
         case INDEX_op_muls2_i64:
             op_new = INDEX_op_mul_i64;
+            op_new2 = INDEX_op_mulsh_i64;
+            have_op_new2 = TCG_TARGET_HAS_mulsh_i64;
+            goto do_mul2;
         do_mul2:
             args -= 4;
             nb_iargs = 2;
             nb_oargs = 2;
-            /* Likewise, test for the high part of the operation dead.  */
             if (dead_temps[args[1]] && !mem_temps[args[1]]) {
                 if (dead_temps[args[0]] && !mem_temps[args[0]]) {
+                    /* Both parts of the operation are dead.  */
                     goto do_remove;
                 }
+                /* The high part of the operation is dead; generate the low. */
                 s->gen_opc_buf[op_index] = op = op_new;
                 args[1] = args[2];
                 args[2] = args[3];
-                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
-                tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
-                /* Fall through and mark the single-word operation live.  */
-                nb_oargs = 1;
+            } else if (have_op_new2 && dead_temps[args[0]]
+                       && !mem_temps[args[0]]) {
+                /* The low part of the operation is dead; generate the high.  */
+                s->gen_opc_buf[op_index] = op = op_new2;
+                args[0] = args[1];
+                args[1] = args[2];
+                args[2] = args[3];
+            } else {
+                goto do_not_remove;
             }
+            assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
+            tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
+            /* Mark the single-word operation live.  */
+            nb_oargs = 1;
             goto do_not_remove;
 
         default:
diff --git a/tcg/tcg.h b/tcg/tcg.h
index f3f9889..3f869dd 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -85,6 +85,8 @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_sub2_i64         0
 #define TCG_TARGET_HAS_mulu2_i64        0
 #define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 /* Turn some undef macros into true macros.  */
 #define TCG_TARGET_HAS_add2_i32         1
 #define TCG_TARGET_HAS_sub2_i32         1
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index d7fc14e..ff12b4b 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -76,6 +76,8 @@
 #define TCG_TARGET_HAS_rot_i32          1
 #define TCG_TARGET_HAS_movcond_i32      0
 #define TCG_TARGET_HAS_muls2_i32        0
+#define TCG_TARGET_HAS_muluh_i32        0
+#define TCG_TARGET_HAS_mulsh_i32        0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_bswap16_i64      1
@@ -100,13 +102,14 @@
 #define TCG_TARGET_HAS_rot_i64          1
 #define TCG_TARGET_HAS_movcond_i64      0
 #define TCG_TARGET_HAS_muls2_i64        0
-
 #define TCG_TARGET_HAS_add2_i32         0
 #define TCG_TARGET_HAS_sub2_i32         0
 #define TCG_TARGET_HAS_mulu2_i32        0
 #define TCG_TARGET_HAS_add2_i64         0
 #define TCG_TARGET_HAS_sub2_i64         0
 #define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        0
+#define TCG_TARGET_HAS_mulsh_i64        0
 #endif /* TCG_TARGET_REG_BITS == 64 */
 
 /* Number of registers available.
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh
  2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 1/4] " Richard Henderson
@ 2013-08-17 23:26 ` Richard Henderson
  2013-08-28 20:59   ` Aurelien Jarno
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh Richard Henderson
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2013-08-17 23:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

With the optimization in tcg_liveness_analysis,
we can avoid the MFLO when it is unused.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/mips/tcg-target.c | 10 ++++++++++
 tcg/mips/tcg-target.h |  4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index 793532e..31cd514 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -1423,6 +1423,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
         tcg_out_opc_reg(s, OPC_MFHI, args[1], 0, 0);
         break;
+    case INDEX_op_mulsh_i32:
+        tcg_out_opc_reg(s, OPC_MULT, 0, args[1], args[2]);
+        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
+        break;
+    case INDEX_op_muluh_i32:
+        tcg_out_opc_reg(s, OPC_MULTU, 0, args[1], args[2]);
+        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
+        break;
     case INDEX_op_div_i32:
         tcg_out_opc_reg(s, OPC_DIV, 0, args[1], args[2]);
         tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
@@ -1602,6 +1610,8 @@ static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_muls2_i32, { "r", "r", "rZ", "rZ" } },
     { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rZ" } },
+    { INDEX_op_mulsh_i32, { "r", "rZ", "rZ" } },
+    { INDEX_op_muluh_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_div_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
     { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 6cb7c2f..7ef79e0 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -89,8 +89,8 @@ typedef enum {
 #define TCG_TARGET_HAS_eqv_i32          0
 #define TCG_TARGET_HAS_nand_i32         0
 #define TCG_TARGET_HAS_muls2_i32        1
-#define TCG_TARGET_HAS_muluh_i32        0
-#define TCG_TARGET_HAS_mulsh_i32        0
+#define TCG_TARGET_HAS_muluh_i32        1
+#define TCG_TARGET_HAS_mulsh_i32        1
 
 /* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
 #if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh
  2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 1/4] " Richard Henderson
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh Richard Henderson
@ 2013-08-17 23:26 ` Richard Henderson
  2013-08-28 21:00   ` Aurelien Jarno
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem Richard Henderson
  2013-08-27 21:48 ` [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
  4 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2013-08-17 23:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Using these instead of mulu2 and muls2 lets us avoid having to do
argument overlap analysis in the backend.  Normal register allocation
will DTRT.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/ppc64/tcg-target.c | 32 +++++++-------------------------
 tcg/ppc64/tcg-target.h |  8 ++++----
 2 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
index 0678de2..939f7cb 100644
--- a/tcg/ppc64/tcg-target.c
+++ b/tcg/ppc64/tcg-target.c
@@ -1975,29 +1975,11 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
         }
         break;
 
-    case INDEX_op_mulu2_i64:
-    case INDEX_op_muls2_i64:
-        {
-            int oph = (opc == INDEX_op_mulu2_i64 ? MULHDU : MULHD);
-            TCGReg outl = args[0], outh = args[1];
-            a0 = args[2], a1 = args[3];
-
-            if (outl == a0 || outl == a1) {
-                if (outh == a0 || outh == a1) {
-                    outl = TCG_REG_R0;
-                } else {
-                    tcg_out32(s, oph | TAB(outh, a0, a1));
-                    oph = 0;
-                }
-            }
-            tcg_out32(s, MULLD | TAB(outl, a0, a1));
-            if (oph != 0) {
-                tcg_out32(s, oph | TAB(outh, a0, a1));
-            }
-            if (outl != args[0]) {
-                tcg_out_mov(s, TCG_TYPE_I64, args[0], outl);
-            }
-        }
+    case INDEX_op_muluh_i64:
+        tcg_out32(s, MULHDU | TAB(args[0], args[1], args[2]));
+        break;
+    case INDEX_op_mulsh_i64:
+        tcg_out32(s, MULHD | TAB(args[0], args[1], args[2]));
         break;
 
     default:
@@ -2124,8 +2106,8 @@ static const TCGTargetOpDef ppc_op_defs[] = {
 
     { INDEX_op_add2_i64, { "r", "r", "r", "r", "rI", "rZM" } },
     { INDEX_op_sub2_i64, { "r", "r", "rI", "r", "rZM", "r" } },
-    { INDEX_op_muls2_i64, { "r", "r", "r", "r" } },
-    { INDEX_op_mulu2_i64, { "r", "r", "r", "r" } },
+    { INDEX_op_mulsh_i64, { "r", "r", "r" } },
+    { INDEX_op_muluh_i64, { "r", "r", "r" } },
 
     { -1 },
 };
diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
index 0789daf..fa4b9da 100644
--- a/tcg/ppc64/tcg-target.h
+++ b/tcg/ppc64/tcg-target.h
@@ -118,10 +118,10 @@ typedef enum {
 #define TCG_TARGET_HAS_movcond_i64      1
 #define TCG_TARGET_HAS_add2_i64         1
 #define TCG_TARGET_HAS_sub2_i64         1
-#define TCG_TARGET_HAS_mulu2_i64        1
-#define TCG_TARGET_HAS_muls2_i64        1
-#define TCG_TARGET_HAS_muluh_i64        0
-#define TCG_TARGET_HAS_mulsh_i64        0
+#define TCG_TARGET_HAS_mulu2_i64        0
+#define TCG_TARGET_HAS_muls2_i64        0
+#define TCG_TARGET_HAS_muluh_i64        1
+#define TCG_TARGET_HAS_mulsh_i64        1
 
 #define TCG_AREG0 TCG_REG_R27
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem
  2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
                   ` (2 preceding siblings ...)
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh Richard Henderson
@ 2013-08-17 23:26 ` Richard Henderson
  2013-08-28 21:02   ` Aurelien Jarno
  2013-08-27 21:48 ` [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
  4 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2013-08-17 23:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/optimize.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index e8dedf3..b29bf25 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -304,6 +304,25 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
         muls64(&l64, &h64, x, y);
         return h64;
 
+    case INDEX_op_div_i32:
+        /* Avoid crashing on divide by zero, otherwise undefined.  */
+        return (int32_t)x / ((int32_t)y ? : 1);
+    case INDEX_op_divu_i32:
+        return (uint32_t)x / ((uint32_t)y ? : 1);
+    case INDEX_op_div_i64:
+        return (int64_t)x / ((int64_t)y ? : 1);
+    case INDEX_op_divu_i64:
+        return (uint64_t)x / ((uint64_t)y ? : 1);
+
+    case INDEX_op_rem_i32:
+        return (int32_t)x % ((int32_t)y ? : 1);
+    case INDEX_op_remu_i32:
+        return (uint32_t)x % ((uint32_t)y ? : 1);
+    case INDEX_op_rem_i64:
+        return (int64_t)x % ((int64_t)y ? : 1);
+    case INDEX_op_remu_i64:
+        return (uint64_t)x % ((uint64_t)y ? : 1);
+
     default:
         fprintf(stderr,
                 "Unrecognized operation %d in do_constant_folding.\n", op);
@@ -902,6 +921,10 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         CASE_OP_32_64(nor):
         CASE_OP_32_64(muluh):
         CASE_OP_32_64(mulsh):
+        CASE_OP_32_64(div):
+        CASE_OP_32_64(divu):
+        CASE_OP_32_64(rem):
+        CASE_OP_32_64(remu):
             if (temps[args[1]].state == TCG_TEMP_CONST
                 && temps[args[2]].state == TCG_TEMP_CONST) {
                 s->gen_opc_buf[op_index] = op_to_movi(op);
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes
  2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
                   ` (3 preceding siblings ...)
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem Richard Henderson
@ 2013-08-27 21:48 ` Richard Henderson
  4 siblings, 0 replies; 10+ messages in thread
From: Richard Henderson @ 2013-08-27 21:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Ping.

r~

On 08/17/2013 04:26 PM, Richard Henderson wrote:
> We have -- or will have -- several targets which have a native
> multiply-highpart instruction: ppc*, ia64, aarch64, alpha.
> 
> If we leave only the mul[us]2 opcode with which to expose this,
> we have to handle the register allocation bits in the backends.
> Better, IMO, to expose the two parts at the TCG opcode level,
> simplifying the backends.
> 
> I've left tcg_gen_mul[us]2_i{32,64} as the "public" interface to
> these opcodes at the translator level.  If the guest does not
> need both results, they can just be ignored.  If the host has a
> combined mult insn (i386, arm) then one output is garbage; if
> the host has separate mult insns, then the optimizer can delete
> the unused opcode.
> 
> Really only tested with x86_64 and ppc64.
> The linux-user-test image for alpha sees:
> 
> IN: 
> 0x0000004000814148:  umulh      t5,t0,t0
> 
> OP:
>  ld_i32 tmp0,env,$0xffffffffffffffa8
>  movi_i32 tmp1,$0x0
>  brcond_i32 tmp0,tmp1,ne,$0x0
>  ---- 0x4000814148
>  mul_i64 tmp3,ir6,ir1
>  muluh_i64 ir1,ir6,ir1
>  mov_i64 tmp2,tmp3
>  movi_i64 pc,$0x400081414c
>  exit_tb $0x0
>  set_label $0x0
>  exit_tb $0x3fff8c244483
> 
> OP after optimization and liveness analysis:
>  ld_i32 tmp0,env,$0xffffffffffffffa8
>  movi_i32 tmp1,$0x0
>  brcond_i32 tmp0,tmp1,ne,$0x0
>  ---- 0x4000814148
>  nopn $0x3,$0xd,$0x3
>  muluh_i64 ir1,ir1,ir6
>  nopn $0x2,$0x2
>  movi_i64 pc,$0x400081414c
>  exit_tb $0x0
>  set_label $0x0
>  exit_tb $0x3fff8c244483
>  end 
> 
> OUT: [size=76]
> 0x6011b0f0:  lwz     r14,-88(r27)
> 0x6011b0f4:  cmpwi   cr7,r14,0
> 0x6011b0f8:  bne-    cr7,0x6011b128
> 0x6011b0fc:  ld      r14,8(r27)
> 0x6011b100:  ld      r15,48(r27)
> 0x6011b104:  mulhdu  r14,r14,r15
> 0x6011b108:  std     r14,8(r27)
> ...
> 
> 
> 
> r~
> 
> 
> Richard Henderson (4):
>   tcg: Add muluh and mulsh opcodes
>   tcg-mips: Implement mulsh, muluh
>   tcg-ppc64: Implement muluh, mulsh
>   tcg: Constant fold div, rem
> 
>  tcg/aarch64/tcg-target.h |  4 ++++
>  tcg/arm/tcg-target.h     |  2 ++
>  tcg/hppa/tcg-target.h    |  2 ++
>  tcg/i386/tcg-target.h    |  4 ++++
>  tcg/ia64/tcg-target.h    |  4 ++++
>  tcg/mips/tcg-target.c    | 10 ++++++++++
>  tcg/mips/tcg-target.h    |  2 ++
>  tcg/optimize.c           | 43 +++++++++++++++++++++++++++++++++++++++++++
>  tcg/ppc/tcg-target.h     |  2 ++
>  tcg/ppc64/tcg-target.c   | 32 +++++++-------------------------
>  tcg/ppc64/tcg-target.h   |  8 ++++++--
>  tcg/s390/tcg-target.h    |  4 ++++
>  tcg/sparc/tcg-target.h   |  4 ++++
>  tcg/tcg-op.h             | 40 ++++++++++++++++++++++++++++++++++++----
>  tcg/tcg-opc.h            |  4 ++++
>  tcg/tcg.c                | 36 ++++++++++++++++++++++++++++++------
>  tcg/tcg.h                |  2 ++
>  tcg/tci/tcg-target.h     |  5 ++++-
>  18 files changed, 170 insertions(+), 38 deletions(-)
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] tcg: Add muluh and mulsh opcodes
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 1/4] " Richard Henderson
@ 2013-08-28 20:59   ` Aurelien Jarno
  0 siblings, 0 replies; 10+ messages in thread
From: Aurelien Jarno @ 2013-08-28 20:59 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Sat, Aug 17, 2013 at 04:26:43PM -0700, Richard Henderson wrote:
> Use them in places where mulu2 and muls2 are used.
> Optimize mulx2 with dead low part to mulxh.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/aarch64/tcg-target.h |  4 ++++
>  tcg/arm/tcg-target.h     |  2 ++
>  tcg/hppa/tcg-target.h    |  2 ++
>  tcg/i386/tcg-target.h    |  4 ++++
>  tcg/ia64/tcg-target.h    |  4 ++++
>  tcg/mips/tcg-target.h    |  2 ++
>  tcg/optimize.c           | 20 ++++++++++++++++++++
>  tcg/ppc/tcg-target.h     |  2 ++
>  tcg/ppc64/tcg-target.h   |  4 ++++
>  tcg/s390/tcg-target.h    |  4 ++++
>  tcg/sparc/tcg-target.h   |  4 ++++
>  tcg/tcg-op.h             | 40 ++++++++++++++++++++++++++++++++++++----
>  tcg/tcg-opc.h            |  4 ++++
>  tcg/tcg.c                | 36 ++++++++++++++++++++++++++++++------
>  tcg/tcg.h                |  2 ++
>  tcg/tci/tcg-target.h     |  5 ++++-
>  16 files changed, 128 insertions(+), 11 deletions(-)
> 
> diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
> index 51e5092..26ee28b 100644
> --- a/tcg/aarch64/tcg-target.h
> +++ b/tcg/aarch64/tcg-target.h
> @@ -61,6 +61,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i32         0
>  #define TCG_TARGET_HAS_mulu2_i32        0
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #define TCG_TARGET_HAS_div_i64          0
>  #define TCG_TARGET_HAS_rem_i64          0
> @@ -87,6 +89,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i64         0
>  #define TCG_TARGET_HAS_mulu2_i64        0
>  #define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  
>  enum {
>      TCG_AREG0 = TCG_REG_X19,
> diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
> index 5cd9d6a..ed48092 100644
> --- a/tcg/arm/tcg-target.h
> +++ b/tcg/arm/tcg-target.h
> @@ -80,6 +80,8 @@ extern bool use_idiv_instructions;
>  #define TCG_TARGET_HAS_deposit_i32      1
>  #define TCG_TARGET_HAS_movcond_i32      1
>  #define TCG_TARGET_HAS_muls2_i32        1
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  #define TCG_TARGET_HAS_div_i32          use_idiv_instructions
>  #define TCG_TARGET_HAS_rem_i32          0
>  
> diff --git a/tcg/hppa/tcg-target.h b/tcg/hppa/tcg-target.h
> index 25467bd..0f6f2ff 100644
> --- a/tcg/hppa/tcg-target.h
> +++ b/tcg/hppa/tcg-target.h
> @@ -100,6 +100,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_deposit_i32      1
>  #define TCG_TARGET_HAS_movcond_i32      1
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  /* optional instructions automatically implemented */
>  #define TCG_TARGET_HAS_neg_i32          0 /* sub rd, 0, rs */
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index e3f6bb9..b7d1a55 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -96,6 +96,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i32         1
>  #define TCG_TARGET_HAS_mulu2_i32        1
>  #define TCG_TARGET_HAS_muls2_i32        1
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #if TCG_TARGET_REG_BITS == 64
>  #define TCG_TARGET_HAS_div2_i64         1
> @@ -122,6 +124,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i64         1
>  #define TCG_TARGET_HAS_mulu2_i64        1
>  #define TCG_TARGET_HAS_muls2_i64        1
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  #endif
>  
>  #define TCG_TARGET_deposit_i32_valid(ofs, len) \
> diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
> index f32d519..ee6b2c8 100644
> --- a/tcg/ia64/tcg-target.h
> +++ b/tcg/ia64/tcg-target.h
> @@ -146,6 +146,10 @@ typedef enum {
>  #define TCG_TARGET_HAS_mulu2_i64        0
>  #define TCG_TARGET_HAS_muls2_i32        0
>  #define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  
>  #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
>  #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
> diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
> index a438950..6cb7c2f 100644
> --- a/tcg/mips/tcg-target.h
> +++ b/tcg/mips/tcg-target.h
> @@ -89,6 +89,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_eqv_i32          0
>  #define TCG_TARGET_HAS_nand_i32         0
>  #define TCG_TARGET_HAS_muls2_i32        1
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  /* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
>  #if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index b35868a..e8dedf3 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -198,6 +198,8 @@ static TCGOpcode op_to_mov(TCGOpcode op)
>  
>  static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
>  {
> +    uint64_t l64, h64;
> +
>      switch (op) {
>      CASE_OP_32_64(add):
>          return x + y;
> @@ -290,6 +292,18 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
>      case INDEX_op_ext32u_i64:
>          return (uint32_t)x;
>  
> +    case INDEX_op_muluh_i32:
> +        return ((uint64_t)(uint32_t)x * (uint32_t)y) >> 32;
> +    case INDEX_op_mulsh_i32:
> +        return ((int64_t)(int32_t)x * (int32_t)y) >> 32;
> +
> +    case INDEX_op_muluh_i64:
> +        mulu64(&l64, &h64, x, y);
> +        return h64;
> +    case INDEX_op_mulsh_i64:
> +        muls64(&l64, &h64, x, y);
> +        return h64;
> +
>      default:
>          fprintf(stderr,
>                  "Unrecognized operation %d in do_constant_folding.\n", op);
> @@ -531,6 +545,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>          CASE_OP_32_64(eqv):
>          CASE_OP_32_64(nand):
>          CASE_OP_32_64(nor):
> +        CASE_OP_32_64(muluh):
> +        CASE_OP_32_64(mulsh):
>              swap_commutative(args[0], &args[1], &args[2]);
>              break;
>          CASE_OP_32_64(brcond):
> @@ -771,6 +787,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>          switch (op) {
>          CASE_OP_32_64(and):
>          CASE_OP_32_64(mul):
> +        CASE_OP_32_64(muluh):
> +        CASE_OP_32_64(mulsh):
>              if ((temps[args[2]].state == TCG_TEMP_CONST
>                  && temps[args[2]].val == 0)) {
>                  s->gen_opc_buf[op_index] = op_to_movi(op);
> @@ -882,6 +900,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>          CASE_OP_32_64(eqv):
>          CASE_OP_32_64(nand):
>          CASE_OP_32_64(nor):
> +        CASE_OP_32_64(muluh):
> +        CASE_OP_32_64(mulsh):
>              if (temps[args[1]].state == TCG_TEMP_CONST
>                  && temps[args[2]].state == TCG_TEMP_CONST) {
>                  s->gen_opc_buf[op_index] = op_to_movi(op);
> diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
> index b42d97c..613c5ff 100644
> --- a/tcg/ppc/tcg-target.h
> +++ b/tcg/ppc/tcg-target.h
> @@ -96,6 +96,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_deposit_i32      1
>  #define TCG_TARGET_HAS_movcond_i32      1
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #define TCG_AREG0 TCG_REG_R27
>  
> diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
> index 48fc6e2..0789daf 100644
> --- a/tcg/ppc64/tcg-target.h
> +++ b/tcg/ppc64/tcg-target.h
> @@ -95,6 +95,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i32         0
>  #define TCG_TARGET_HAS_mulu2_i32        0
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #define TCG_TARGET_HAS_div_i64          1
>  #define TCG_TARGET_HAS_rem_i64          0
> @@ -118,6 +120,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i64         1
>  #define TCG_TARGET_HAS_mulu2_i64        1
>  #define TCG_TARGET_HAS_muls2_i64        1
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  
>  #define TCG_AREG0 TCG_REG_R27
>  
> diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
> index 42ca36c..b02f170 100644
> --- a/tcg/s390/tcg-target.h
> +++ b/tcg/s390/tcg-target.h
> @@ -69,6 +69,8 @@ typedef enum TCGReg {
>  #define TCG_TARGET_HAS_sub2_i32         1
>  #define TCG_TARGET_HAS_mulu2_i32        0
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #define TCG_TARGET_HAS_div2_i64         1
>  #define TCG_TARGET_HAS_rot_i64          1
> @@ -94,6 +96,8 @@ typedef enum TCGReg {
>  #define TCG_TARGET_HAS_sub2_i64         1
>  #define TCG_TARGET_HAS_mulu2_i64        1
>  #define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  
>  extern bool tcg_target_deposit_valid(int ofs, int len);
>  #define TCG_TARGET_deposit_i32_valid  tcg_target_deposit_valid
> diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
> index dab52d7..1a696bc 100644
> --- a/tcg/sparc/tcg-target.h
> +++ b/tcg/sparc/tcg-target.h
> @@ -107,6 +107,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i32         1
>  #define TCG_TARGET_HAS_mulu2_i32        1
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #if TCG_TARGET_REG_BITS == 64
>  #define TCG_TARGET_HAS_div_i64          1
> @@ -134,6 +136,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_sub2_i64         0
>  #define TCG_TARGET_HAS_mulu2_i64        0
>  #define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  #endif
>  
>  #define TCG_AREG0 TCG_REG_I0
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 364964d..3de7545 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -1039,10 +1039,18 @@ static inline void tcg_gen_mul_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
>      t0 = tcg_temp_new_i64();
>      t1 = tcg_temp_new_i32();
>  
> -    tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
> -                    TCGV_LOW(arg1), TCGV_LOW(arg2));
> -    /* Allow the optimizer room to replace mulu2 with two moves.  */
> -    tcg_gen_op0(INDEX_op_nop);
> +    if (TCG_TARGET_HAS_mulu2_i32) {
> +        tcg_gen_op4_i32(INDEX_op_mulu2_i32, TCGV_LOW(t0), TCGV_HIGH(t0),
> +                        TCGV_LOW(arg1), TCGV_LOW(arg2));
> +        /* Allow the optimizer room to replace mulu2 with two moves.  */
> +        tcg_gen_op0(INDEX_op_nop);
> +    } else {
> +        tcg_debug_assert(TCG_TARGET_HAS_muluh_i32);
> +        tcg_gen_op3_i32(INDEX_op_mul_i32, TCGV_LOW(t0),
> +                        TCGV_LOW(arg1), TCGV_LOW(arg2));
> +        tcg_gen_op3_i32(INDEX_op_muluh_i32, TCGV_HIGH(t0),
> +                        TCGV_LOW(arg1), TCGV_LOW(arg2));
> +    }
>  
>      tcg_gen_mul_i32(t1, TCGV_LOW(arg1), TCGV_HIGH(arg2));
>      tcg_gen_add_i32(TCGV_HIGH(t0), TCGV_HIGH(t0), t1);
> @@ -2401,6 +2409,12 @@ static inline void tcg_gen_mulu2_i32(TCGv_i32 rl, TCGv_i32 rh,
>          tcg_gen_op4_i32(INDEX_op_mulu2_i32, rl, rh, arg1, arg2);
>          /* Allow the optimizer room to replace mulu2 with two moves.  */
>          tcg_gen_op0(INDEX_op_nop);
> +    } else if (TCG_TARGET_HAS_muluh_i32) {
> +        TCGv_i32 t = tcg_temp_new_i32();
> +        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
> +        tcg_gen_op3_i32(INDEX_op_muluh_i32, rh, arg1, arg2);
> +        tcg_gen_mov_i32(rl, t);
> +        tcg_temp_free_i32(t);
>      } else {
>          TCGv_i64 t0 = tcg_temp_new_i64();
>          TCGv_i64 t1 = tcg_temp_new_i64();
> @@ -2420,6 +2434,12 @@ static inline void tcg_gen_muls2_i32(TCGv_i32 rl, TCGv_i32 rh,
>          tcg_gen_op4_i32(INDEX_op_muls2_i32, rl, rh, arg1, arg2);
>          /* Allow the optimizer room to replace muls2 with two moves.  */
>          tcg_gen_op0(INDEX_op_nop);
> +    } else if (TCG_TARGET_HAS_mulsh_i32) {
> +        TCGv_i32 t = tcg_temp_new_i32();
> +        tcg_gen_op3_i32(INDEX_op_mul_i32, t, arg1, arg2);
> +        tcg_gen_op3_i32(INDEX_op_mulsh_i32, rh, arg1, arg2);
> +        tcg_gen_mov_i32(rl, t);
> +        tcg_temp_free_i32(t);
>      } else if (TCG_TARGET_REG_BITS == 32 && TCG_TARGET_HAS_mulu2_i32) {
>          TCGv_i32 t0 = tcg_temp_new_i32();
>          TCGv_i32 t1 = tcg_temp_new_i32();
> @@ -2499,6 +2519,12 @@ static inline void tcg_gen_mulu2_i64(TCGv_i64 rl, TCGv_i64 rh,
>          tcg_gen_op4_i64(INDEX_op_mulu2_i64, rl, rh, arg1, arg2);
>          /* Allow the optimizer room to replace mulu2 with two moves.  */
>          tcg_gen_op0(INDEX_op_nop);
> +    } else if (TCG_TARGET_HAS_muluh_i64) {
> +        TCGv_i64 t = tcg_temp_new_i64();
> +        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
> +        tcg_gen_op3_i64(INDEX_op_muluh_i64, rh, arg1, arg2);
> +        tcg_gen_mov_i64(rl, t);
> +        tcg_temp_free_i64(t);
>      } else if (TCG_TARGET_HAS_mulu2_i64) {
>          TCGv_i64 t0 = tcg_temp_new_i64();
>          TCGv_i64 t1 = tcg_temp_new_i64();
> @@ -2540,6 +2566,12 @@ static inline void tcg_gen_muls2_i64(TCGv_i64 rl, TCGv_i64 rh,
>          tcg_gen_op4_i64(INDEX_op_muls2_i64, rl, rh, arg1, arg2);
>          /* Allow the optimizer room to replace muls2 with two moves.  */
>          tcg_gen_op0(INDEX_op_nop);
> +    } else if (TCG_TARGET_HAS_mulsh_i64) {
> +        TCGv_i64 t = tcg_temp_new_i64();
> +        tcg_gen_op3_i64(INDEX_op_mul_i64, t, arg1, arg2);
> +        tcg_gen_op3_i64(INDEX_op_mulsh_i64, rh, arg1, arg2);
> +        tcg_gen_mov_i64(rl, t);
> +        tcg_temp_free_i64(t);
>      } else {
>          TCGv_i64 t0 = tcg_temp_new_i64();
>          int sizemask = 0;
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index a8af5b9..a75c29d 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -91,6 +91,8 @@ DEF(add2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_add2_i32))
>  DEF(sub2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_sub2_i32))
>  DEF(mulu2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_mulu2_i32))
>  DEF(muls2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_muls2_i32))
> +DEF(muluh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_muluh_i32))
> +DEF(mulsh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i32))
>  DEF(brcond2_i32, 0, 4, 2, TCG_OPF_BB_END | IMPL(TCG_TARGET_REG_BITS == 32))
>  DEF(setcond2_i32, 1, 4, 1, IMPL(TCG_TARGET_REG_BITS == 32))
>  
> @@ -167,6 +169,8 @@ DEF(add2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_add2_i64))
>  DEF(sub2_i64, 2, 4, 0, IMPL64 | IMPL(TCG_TARGET_HAS_sub2_i64))
>  DEF(mulu2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_mulu2_i64))
>  DEF(muls2_i64, 2, 2, 0, IMPL64 | IMPL(TCG_TARGET_HAS_muls2_i64))
> +DEF(muluh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_muluh_i64))
> +DEF(mulsh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i64))
>  
>  /* QEMU specific */
>  #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index dac8224..75034ca 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1243,12 +1243,13 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
>  static void tcg_liveness_analysis(TCGContext *s)
>  {
>      int i, op_index, nb_args, nb_iargs, nb_oargs, arg, nb_ops;
> -    TCGOpcode op, op_new;
> +    TCGOpcode op, op_new, op_new2;
>      TCGArg *args;
>      const TCGOpDef *def;
>      uint8_t *dead_temps, *mem_temps;
>      uint16_t dead_args;
>      uint8_t sync_args;
> +    bool have_op_new2;
>      
>      s->gen_opc_ptr++; /* skip end */
>  
> @@ -1385,29 +1386,52 @@ static void tcg_liveness_analysis(TCGContext *s)
>              goto do_not_remove;
>  
>          case INDEX_op_mulu2_i32:
> +            op_new = INDEX_op_mul_i32;
> +            op_new2 = INDEX_op_muluh_i32;
> +            have_op_new2 = TCG_TARGET_HAS_muluh_i32;
> +            goto do_mul2;
>          case INDEX_op_muls2_i32:
>              op_new = INDEX_op_mul_i32;
> +            op_new2 = INDEX_op_mulsh_i32;
> +            have_op_new2 = TCG_TARGET_HAS_mulsh_i32;
>              goto do_mul2;
>          case INDEX_op_mulu2_i64:
> +            op_new = INDEX_op_mul_i64;
> +            op_new2 = INDEX_op_muluh_i64;
> +            have_op_new2 = TCG_TARGET_HAS_muluh_i64;
> +            goto do_mul2;
>          case INDEX_op_muls2_i64:
>              op_new = INDEX_op_mul_i64;
> +            op_new2 = INDEX_op_mulsh_i64;
> +            have_op_new2 = TCG_TARGET_HAS_mulsh_i64;
> +            goto do_mul2;
>          do_mul2:
>              args -= 4;
>              nb_iargs = 2;
>              nb_oargs = 2;
> -            /* Likewise, test for the high part of the operation dead.  */
>              if (dead_temps[args[1]] && !mem_temps[args[1]]) {
>                  if (dead_temps[args[0]] && !mem_temps[args[0]]) {
> +                    /* Both parts of the operation are dead.  */
>                      goto do_remove;
>                  }
> +                /* The high part of the operation is dead; generate the low. */
>                  s->gen_opc_buf[op_index] = op = op_new;
>                  args[1] = args[2];
>                  args[2] = args[3];
> -                assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
> -                tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
> -                /* Fall through and mark the single-word operation live.  */
> -                nb_oargs = 1;
> +            } else if (have_op_new2 && dead_temps[args[0]]
> +                       && !mem_temps[args[0]]) {
> +                /* The low part of the operation is dead; generate the high.  */
> +                s->gen_opc_buf[op_index] = op = op_new2;
> +                args[0] = args[1];
> +                args[1] = args[2];
> +                args[2] = args[3];
> +            } else {
> +                goto do_not_remove;
>              }
> +            assert(s->gen_opc_buf[op_index + 1] == INDEX_op_nop);
> +            tcg_set_nop(s, s->gen_opc_buf + op_index + 1, args + 3, 1);
> +            /* Mark the single-word operation live.  */
> +            nb_oargs = 1;
>              goto do_not_remove;
>  
>          default:
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index f3f9889..3f869dd 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -85,6 +85,8 @@ typedef uint64_t TCGRegSet;
>  #define TCG_TARGET_HAS_sub2_i64         0
>  #define TCG_TARGET_HAS_mulu2_i64        0
>  #define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  /* Turn some undef macros into true macros.  */
>  #define TCG_TARGET_HAS_add2_i32         1
>  #define TCG_TARGET_HAS_sub2_i32         1
> diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
> index d7fc14e..ff12b4b 100644
> --- a/tcg/tci/tcg-target.h
> +++ b/tcg/tci/tcg-target.h
> @@ -76,6 +76,8 @@
>  #define TCG_TARGET_HAS_rot_i32          1
>  #define TCG_TARGET_HAS_movcond_i32      0
>  #define TCG_TARGET_HAS_muls2_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        0
> +#define TCG_TARGET_HAS_mulsh_i32        0
>  
>  #if TCG_TARGET_REG_BITS == 64
>  #define TCG_TARGET_HAS_bswap16_i64      1
> @@ -100,13 +102,14 @@
>  #define TCG_TARGET_HAS_rot_i64          1
>  #define TCG_TARGET_HAS_movcond_i64      0
>  #define TCG_TARGET_HAS_muls2_i64        0
> -
>  #define TCG_TARGET_HAS_add2_i32         0
>  #define TCG_TARGET_HAS_sub2_i32         0
>  #define TCG_TARGET_HAS_mulu2_i32        0
>  #define TCG_TARGET_HAS_add2_i64         0
>  #define TCG_TARGET_HAS_sub2_i64         0
>  #define TCG_TARGET_HAS_mulu2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        0
> +#define TCG_TARGET_HAS_mulsh_i64        0
>  #endif /* TCG_TARGET_REG_BITS == 64 */
>  
>  /* Number of registers available.

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh Richard Henderson
@ 2013-08-28 20:59   ` Aurelien Jarno
  0 siblings, 0 replies; 10+ messages in thread
From: Aurelien Jarno @ 2013-08-28 20:59 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Sat, Aug 17, 2013 at 04:26:44PM -0700, Richard Henderson wrote:
> With the optimization in tcg_liveness_analysis,
> we can avoid the MFLO when it is unused.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/mips/tcg-target.c | 10 ++++++++++
>  tcg/mips/tcg-target.h |  4 ++--
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
> index 793532e..31cd514 100644
> --- a/tcg/mips/tcg-target.c
> +++ b/tcg/mips/tcg-target.c
> @@ -1423,6 +1423,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
>          tcg_out_opc_reg(s, OPC_MFHI, args[1], 0, 0);
>          break;
> +    case INDEX_op_mulsh_i32:
> +        tcg_out_opc_reg(s, OPC_MULT, 0, args[1], args[2]);
> +        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
> +        break;
> +    case INDEX_op_muluh_i32:
> +        tcg_out_opc_reg(s, OPC_MULTU, 0, args[1], args[2]);
> +        tcg_out_opc_reg(s, OPC_MFHI, args[0], 0, 0);
> +        break;
>      case INDEX_op_div_i32:
>          tcg_out_opc_reg(s, OPC_DIV, 0, args[1], args[2]);
>          tcg_out_opc_reg(s, OPC_MFLO, args[0], 0, 0);
> @@ -1602,6 +1610,8 @@ static const TCGTargetOpDef mips_op_defs[] = {
>      { INDEX_op_mul_i32, { "r", "rZ", "rZ" } },
>      { INDEX_op_muls2_i32, { "r", "r", "rZ", "rZ" } },
>      { INDEX_op_mulu2_i32, { "r", "r", "rZ", "rZ" } },
> +    { INDEX_op_mulsh_i32, { "r", "rZ", "rZ" } },
> +    { INDEX_op_muluh_i32, { "r", "rZ", "rZ" } },
>      { INDEX_op_div_i32, { "r", "rZ", "rZ" } },
>      { INDEX_op_divu_i32, { "r", "rZ", "rZ" } },
>      { INDEX_op_rem_i32, { "r", "rZ", "rZ" } },
> diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
> index 6cb7c2f..7ef79e0 100644
> --- a/tcg/mips/tcg-target.h
> +++ b/tcg/mips/tcg-target.h
> @@ -89,8 +89,8 @@ typedef enum {
>  #define TCG_TARGET_HAS_eqv_i32          0
>  #define TCG_TARGET_HAS_nand_i32         0
>  #define TCG_TARGET_HAS_muls2_i32        1
> -#define TCG_TARGET_HAS_muluh_i32        0
> -#define TCG_TARGET_HAS_mulsh_i32        0
> +#define TCG_TARGET_HAS_muluh_i32        1
> +#define TCG_TARGET_HAS_mulsh_i32        1
>  
>  /* optional instructions only implemented on MIPS4, MIPS32 and Loongson 2 */
>  #if (defined(__mips_isa_rev) && (__mips_isa_rev >= 1)) || \

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh Richard Henderson
@ 2013-08-28 21:00   ` Aurelien Jarno
  0 siblings, 0 replies; 10+ messages in thread
From: Aurelien Jarno @ 2013-08-28 21:00 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Sat, Aug 17, 2013 at 04:26:45PM -0700, Richard Henderson wrote:
> Using these instead of mulu2 and muls2 lets us avoid having to do argument
> overlap analysis in the backend.  Normal register allocation will DTRT.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/ppc64/tcg-target.c | 32 +++++++-------------------------
>  tcg/ppc64/tcg-target.h |  8 ++++----
>  2 files changed, 11 insertions(+), 29 deletions(-)
> 
> diff --git a/tcg/ppc64/tcg-target.c b/tcg/ppc64/tcg-target.c
> index 0678de2..939f7cb 100644
> --- a/tcg/ppc64/tcg-target.c
> +++ b/tcg/ppc64/tcg-target.c
> @@ -1975,29 +1975,11 @@ static void tcg_out_op (TCGContext *s, TCGOpcode opc, const TCGArg *args,
>          }
>          break;
>  
> -    case INDEX_op_mulu2_i64:
> -    case INDEX_op_muls2_i64:
> -        {
> -            int oph = (opc == INDEX_op_mulu2_i64 ? MULHDU : MULHD);
> -            TCGReg outl = args[0], outh = args[1];
> -            a0 = args[2], a1 = args[3];
> -
> -            if (outl == a0 || outl == a1) {
> -                if (outh == a0 || outh == a1) {
> -                    outl = TCG_REG_R0;
> -                } else {
> -                    tcg_out32(s, oph | TAB(outh, a0, a1));
> -                    oph = 0;
> -                }
> -            }
> -            tcg_out32(s, MULLD | TAB(outl, a0, a1));
> -            if (oph != 0) {
> -                tcg_out32(s, oph | TAB(outh, a0, a1));
> -            }
> -            if (outl != args[0]) {
> -                tcg_out_mov(s, TCG_TYPE_I64, args[0], outl);
> -            }
> -        }
> +    case INDEX_op_muluh_i64:
> +        tcg_out32(s, MULHDU | TAB(args[0], args[1], args[2]));
> +        break;
> +    case INDEX_op_mulsh_i64:
> +        tcg_out32(s, MULHD | TAB(args[0], args[1], args[2]));
>          break;
>  
>      default:
> @@ -2124,8 +2106,8 @@ static const TCGTargetOpDef ppc_op_defs[] = {
>  
>      { INDEX_op_add2_i64, { "r", "r", "r", "r", "rI", "rZM" } },
>      { INDEX_op_sub2_i64, { "r", "r", "rI", "r", "rZM", "r" } },
> -    { INDEX_op_muls2_i64, { "r", "r", "r", "r" } },
> -    { INDEX_op_mulu2_i64, { "r", "r", "r", "r" } },
> +    { INDEX_op_mulsh_i64, { "r", "r", "r" } },
> +    { INDEX_op_muluh_i64, { "r", "r", "r" } },
>  
>      { -1 },
>  };
> diff --git a/tcg/ppc64/tcg-target.h b/tcg/ppc64/tcg-target.h
> index 0789daf..fa4b9da 100644
> --- a/tcg/ppc64/tcg-target.h
> +++ b/tcg/ppc64/tcg-target.h
> @@ -118,10 +118,10 @@ typedef enum {
>  #define TCG_TARGET_HAS_movcond_i64      1
>  #define TCG_TARGET_HAS_add2_i64         1
>  #define TCG_TARGET_HAS_sub2_i64         1
> -#define TCG_TARGET_HAS_mulu2_i64        1
> -#define TCG_TARGET_HAS_muls2_i64        1
> -#define TCG_TARGET_HAS_muluh_i64        0
> -#define TCG_TARGET_HAS_mulsh_i64        0
> +#define TCG_TARGET_HAS_mulu2_i64        0
> +#define TCG_TARGET_HAS_muls2_i64        0
> +#define TCG_TARGET_HAS_muluh_i64        1
> +#define TCG_TARGET_HAS_mulsh_i64        1
>  
>  #define TCG_AREG0 TCG_REG_R27
>  

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>
 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem
  2013-08-17 23:26 ` [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem Richard Henderson
@ 2013-08-28 21:02   ` Aurelien Jarno
  0 siblings, 0 replies; 10+ messages in thread
From: Aurelien Jarno @ 2013-08-28 21:02 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Sat, Aug 17, 2013 at 04:26:46PM -0700, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/optimize.c | 23 +++++++++++++++++++++++
>  1 file changed, 23 insertions(+)
> 
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index e8dedf3..b29bf25 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -304,6 +304,25 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
>          muls64(&l64, &h64, x, y);
>          return h64;
>  
> +    case INDEX_op_div_i32:
> +        /* Avoid crashing on divide by zero, otherwise undefined.  */
> +        return (int32_t)x / ((int32_t)y ? : 1);
> +    case INDEX_op_divu_i32:
> +        return (uint32_t)x / ((uint32_t)y ? : 1);
> +    case INDEX_op_div_i64:
> +        return (int64_t)x / ((int64_t)y ? : 1);
> +    case INDEX_op_divu_i64:
> +        return (uint64_t)x / ((uint64_t)y ? : 1);
> +
> +    case INDEX_op_rem_i32:
> +        return (int32_t)x % ((int32_t)y ? : 1);
> +    case INDEX_op_remu_i32:
> +        return (uint32_t)x % ((uint32_t)y ? : 1);
> +    case INDEX_op_rem_i64:
> +        return (int64_t)x % ((int64_t)y ? : 1);
> +    case INDEX_op_remu_i64:
> +        return (uint64_t)x % ((uint64_t)y ? : 1);
> +
>      default:
>          fprintf(stderr,
>                  "Unrecognized operation %d in do_constant_folding.\n", op);
> @@ -902,6 +921,10 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
>          CASE_OP_32_64(nor):
>          CASE_OP_32_64(muluh):
>          CASE_OP_32_64(mulsh):
> +        CASE_OP_32_64(div):
> +        CASE_OP_32_64(divu):
> +        CASE_OP_32_64(rem):
> +        CASE_OP_32_64(remu):
>              if (temps[args[1]].state == TCG_TEMP_CONST
>                  && temps[args[2]].state == TCG_TEMP_CONST) {
>                  s->gen_opc_buf[op_index] = op_to_movi(op);

This looks fine to me, though I haven't been able to trigger this code
path. So:

Reviewed-by: Aurelien Jarno <aurelien@aurel32.net>

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2013-08-28 21:02 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-17 23:26 [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson
2013-08-17 23:26 ` [Qemu-devel] [PATCH 1/4] " Richard Henderson
2013-08-28 20:59   ` Aurelien Jarno
2013-08-17 23:26 ` [Qemu-devel] [PATCH 2/4] tcg-mips: Implement mulsh, muluh Richard Henderson
2013-08-28 20:59   ` Aurelien Jarno
2013-08-17 23:26 ` [Qemu-devel] [PATCH 3/4] tcg-ppc64: Implement muluh, mulsh Richard Henderson
2013-08-28 21:00   ` Aurelien Jarno
2013-08-17 23:26 ` [Qemu-devel] [PATCH 4/4] tcg: Constant fold div, rem Richard Henderson
2013-08-28 21:02   ` Aurelien Jarno
2013-08-27 21:48 ` [Qemu-devel] [PATCH 0/4] tcg: Add muluh and mulsh opcodes Richard Henderson

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.