All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
@ 2010-04-13 22:23 ` Richard Henderson
  2010-04-13 22:26 ` [Qemu-devel] [PATCH 02/21] tcg-i386: Tidy initialization of tcg_target_call_clobber_regs Richard Henderson
                   ` (19 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 22:23 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index e684b33..f5c24f7 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -36,16 +36,21 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #endif
 
 static const int tcg_target_reg_alloc_order[] = {
-    TCG_REG_EAX,
-    TCG_REG_EDX,
-    TCG_REG_ECX,
     TCG_REG_EBX,
     TCG_REG_ESI,
     TCG_REG_EDI,
     TCG_REG_EBP,
+    TCG_REG_ECX,
+    TCG_REG_EDX,
+    TCG_REG_EAX,
+};
+
+static const int tcg_target_call_iarg_regs[3] = {
+    TCG_REG_EAX,
+    TCG_REG_EDX,
+    TCG_REG_ECX
 };
 
-static const int tcg_target_call_iarg_regs[3] = { TCG_REG_EAX, TCG_REG_EDX, TCG_REG_ECX };
 static const int tcg_target_call_oarg_regs[2] = { TCG_REG_EAX, TCG_REG_EDX };
 
 static uint8_t *tb_ret_addr;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 02/21] tcg-i386: Tidy initialization of tcg_target_call_clobber_regs.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
  2010-04-13 22:23 ` [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first Richard Henderson
@ 2010-04-13 22:26 ` Richard Henderson
  2010-04-13 22:59 ` [Qemu-devel] [PATCH 03/21] tcg-i386: Tidy ext8u and ext16u operations Richard Henderson
                   ` (18 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 22:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Setting the registers one by one is easier to read, and gets
optimized by the compiler just the same.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index f5c24f7..359f81b 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1365,11 +1365,12 @@ void tcg_target_init(TCGContext *s)
 #endif
 
     tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
-    tcg_regset_set32(tcg_target_call_clobber_regs, 0,
-                     (1 << TCG_REG_EAX) | 
-                     (1 << TCG_REG_EDX) | 
-                     (1 << TCG_REG_ECX));
-    
+
+    tcg_regset_clear(tcg_target_call_clobber_regs);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
+
     tcg_regset_clear(s->reserved_regs);
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_ESP);
 
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 03/21] tcg-i386: Tidy ext8u and ext16u operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
  2010-04-13 22:23 ` [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first Richard Henderson
  2010-04-13 22:26 ` [Qemu-devel] [PATCH 02/21] tcg-i386: Tidy initialization of tcg_target_call_clobber_regs Richard Henderson
@ 2010-04-13 22:59 ` Richard Henderson
  2010-04-13 23:13 ` [Qemu-devel] [PATCH 04/21] tcg-i386: Tidy ext8s and ext16s operations Richard Henderson
                   ` (17 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 22:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_MOVZBL and OPC_MOVZWL.  Factor opcode emission into
separate functions.  Don't restrict the input register to the
low 4 "q" registers; when neither the source nor the destination
is one of them, emit an AND with 0xff instead of movzbl.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   68 ++++++++++++++++++++++++++++++------------------
 1 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 359f81b..2cc1191 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -161,6 +161,11 @@ static inline int tcg_target_const_match(tcg_target_long val,
         return 0;
 }
 
+#define P_EXT   0x100 /* 0x0f opcode prefix */
+
+#define OPC_MOVZBL	(0xb6 | P_EXT)
+#define OPC_MOVZWL	(0xb7 | P_EXT)
+
 #define ARITH_ADD 0
 #define ARITH_OR  1
 #define ARITH_ADC 2
@@ -194,8 +199,6 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define JCC_JLE 0xe
 #define JCC_JG  0xf
 
-#define P_EXT   0x100 /* 0x0f opcode prefix */
-
 static const uint8_t tcg_cond_to_jcc[10] = {
     [TCG_COND_EQ] = JCC_JE,
     [TCG_COND_NE] = JCC_JNE,
@@ -288,6 +291,27 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
     tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
 }
 
+static void tcg_out_ext8u(TCGContext *s, int dest, int src)
+{
+    if (src >= 4) {
+        tcg_out_mov(s, dest, src);
+        if (dest >= 4) {
+            tcg_out_modrm(s, 0x81, ARITH_AND, dest);
+            tcg_out32(s, 0xff);
+            return;
+        }
+        src = dest;
+    }
+    /* movzbl */
+    tcg_out_modrm(s, OPC_MOVZBL, dest, src);
+}
+
+static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
+{
+    /* movzwl */
+    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -300,11 +324,9 @@ static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf
         tcg_out_modrm(s, 0x83, c, r0);
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu && r0 < 4) {
-        /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT, r0, r0);
+        tcg_out_ext8u(s, r0, r0);
     } else if (c == ARITH_AND && val == 0xffffu) {
-        /* movzwl */
-        tcg_out_modrm(s, 0xb7 | P_EXT, r0, r0);
+        tcg_out_ext16u(s, r0, r0);
     } else {
         tcg_out_modrm(s, 0x81, c, r0);
         tcg_out32(s, val);
@@ -645,12 +667,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         tcg_out_modrm(s, 0xbf | P_EXT, data_reg, TCG_REG_EAX);
         break;
     case 0:
-        /* movzbl */
-        tcg_out_modrm(s, 0xb6 | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
         break;
     case 1:
-        /* movzwl */
-        tcg_out_modrm(s, 0xb7 | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
         break;
     case 2:
     default:
@@ -690,7 +710,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     switch(opc) {
     case 0:
         /* movzbl */
-        tcg_out_modrm_offset(s, 0xb6 | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVZBL, data_reg, r0, GUEST_BASE);
         break;
     case 0 | 4:
         /* movsbl */
@@ -698,7 +718,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         break;
     case 1:
         /* movzwl */
-        tcg_out_modrm_offset(s, 0xb7 | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
             /* rolw $8, data_reg */
             tcg_out8(s, 0x66); 
@@ -850,12 +870,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     } else {
         switch(opc) {
         case 0:
-            /* movzbl */
-            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_EDX, data_reg);
+            tcg_out_ext8u(s, TCG_REG_EDX, data_reg);
             break;
         case 1:
-            /* movzwl */
-            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_EDX, data_reg);
+            tcg_out_ext16u(s, TCG_REG_EDX, data_reg);
             break;
         case 2:
             tcg_out_mov(s, TCG_REG_EDX, data_reg);
@@ -881,12 +899,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
         switch(opc) {
         case 0:
-            /* movzbl */
-            tcg_out_modrm(s, 0xb6 | P_EXT, TCG_REG_ECX, data_reg);
+            tcg_out_ext8u(s, TCG_REG_ECX, data_reg);
             break;
         case 1:
-            /* movzwl */
-            tcg_out_modrm(s, 0xb7 | P_EXT, TCG_REG_ECX, data_reg);
+            tcg_out_ext16u(s, TCG_REG_ECX, data_reg);
             break;
         case 2:
             tcg_out_mov(s, TCG_REG_ECX, data_reg);
@@ -1022,7 +1038,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld8u_i32:
         /* movzbl */
-        tcg_out_modrm_offset(s, 0xb6 | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVZBL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld8s_i32:
         /* movsbl */
@@ -1030,7 +1046,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld16u_i32:
         /* movzwl */
-        tcg_out_modrm_offset(s, 0xb7 | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVZWL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld16s_i32:
         /* movswl */
@@ -1177,10 +1193,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
         break;
     case INDEX_op_ext8u_i32:
-        tcg_out_modrm(s, 0xb6 | P_EXT, args[0], args[1]);
+        tcg_out_ext8u(s, args[0], args[1]);
         break;
     case INDEX_op_ext16u_i32:
-        tcg_out_modrm(s, 0xb7 | P_EXT, args[0], args[1]);
+        tcg_out_ext16u(s, args[0], args[1]);
         break;
 
     case INDEX_op_setcond_i32:
@@ -1275,8 +1291,8 @@ static const TCGTargetOpDef x86_op_defs[] = {
 
     { INDEX_op_ext8s_i32, { "r", "q" } },
     { INDEX_op_ext16s_i32, { "r", "r" } },
-    { INDEX_op_ext8u_i32, { "r", "q"} },
-    { INDEX_op_ext16u_i32, { "r", "r"} },
+    { INDEX_op_ext8u_i32, { "r", "r" } },
+    { INDEX_op_ext16u_i32, { "r", "r" } },
 
     { INDEX_op_setcond_i32, { "q", "r", "ri" } },
     { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 04/21] tcg-i386: Tidy ext8s and ext16s operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (2 preceding siblings ...)
  2010-04-13 22:59 ` [Qemu-devel] [PATCH 03/21] tcg-i386: Tidy ext8u and ext16u operations Richard Henderson
@ 2010-04-13 23:13 ` Richard Henderson
  2010-04-13 23:33 ` [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations Richard Henderson
                   ` (16 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 23:13 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_MOVSBL and OPC_MOVSWL.  Factor opcode emission into
separate functions.  Don't restrict the input register to the
low 4 "q" registers; when neither the source nor the destination
is one of them, sign-extend with a shl/sar pair by 24 instead.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   49 +++++++++++++++++++++++++++++++++++++------------
 1 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 2cc1191..75b9915 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -165,6 +165,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
+#define OPC_MOVSBL	(0xbe | P_EXT)
+#define OPC_MOVSWL	(0xbf | P_EXT)
 
 #define ARITH_ADD 0
 #define ARITH_OR  1
@@ -306,12 +308,37 @@ static void tcg_out_ext8u(TCGContext *s, int dest, int src)
     tcg_out_modrm(s, OPC_MOVZBL, dest, src);
 }
 
+static void tcg_out_ext8s(TCGContext *s, int dest, int src)
+{
+    if (src >= 4) {
+        tcg_out_mov(s, dest, src);
+        if (dest >= 4) {
+            /* shl $24, dest */
+            tcg_out_modrm(s, 0xc1, SHIFT_SHL, dest);
+            tcg_out8(s, 24);
+            /* sar $24, dest */
+            tcg_out_modrm(s, 0xc1, SHIFT_SAR, dest);
+            tcg_out8(s, 24);
+            return;
+        }
+        src = dest;
+    }
+    /* movsbl */
+    tcg_out_modrm(s, OPC_MOVSBL, dest, src);
+}
+
 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
 {
     /* movzwl */
     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
 }
 
+static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
+{
+    /* movswl */
+    tcg_out_modrm(s, OPC_MOVSWL, dest, src);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -659,12 +686,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
     switch(opc) {
     case 0 | 4:
-        /* movsbl */
-        tcg_out_modrm(s, 0xbe | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX);
         break;
     case 1 | 4:
-        /* movswl */
-        tcg_out_modrm(s, 0xbf | P_EXT, data_reg, TCG_REG_EAX);
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX);
         break;
     case 0:
         tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
@@ -714,7 +739,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         break;
     case 0 | 4:
         /* movsbl */
-        tcg_out_modrm_offset(s, 0xbe | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVSBL, data_reg, r0, GUEST_BASE);
         break;
     case 1:
         /* movzwl */
@@ -728,7 +753,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         break;
     case 1 | 4:
         /* movswl */
-        tcg_out_modrm_offset(s, 0xbf | P_EXT, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
             /* rolw $8, data_reg */
             tcg_out8(s, 0x66); 
@@ -736,7 +761,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             tcg_out8(s, 8);
 
             /* movswl data_reg, data_reg */
-            tcg_out_modrm(s, 0xbf | P_EXT, data_reg, data_reg);
+            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
         }
         break;
     case 2:
@@ -1042,7 +1067,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld8s_i32:
         /* movsbl */
-        tcg_out_modrm_offset(s, 0xbe | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVSBL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld16u_i32:
         /* movzwl */
@@ -1050,7 +1075,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_ld16s_i32:
         /* movswl */
-        tcg_out_modrm_offset(s, 0xbf | P_EXT, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVSWL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld_i32:
         /* movl */
@@ -1187,10 +1212,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_ext8s_i32:
-        tcg_out_modrm(s, 0xbe | P_EXT, args[0], args[1]);
+        tcg_out_ext8s(s, args[0], args[1]);
         break;
     case INDEX_op_ext16s_i32:
-        tcg_out_modrm(s, 0xbf | P_EXT, args[0], args[1]);
+        tcg_out_ext16s(s, args[0], args[1]);
         break;
     case INDEX_op_ext8u_i32:
         tcg_out_ext8u(s, args[0], args[1]);
@@ -1289,7 +1314,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
 
     { INDEX_op_not_i32, { "r", "0" } },
 
-    { INDEX_op_ext8s_i32, { "r", "q" } },
+    { INDEX_op_ext8s_i32, { "r", "r" } },
     { INDEX_op_ext16s_i32, { "r", "r" } },
     { INDEX_op_ext8u_i32, { "r", "r" } },
     { INDEX_op_ext16u_i32, { "r", "r" } },
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (3 preceding siblings ...)
  2010-04-13 23:13 ` [Qemu-devel] [PATCH 04/21] tcg-i386: Tidy ext8s and ext16s operations Richard Henderson
@ 2010-04-13 23:33 ` Richard Henderson
  2010-04-18 22:13   ` Aurelien Jarno
  2010-04-13 23:44 ` [Qemu-devel] [PATCH 06/21] tcg-i386: Tidy shift operations Richard Henderson
                   ` (15 subsequent siblings)
  20 siblings, 1 reply; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 23:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_BSWAP.  Factor opcode emission into separate functions.
Use bswap+shift to implement the 16-bit swap instead of rolw; this
yields the zero-extension required by INDEX_op_bswap16_i32.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
 1 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 75b9915..0bafd00 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -163,6 +163,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define P_EXT   0x100 /* 0x0f opcode prefix */
 
+#define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
@@ -339,6 +340,22 @@ static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
     tcg_out_modrm(s, OPC_MOVSWL, dest, src);
 }
 
+static inline void tcg_out_bswap32(TCGContext *s, int reg)
+{
+    tcg_out_opc(s, OPC_BSWAP + reg);
+}
+
+static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
+{
+    /* This swap+shift combination guarantees that the high part contains
+       the sign or zero extension required.  It also doesn't suffer the
+       problem of partial register stalls that using rolw does.  */
+    tcg_out_bswap32(s, reg);
+    /* shr $16, dest */
+    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
+    tcg_out8(s, 16);
+}
+
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
 {
     if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
@@ -745,31 +762,21 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         /* movzwl */
         tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66); 
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
+            tcg_out_bswap16(s, data_reg, 0);
         }
         break;
     case 1 | 4:
         /* movswl */
         tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* rolw $8, data_reg */
-            tcg_out8(s, 0x66); 
-            tcg_out_modrm(s, 0xc1, 0, data_reg);
-            tcg_out8(s, 8);
-
-            /* movswl data_reg, data_reg */
-            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
+            tcg_out_bswap16(s, data_reg, 1);
         }
         break;
     case 2:
         /* movl (r0), data_reg */
         tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
         if (bswap) {
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
         }
         break;
     case 3:
@@ -786,11 +793,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
-            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
+            tcg_out_bswap32(s, data_reg);
 
             tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
-            /* bswap */
-            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
+            tcg_out_bswap32(s, data_reg2);
         }
         break;
     default:
@@ -982,8 +988,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 2:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             data_reg = r1;
         }
         /* movl */
@@ -992,12 +997,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     case 3:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg2);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
             tcg_out_mov(s, r1, data_reg);
-            /* bswap data_reg */
-            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
+            tcg_out_bswap32(s, r1);
             tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
         } else {
             tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
@@ -1195,12 +1198,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_bswap16_i32:
-        tcg_out8(s, 0x66);
-        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
-        tcg_out8(s, 8);
+        tcg_out_bswap16(s, args[0], 0);
         break;
     case INDEX_op_bswap32_i32:
-        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
+        tcg_out_bswap32(s, args[0]);
         break;
 
     case INDEX_op_neg_i32:
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 06/21] tcg-i386: Tidy shift operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (4 preceding siblings ...)
  2010-04-13 23:33 ` [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations Richard Henderson
@ 2010-04-13 23:44 ` Richard Henderson
  2010-04-14 14:58 ` [Qemu-devel] [PATCH 07/21] tcg-i386: Tidy move operations Richard Henderson
                   ` (14 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-13 23:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_SHIFT_{1,Ib,cl}.  Factor emission of shifts by an
immediate count into a function, tcg_out_shifti.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   47 +++++++++++++++++++++++------------------------
 1 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 0bafd00..2df45bf 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -168,6 +168,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
+#define OPC_SHIFT_1	(0xd1)
+#define OPC_SHIFT_Ib	(0xc1)
+#define OPC_SHIFT_cl	(0xd3)
 
 #define ARITH_ADD 0
 #define ARITH_OR  1
@@ -294,6 +297,16 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
     tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
 }
 
+static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
+{
+    if (count == 1) {
+        tcg_out_modrm(s, OPC_SHIFT_1, subopc, reg);
+    } else {
+        tcg_out_modrm(s, OPC_SHIFT_Ib, subopc, reg);
+        tcg_out8(s, count);
+    }
+}
+
 static void tcg_out_ext8u(TCGContext *s, int dest, int src)
 {
     if (src >= 4) {
@@ -314,12 +327,8 @@ static void tcg_out_ext8s(TCGContext *s, int dest, int src)
     if (src >= 4) {
         tcg_out_mov(s, dest, src);
         if (dest >= 4) {
-            /* shl $24, dest */
-            tcg_out_modrm(s, 0xc1, SHIFT_SHL, dest);
-            tcg_out8(s, 24);
-            /* sar $24, dest */
-            tcg_out_modrm(s, 0xc1, SHIFT_SAR, dest);
-            tcg_out8(s, 24);
+            tcg_out_shifti(s, SHIFT_SHL, dest, 24);
+            tcg_out_shifti(s, SHIFT_SAR, dest, 24);
             return;
         }
         src = dest;
@@ -351,9 +360,7 @@ static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
        the sign or zero extension required.  It also doesn't suffer the
        problem of partial register stalls that using rolw does.  */
     tcg_out_bswap32(s, reg);
-    /* shr $16, dest */
-    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
-    tcg_out8(s, 16);
+    tcg_out_shifti(s, (sign ? SHIFT_SAR : SHIFT_SHR), reg, 16);
 }
 
 static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
@@ -648,9 +655,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
     tcg_out_mov(s, r0, addr_reg); 
  
-    tcg_out_modrm(s, 0xc1, 5, r1); /* shr $x, r1 */
-    tcg_out8(s, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
-    
+    tcg_out_shifti(s, SHIFT_SHR, r1, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
+
     tcg_out_modrm(s, 0x81, 4, r0); /* andl $x, r0 */
     tcg_out32(s, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
     
@@ -845,9 +851,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 
     tcg_out_mov(s, r0, addr_reg); 
  
-    tcg_out_modrm(s, 0xc1, 5, r1); /* shr $x, r1 */
-    tcg_out8(s, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
-    
+    tcg_out_shifti(s, SHIFT_SHR, r1, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
+
     tcg_out_modrm(s, 0x81, 4, r0); /* andl $x, r0 */
     tcg_out32(s, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
     
@@ -977,8 +982,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         if (bswap) {
             tcg_out_mov(s, r1, data_reg);
             tcg_out8(s, 0x66); /* rolw $8, %ecx */
-            tcg_out_modrm(s, 0xc1, 0, r1);
-            tcg_out8(s, 8);
+            tcg_out_shifti(s, SHIFT_ROL, r1, 8);
             data_reg = r1;
         }
         /* movw */
@@ -1146,14 +1150,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         c = SHIFT_SHL;
     gen_shift32:
         if (const_args[2]) {
-            if (args[2] == 1) {
-                tcg_out_modrm(s, 0xd1, c, args[0]);
-            } else {
-                tcg_out_modrm(s, 0xc1, c, args[0]);
-                tcg_out8(s, args[2]);
-            }
+            tcg_out_shifti(s, c, args[0], args[2]);
         } else {
-            tcg_out_modrm(s, 0xd3, c, args[0]);
+            tcg_out_modrm(s, OPC_SHIFT_cl, c, args[0]);
         }
         break;
     case INDEX_op_shr_i32:
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 07/21] tcg-i386: Tidy move operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (5 preceding siblings ...)
  2010-04-13 23:44 ` [Qemu-devel] [PATCH 06/21] tcg-i386: Tidy shift operations Richard Henderson
@ 2010-04-14 14:58 ` Richard Henderson
  2010-04-14 15:06 ` [Qemu-devel] [PATCH 08/21] tcg-i386: Eliminate extra move from qemu_ld64 Richard Henderson
                   ` (13 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 14:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_MOVB* and OPC_MOVL*; use them throughout.
Use tcg_out_ld/st instead of bare tcg_out_modrm_offset
when it makes sense.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   51 +++++++++++++++++++++++-------------------------
 1 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 2df45bf..4f7df70 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -164,6 +164,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define P_EXT   0x100 /* 0x0f opcode prefix */
 
 #define OPC_BSWAP	(0xc8 | P_EXT)
+#define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
+#define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
+#define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
@@ -267,8 +270,9 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm,
 
 static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
 {
-    if (arg != ret)
-        tcg_out_modrm(s, 0x8b, ret, arg);
+    if (arg != ret) {
+        tcg_out_modrm(s, OPC_MOVL_GvEv, ret, arg);
+    }
 }
 
 static inline void tcg_out_movi(TCGContext *s, TCGType type,
@@ -286,15 +290,13 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
 static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
                               int arg1, tcg_target_long arg2)
 {
-    /* movl */
-    tcg_out_modrm_offset(s, 0x8b, ret, arg1, arg2);
+    tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
 }
 
 static inline void tcg_out_st(TCGContext *s, TCGType type, int arg,
                               int arg1, tcg_target_long arg2)
 {
-    /* movl */
-    tcg_out_modrm_offset(s, 0x89, arg, arg1, arg2);
+    tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
 }
 
 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
@@ -779,8 +781,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         }
         break;
     case 2:
-        /* movl (r0), data_reg */
-        tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
+        tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
         if (bswap) {
             tcg_out_bswap32(s, data_reg);
         }
@@ -795,13 +796,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
             r0 = r1;
         }
         if (!bswap) {
-            tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
-            tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE + 4);
         } else {
-            tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE + 4);
             tcg_out_bswap32(s, data_reg);
 
-            tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE);
             tcg_out_bswap32(s, data_reg2);
         }
         break;
@@ -975,8 +976,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #endif
     switch(opc) {
     case 0:
-        /* movb */
-        tcg_out_modrm_offset(s, 0x88, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVB_EvGv, data_reg, r0, GUEST_BASE);
         break;
     case 1:
         if (bswap) {
@@ -987,7 +987,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         }
         /* movw */
         tcg_out8(s, 0x66);
-        tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, data_reg, r0, GUEST_BASE);
         break;
     case 2:
         if (bswap) {
@@ -995,20 +995,19 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
             tcg_out_bswap32(s, r1);
             data_reg = r1;
         }
-        /* movl */
-        tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
+        tcg_out_st(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
         break;
     case 3:
         if (bswap) {
             tcg_out_mov(s, r1, data_reg2);
             tcg_out_bswap32(s, r1);
-            tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
+            tcg_out_st(s, TCG_TYPE_I32, r1, r0, GUEST_BASE);
             tcg_out_mov(s, r1, data_reg);
             tcg_out_bswap32(s, r1);
-            tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
+            tcg_out_st(s, TCG_TYPE_I32, r1, r0, GUEST_BASE + 4);
         } else {
-            tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
-            tcg_out_modrm_offset(s, 0x89, data_reg2, r0, GUEST_BASE + 4);
+            tcg_out_st(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
+            tcg_out_st(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE + 4);
         }
         break;
     default:
@@ -1085,21 +1084,19 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_modrm_offset(s, OPC_MOVSWL, args[0], args[1], args[2]);
         break;
     case INDEX_op_ld_i32:
-        /* movl */
-        tcg_out_modrm_offset(s, 0x8b, args[0], args[1], args[2]);
+        tcg_out_ld(s, TCG_TYPE_I32, args[0], args[1], args[2]);
         break;
     case INDEX_op_st8_i32:
         /* movb */
-        tcg_out_modrm_offset(s, 0x88, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVB_EvGv, args[0], args[1], args[2]);
         break;
     case INDEX_op_st16_i32:
         /* movw */
         tcg_out8(s, 0x66);
-        tcg_out_modrm_offset(s, 0x89, args[0], args[1], args[2]);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, args[0], args[1], args[2]);
         break;
     case INDEX_op_st_i32:
-        /* movl */
-        tcg_out_modrm_offset(s, 0x89, args[0], args[1], args[2]);
+        tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
         break;
     case INDEX_op_sub_i32:
         c = ARITH_SUB;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 08/21] tcg-i386: Eliminate extra move from qemu_ld64.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (6 preceding siblings ...)
  2010-04-14 14:58 ` [Qemu-devel] [PATCH 07/21] tcg-i386: Tidy move operations Richard Henderson
@ 2010-04-14 15:06 ` Richard Henderson
  2010-04-14 15:26 ` [Qemu-devel] [PATCH 09/21] tcg-i386: Tidy jumps Richard Henderson
                   ` (12 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 15:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

If the address register overlaps one of the output registers
simply issue the clobbering load last, rather than emitting
an extra move of the address register.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   20 +++++++++-----------
 1 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4f7df70..5829c5b 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -787,22 +787,20 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         }
         break;
     case 3:
-        /* XXX: could be nicer */
-        if (r0 == data_reg) {
-            r1 = TCG_REG_EDX;
-            if (r1 == data_reg)
-                r1 = TCG_REG_EAX;
-            tcg_out_mov(s, r1, r0);
-            r0 = r1;
+        if (bswap) {
+            int t = data_reg;
+            data_reg = data_reg2;
+            data_reg2 = t;
         }
-        if (!bswap) {
+        if (r0 != data_reg) {
             tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
             tcg_out_ld(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE + 4);
         } else {
-            tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE + 4);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE + 4);
+            tcg_out_ld(s, TCG_TYPE_I32, data_reg, r0, GUEST_BASE);
+        }
+        if (bswap) {
             tcg_out_bswap32(s, data_reg);
-
-            tcg_out_ld(s, TCG_TYPE_I32, data_reg2, r0, GUEST_BASE);
             tcg_out_bswap32(s, data_reg2);
         }
         break;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 09/21] tcg-i386: Tidy jumps.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (7 preceding siblings ...)
  2010-04-14 15:06 ` [Qemu-devel] [PATCH 08/21] tcg-i386: Eliminate extra move from qemu_ld64 Richard Henderson
@ 2010-04-14 15:26 ` Richard Henderson
  2010-04-14 15:38 ` [Qemu-devel] [PATCH 10/21] tcg-i386: Tidy immediate arithmetic operations Richard Henderson
                   ` (11 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 15:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_JCC*, OPC_JMP*, and EXT_JMPN_Ev.  Use them throughout.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   58 +++++++++++++++++++++++++++---------------------
 1 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 5829c5b..9d728f5 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -164,6 +164,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define P_EXT   0x100 /* 0x0f opcode prefix */
 
 #define OPC_BSWAP	(0xc8 | P_EXT)
+#define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
+#define OPC_JCC_short	(0x70)		/* ... plus condition code */
+#define OPC_JMP_long	(0xe9)
+#define OPC_JMP_short	(0xeb)
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
@@ -175,6 +179,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
 
+/* Group 1 opcode extensions for 0x80-0x83.  */
 #define ARITH_ADD 0
 #define ARITH_OR  1
 #define ARITH_ADC 2
@@ -184,12 +189,17 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define ARITH_XOR 6
 #define ARITH_CMP 7
 
+/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 #define SHIFT_ROL 0
 #define SHIFT_ROR 1
 #define SHIFT_SHL 4
 #define SHIFT_SHR 5
 #define SHIFT_SAR 7
 
+/* Group 5 opcode extensions for 0xff.  */
+#define EXT_JMPN_Ev	4
+
+/* Condition codes to be added to OPC_JCC_{long,short}.  */
 #define JCC_JMP (-1)
 #define JCC_JO  0x0
 #define JCC_JNO 0x1
@@ -403,9 +413,9 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small)
         val1 = val - 2;
         if ((int8_t)val1 == val1) {
             if (opc == -1) {
-                tcg_out8(s, 0xeb);
+                tcg_out8(s, OPC_JMP_short);
             } else {
-                tcg_out8(s, 0x70 + opc);
+                tcg_out8(s, OPC_JCC_short + opc);
             }
             tcg_out8(s, val1);
         } else {
@@ -413,28 +423,26 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small)
                 tcg_abort();
             }
             if (opc == -1) {
-                tcg_out8(s, 0xe9);
+                tcg_out8(s, OPC_JMP_long);
                 tcg_out32(s, val - 5);
             } else {
-                tcg_out8(s, 0x0f);
-                tcg_out8(s, 0x80 + opc);
+                tcg_out_opc(s, OPC_JCC_long + opc);
                 tcg_out32(s, val - 6);
             }
         }
     } else if (small) {
         if (opc == -1) {
-            tcg_out8(s, 0xeb);
+            tcg_out8(s, OPC_JMP_short);
         } else {
-            tcg_out8(s, 0x70 + opc);
+            tcg_out8(s, OPC_JCC_short + opc);
         }
         tcg_out_reloc(s, s->code_ptr, R_386_PC8, label_index, -1);
         s->code_ptr += 1;
     } else {
         if (opc == -1) {
-            tcg_out8(s, 0xe9);
+            tcg_out8(s, OPC_JMP_long);
         } else {
-            tcg_out8(s, 0x0f);
-            tcg_out8(s, 0x80 + opc);
+            tcg_out_opc(s, OPC_JCC_long + opc);
         }
         tcg_out_reloc(s, s->code_ptr, R_386_PC32, label_index, -4);
         s->code_ptr += 4;
@@ -677,12 +685,12 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     
 #if TARGET_LONG_BITS == 32
     /* je label1 */
-    tcg_out8(s, 0x70 + JCC_JE);
+    tcg_out8(s, OPC_JCC_short + JCC_JE);
     label1_ptr = s->code_ptr;
     s->code_ptr++;
 #else
     /* jne label3 */
-    tcg_out8(s, 0x70 + JCC_JNE);
+    tcg_out8(s, OPC_JCC_short + JCC_JNE);
     label3_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -690,7 +698,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
 
     /* je label1 */
-    tcg_out8(s, 0x70 + JCC_JE);
+    tcg_out8(s, OPC_JCC_short + JCC_JE);
     label1_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -738,7 +746,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     }
 
     /* jmp label2 */
-    tcg_out8(s, 0xeb);
+    tcg_out8(s, OPC_JMP_short);
     label2_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -870,12 +878,12 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     
 #if TARGET_LONG_BITS == 32
     /* je label1 */
-    tcg_out8(s, 0x70 + JCC_JE);
+    tcg_out8(s, OPC_JCC_short + JCC_JE);
     label1_ptr = s->code_ptr;
     s->code_ptr++;
 #else
     /* jne label3 */
-    tcg_out8(s, 0x70 + JCC_JNE);
+    tcg_out8(s, OPC_JCC_short + JCC_JNE);
     label3_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -883,7 +891,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
 
     /* je label1 */
-    tcg_out8(s, 0x70 + JCC_JE);
+    tcg_out8(s, OPC_JCC_short + JCC_JE);
     label1_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -953,7 +961,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #endif
     
     /* jmp label2 */
-    tcg_out8(s, 0xeb);
+    tcg_out8(s, OPC_JMP_short);
     label2_ptr = s->code_ptr;
     s->code_ptr++;
     
@@ -1026,19 +1034,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     switch(opc) {
     case INDEX_op_exit_tb:
         tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_EAX, args[0]);
-        tcg_out8(s, 0xe9); /* jmp tb_ret_addr */
+        tcg_out8(s, OPC_JMP_long); /* jmp tb_ret_addr */
         tcg_out32(s, tb_ret_addr - s->code_ptr - 4);
         break;
     case INDEX_op_goto_tb:
         if (s->tb_jmp_offset) {
             /* direct jump method */
-            tcg_out8(s, 0xe9); /* jmp im */
+            tcg_out8(s, OPC_JMP_long); /* jmp im */
             s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf;
             tcg_out32(s, 0);
         } else {
             /* indirect jump method */
-            /* jmp Ev */
-            tcg_out_modrm_offset(s, 0xff, 4, -1, 
+            tcg_out_modrm_offset(s, 0xff, EXT_JMPN_Ev, -1, 
                                  (tcg_target_long)(s->tb_next + args[0]));
         }
         s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
@@ -1053,10 +1060,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_jmp:
         if (const_args[0]) {
-            tcg_out8(s, 0xe9);
+            tcg_out8(s, OPC_JMP_long);
             tcg_out32(s, args[0] - (tcg_target_long)s->code_ptr - 4);
         } else {
-            tcg_out_modrm(s, 0xff, 4, args[0]);
+            /* jmp *reg */
+            tcg_out_modrm(s, 0xff, EXT_JMPN_Ev, args[0]);
         }
         break;
     case INDEX_op_br:
@@ -1381,7 +1389,7 @@ void tcg_target_qemu_prologue(TCGContext *s)
     stack_addend = frame_size - push_size;
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
 
-    tcg_out_modrm(s, 0xff, 4, TCG_REG_EAX); /* jmp *%eax */
+    tcg_out_modrm(s, 0xff, EXT_JMPN_Ev, TCG_REG_EAX); /* jmp *%eax */
     
     /* TB epilogue */
     tb_ret_addr = s->code_ptr;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 10/21] tcg-i386: Tidy immediate arithmetic operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (8 preceding siblings ...)
  2010-04-14 15:26 ` [Qemu-devel] [PATCH 09/21] tcg-i386: Tidy jumps Richard Henderson
@ 2010-04-14 15:38 ` Richard Henderson
  2010-04-14 17:16 ` [Qemu-devel] [PATCH 11/21] tcg-i386: Tidy non-immediate " Richard Henderson
                   ` (10 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 15:38 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_ARITH_EvI[bz]; use throughout.  Use tcg_out_ext8u
directly in setcond.  Use tgen_arithi in qemu_ld/st.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   28 +++++++++++-----------------
 1 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 9d728f5..fb553f4 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -163,6 +163,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define P_EXT   0x100 /* 0x0f opcode prefix */
 
+#define OPC_ARITH_EvIz	(0x81)
+#define OPC_ARITH_EvIb	(0x83)
 #define OPC_BSWAP	(0xc8 | P_EXT)
 #define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
 #define OPC_JCC_short	(0x70)		/* ... plus condition code */
@@ -324,7 +326,7 @@ static void tcg_out_ext8u(TCGContext *s, int dest, int src)
     if (src >= 4) {
         tcg_out_mov(s, dest, src);
         if (dest >= 4) {
-            tcg_out_modrm(s, 0x81, ARITH_AND, dest);
+            tcg_out_modrm(s, OPC_ARITH_EvIz, ARITH_AND, dest);
             tcg_out32(s, 0xff);
             return;
         }
@@ -384,14 +386,14 @@ static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf
         /* dec */
         tcg_out_opc(s, 0x48 + r0);
     } else if (val == (int8_t)val) {
-        tcg_out_modrm(s, 0x83, c, r0);
+        tcg_out_modrm(s, OPC_ARITH_EvIb, c, r0);
         tcg_out8(s, val);
     } else if (c == ARITH_AND && val == 0xffu && r0 < 4) {
         tcg_out_ext8u(s, r0, r0);
     } else if (c == ARITH_AND && val == 0xffffu) {
         tcg_out_ext16u(s, r0, r0);
     } else {
-        tcg_out_modrm(s, 0x81, c, r0);
+        tcg_out_modrm(s, OPC_ARITH_EvIz, c, r0);
         tcg_out32(s, val);
     }
 }
@@ -560,7 +562,7 @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGArg dest,
     tcg_out_cmp(s, arg1, arg2, const_arg2);
     /* setcc */
     tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT, 0, dest);
-    tgen_arithi(s, ARITH_AND, dest, 0xff, 0);
+    tcg_out_ext8u(s, dest, dest);
 }
 
 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
@@ -662,16 +664,12 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_mov(s, r1, addr_reg); 
-
     tcg_out_mov(s, r0, addr_reg); 
- 
+
     tcg_out_shifti(s, SHIFT_SHR, r1, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
 
-    tcg_out_modrm(s, 0x81, 4, r0); /* andl $x, r0 */
-    tcg_out32(s, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-    
-    tcg_out_modrm(s, 0x81, 4, r1); /* andl $x, r1 */
-    tcg_out32(s, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+    tgen_arithi(s, ARITH_AND, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
+    tgen_arithi(s, ARITH_AND, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
     tcg_out_opc(s, 0x8d); /* lea offset(r1, %ebp), r1 */
     tcg_out8(s, 0x80 | (r1 << 3) | 0x04);
@@ -855,16 +853,12 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 
 #if defined(CONFIG_SOFTMMU)
     tcg_out_mov(s, r1, addr_reg); 
-
     tcg_out_mov(s, r0, addr_reg); 
  
     tcg_out_shifti(s, SHIFT_SHR, r1, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); 
 
-    tcg_out_modrm(s, 0x81, 4, r0); /* andl $x, r0 */
-    tcg_out32(s, TARGET_PAGE_MASK | ((1 << s_bits) - 1));
-    
-    tcg_out_modrm(s, 0x81, 4, r1); /* andl $x, r1 */
-    tcg_out32(s, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS);
+    tgen_arithi(s, ARITH_AND, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
+    tgen_arithi(s, ARITH_AND, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
     tcg_out_opc(s, 0x8d); /* lea offset(r1, %ebp), r1 */
     tcg_out8(s, 0x80 | (r1 << 3) | 0x04);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 11/21] tcg-i386: Tidy non-immediate arithmetic operations.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (9 preceding siblings ...)
  2010-04-14 15:38 ` [Qemu-devel] [PATCH 10/21] tcg-i386: Tidy immediate arithmetic operations Richard Henderson
@ 2010-04-14 17:16 ` Richard Henderson
  2010-04-14 17:20 ` [Qemu-devel] [PATCH 12/21] tcg-i386: Tidy movi Richard Henderson
                   ` (9 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 17:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Add more OPC values, and tgen_arithr.  Use the latter throughout.

Note that normal reg/reg arithmetic now uses the Gv,Ev opcode form
instead of the Ev,Gv opcode form used previously.  Both forms
disassemble properly, and so there's no visible change when diffing
log files before and after the change.  This change makes the operand
ordering within the output routines more natural, and avoids the need
to define an OPC_ARITH_EvGv since a read-modify-write with memory is
not needed within TCG.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   78 ++++++++++++++++++++++++++++++-------------------
 1 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index fb553f4..1243759 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -165,7 +165,12 @@ static inline int tcg_target_const_match(tcg_target_long val,
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
+#define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
+#define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
+#define OPC_DEC_r32	(0x48)
 #define OPC_BSWAP	(0xc8 | P_EXT)
+#define OPC_INC_r32	(0x40)
 #define OPC_JCC_long	(0x80 | P_EXT)	/* ... plus condition code */
 #define OPC_JCC_short	(0x70)		/* ... plus condition code */
 #define OPC_JMP_long	(0xe9)
@@ -180,6 +185,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
+#define OPC_TESTL	(0x85)
 
 /* Group 1 opcode extensions for 0x80-0x83.  */
 #define ARITH_ADD 0
@@ -280,6 +286,12 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm,
     }
 }
 
+/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
+static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
+{
+    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3), dest, src);
+}
+
 static inline void tcg_out_mov(TCGContext *s, int ret, int arg)
 {
     if (arg != ret) {
@@ -291,8 +303,7 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
                                 int ret, int32_t arg)
 {
     if (arg == 0) {
-        /* xor r0,r0 */
-        tcg_out_modrm(s, 0x01 | (ARITH_XOR << 3), ret, ret);
+        tgen_arithr(s, ARITH_XOR, ret, ret);
     } else {
         tcg_out8(s, 0xb8 + ret);
         tcg_out32(s, arg);
@@ -377,14 +388,15 @@ static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
     tcg_out_shifti(s, (sign ? SHIFT_SAR : SHIFT_SHR), reg, 16);
 }
 
-static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
+static inline void tgen_arithi(TCGContext *s, int c, int r0,
+                               int32_t val, int cf)
 {
-    if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
-        /* inc */
-        tcg_out_opc(s, 0x40 + r0);
-    } else if (!cf && ((c == ARITH_ADD && val == -1) || (c == ARITH_SUB && val == 1))) {
-        /* dec */
-        tcg_out_opc(s, 0x48 + r0);
+    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
+       partial flags update stalls on Pentium4 and are not recommended
+       by current Intel optimization manuals.  */
+    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
+        int opc = ((c == ARITH_ADD) ^ (val < 0) ? OPC_INC_r32 : OPC_DEC_r32);
+        tcg_out_opc(s, opc + r0);
     } else if (val == (int8_t)val) {
         tcg_out_modrm(s, OPC_ARITH_EvIb, c, r0);
         tcg_out8(s, val);
@@ -457,12 +469,12 @@ static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
     if (const_arg2) {
         if (arg2 == 0) {
             /* test r, r */
-            tcg_out_modrm(s, 0x85, arg1, arg1);
+            tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
         } else {
             tgen_arithi(s, ARITH_CMP, arg1, arg2, 0);
         }
     } else {
-        tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3), arg2, arg1);
+        tgen_arithr(s, ARITH_CMP, arg1, arg2);
     }
 }
 
@@ -677,7 +689,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_read));
 
     /* cmp 0(r1), r0 */
-    tcg_out_modrm_offset(s, 0x3b, r0, r1, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
     
     tcg_out_mov(s, r0, addr_reg);
     
@@ -693,7 +705,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     s->code_ptr++;
     
     /* cmp 4(r1), addr_reg2 */
-    tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, addr_reg2, r1, 4);
 
     /* je label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JE);
@@ -752,7 +764,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     *label1_ptr = s->code_ptr - label1_ptr - 1;
 
     /* add x(r1), r0 */
-    tcg_out_modrm_offset(s, 0x03, r0, r1, offsetof(CPUTLBEntry, addend) - 
+    tcg_out_modrm_offset(s, OPC_ADD_GvEv, r0, r1,
+                         offsetof(CPUTLBEntry, addend) - 
                          offsetof(CPUTLBEntry, addr_read));
 #else
     r0 = addr_reg;
@@ -866,7 +879,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_write));
 
     /* cmp 0(r1), r0 */
-    tcg_out_modrm_offset(s, 0x3b, r0, r1, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
     
     tcg_out_mov(s, r0, addr_reg);
     
@@ -882,7 +895,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     s->code_ptr++;
     
     /* cmp 4(r1), addr_reg2 */
-    tcg_out_modrm_offset(s, 0x3b, addr_reg2, r1, 4);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv, addr_reg2, r1, 4);
 
     /* je label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JE);
@@ -963,7 +976,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     *label1_ptr = s->code_ptr - label1_ptr - 1;
 
     /* add x(r1), r0 */
-    tcg_out_modrm_offset(s, 0x03, r0, r1, offsetof(CPUTLBEntry, addend) - 
+    tcg_out_modrm_offset(s, OPC_ADD_GvEv, r0, r1,
+                         offsetof(CPUTLBEntry, addend) - 
                          offsetof(CPUTLBEntry, addr_write));
 #else
     r0 = addr_reg;
@@ -1116,7 +1130,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         if (const_args[2]) {
             tgen_arithi(s, c, args[0], args[2], 0);
         } else {
-            tcg_out_modrm(s, 0x01 | (c << 3), args[2], args[0]);
+            tgen_arithr(s, c, args[0], args[2]);
         }
         break;
     case INDEX_op_mul_i32:
@@ -1166,24 +1180,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         goto gen_shift32;
 
     case INDEX_op_add2_i32:
-        if (const_args[4]) 
+        if (const_args[4]) {
             tgen_arithi(s, ARITH_ADD, args[0], args[4], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_ADD << 3), args[4], args[0]);
-        if (const_args[5]) 
+        } else {
+            tgen_arithr(s, ARITH_ADD, args[0], args[4]);
+        }
+        if (const_args[5]) {
             tgen_arithi(s, ARITH_ADC, args[1], args[5], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_ADC << 3), args[5], args[1]);
+        } else {
+            tgen_arithr(s, ARITH_ADC, args[1], args[5]);
+        }
         break;
     case INDEX_op_sub2_i32:
-        if (const_args[4]) 
+        if (const_args[4]) {
             tgen_arithi(s, ARITH_SUB, args[0], args[4], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_SUB << 3), args[4], args[0]);
-        if (const_args[5]) 
+        } else {
+            tgen_arithr(s, ARITH_SUB, args[0], args[4]);
+        }
+        if (const_args[5]) {
             tgen_arithi(s, ARITH_SBB, args[1], args[5], 1);
-        else
-            tcg_out_modrm(s, 0x01 | (ARITH_SBB << 3), args[5], args[1]);
+        } else {
+            tgen_arithr(s, ARITH_SBB, args[1], args[5]);
+        }
         break;
     case INDEX_op_brcond_i32:
         tcg_out_brcond(s, args[2], args[0], args[1], const_args[1],
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 12/21] tcg-i386: Tidy movi.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (10 preceding siblings ...)
  2010-04-14 17:16 ` [Qemu-devel] [PATCH 11/21] tcg-i386: Tidy non-immediate " Richard Henderson
@ 2010-04-14 17:20 ` Richard Henderson
  2010-04-14 17:59 ` [Qemu-devel] [PATCH 13/21] tcg-i386: Tidy push/pop Richard Henderson
                   ` (8 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 17:20 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define and use OPC_MOVL_Iv.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 1243759..76b36aa 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -178,6 +178,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
+#define OPC_MOVL_Iv     (0xb8)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
@@ -305,7 +306,7 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
     if (arg == 0) {
         tgen_arithr(s, ARITH_XOR, ret, ret);
     } else {
-        tcg_out8(s, 0xb8 + ret);
+        tcg_out8(s, OPC_MOVL_Iv + ret);
         tcg_out32(s, arg);
     }
 }
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 13/21] tcg-i386: Tidy push/pop.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (11 preceding siblings ...)
  2010-04-14 17:20 ` [Qemu-devel] [PATCH 12/21] tcg-i386: Tidy movi Richard Henderson
@ 2010-04-14 17:59 ` Richard Henderson
  2010-04-14 18:02 ` [Qemu-devel] [PATCH 14/21] tcg-i386: Tidy calls Richard Henderson
                   ` (7 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 17:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Move tcg_out_push/pop up in the file so that they can be used
by qemu_ld/st.  Define a tcg_out_pushi to be used as well.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   50 +++++++++++++++++++++++++++++++-----------------
 1 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 76b36aa..044d7fe 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -183,6 +183,10 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
+#define OPC_POP_r32	(0x58)
+#define OPC_PUSH_r32	(0x50)
+#define OPC_PUSH_Iv	(0x68)
+#define OPC_PUSH_Ib	(0x6a)
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
@@ -311,6 +315,29 @@ static inline void tcg_out_movi(TCGContext *s, TCGType type,
     }
 }
 
+#if defined(CONFIG_SOFTMMU)
+static void tcg_out_pushi(TCGContext *s, tcg_target_long val)
+{
+    if (val == (int8_t)val) {
+        tcg_out_opc(s, OPC_PUSH_Ib);
+        tcg_out8(s, val);
+    } else {
+        tcg_out_opc(s, OPC_PUSH_Iv);
+        tcg_out32(s, val);
+    }
+}
+#endif
+
+static inline void tcg_out_push(TCGContext *s, int reg)
+{
+    tcg_out_opc(s, OPC_PUSH_r32 + reg);
+}
+
+static inline void tcg_out_pop(TCGContext *s, int reg)
+{
+    tcg_out_opc(s, OPC_POP_r32 + reg);
+}
+
 static inline void tcg_out_ld(TCGContext *s, TCGType type, int ret,
                               int arg1, tcg_target_long arg2)
 {
@@ -912,8 +939,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     if (opc == 3) {
         tcg_out_mov(s, TCG_REG_EDX, data_reg);
         tcg_out_mov(s, TCG_REG_ECX, data_reg2);
-        tcg_out8(s, 0x6a); /* push Ib */
-        tcg_out8(s, mem_index);
+        tcg_out_pushi(s, mem_index);
         tcg_out8(s, 0xe8);
         tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
                   (tcg_target_long)s->code_ptr - 4);
@@ -938,10 +964,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #else
     if (opc == 3) {
         tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
-        tcg_out8(s, 0x6a); /* push Ib */
-        tcg_out8(s, mem_index);
-        tcg_out_opc(s, 0x50 + data_reg2); /* push */
-        tcg_out_opc(s, 0x50 + data_reg); /* push */
+        tcg_out_pushi(s, mem_index);
+        tcg_out_push(s, data_reg2);
+        tcg_out_push(s, data_reg);
         tcg_out8(s, 0xe8);
         tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
                   (tcg_target_long)s->code_ptr - 4);
@@ -959,8 +984,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
             tcg_out_mov(s, TCG_REG_ECX, data_reg);
             break;
         }
-        tcg_out8(s, 0x6a); /* push Ib */
-        tcg_out8(s, mem_index);
+        tcg_out_pushi(s, mem_index);
         tcg_out8(s, 0xe8);
         tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
                   (tcg_target_long)s->code_ptr - 4);
@@ -1374,16 +1398,6 @@ static int tcg_target_callee_save_regs[] = {
     TCG_REG_EDI,
 };
 
-static inline void tcg_out_push(TCGContext *s, int reg)
-{
-    tcg_out_opc(s, 0x50 + reg);
-}
-
-static inline void tcg_out_pop(TCGContext *s, int reg)
-{
-    tcg_out_opc(s, 0x58 + reg);
-}
-
 /* Generate global QEMU prologue and epilogue code */
 void tcg_target_qemu_prologue(TCGContext *s)
 {
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 14/21] tcg-i386: Tidy calls.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (12 preceding siblings ...)
  2010-04-14 17:59 ` [Qemu-devel] [PATCH 13/21] tcg-i386: Tidy push/pop Richard Henderson
@ 2010-04-14 18:02 ` Richard Henderson
  2010-04-14 18:04 ` [Qemu-devel] [PATCH 15/21] tcg-i386: Tidy ret Richard Henderson
                   ` (6 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_CALL_Jz, generated by tcg_out_calli; use the latter
throughout.  Unify the calls within qemu_st; adjust the stack
with a single pop if applicable.

Define and use EXT_CALLN_Ev for indirect calls.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   49 +++++++++++++++++++++++++++----------------------
 1 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 044d7fe..f6399ab 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -167,6 +167,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_ARITH_EvIb	(0x83)
 #define OPC_ARITH_GvEv	(0x03)		/* ... plus (ARITH_FOO << 3) */
 #define OPC_ADD_GvEv	(OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_CALL_Jz	(0xe8)
 #define OPC_CMP_GvEv	(OPC_ARITH_GvEv | (ARITH_CMP << 3))
 #define OPC_DEC_r32	(0x48)
 #define OPC_BSWAP	(0xc8 | P_EXT)
@@ -210,6 +211,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define SHIFT_SAR 7
 
 /* Group 5 opcode extensions for 0xff.  */
+#define EXT_CALLN_Ev	2
 #define EXT_JMPN_Ev	4
 
 /* Condition codes to be added to OPC_JCC_{long,short}.  */
@@ -647,6 +649,12 @@ static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
     }
 }
 
+static void tcg_out_calli(TCGContext *s, tcg_target_long dest)
+{
+    tcg_out_opc(s, OPC_CALL_Jz);
+    tcg_out32(s, dest - (tcg_target_long)s->code_ptr - 4);
+}
+
 #if defined(CONFIG_SOFTMMU)
 
 #include "../../softmmu_defs.h"
@@ -751,9 +759,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
     tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, mem_index);
 #endif
-    tcg_out8(s, 0xe8);
-    tcg_out32(s, (tcg_target_long)qemu_ld_helpers[s_bits] - 
-              (tcg_target_long)s->code_ptr - 4);
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
 
     switch(opc) {
     case 0 | 4:
@@ -867,6 +873,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 {
     int addr_reg, data_reg, data_reg2, r0, r1, mem_index, s_bits, bswap;
 #if defined(CONFIG_SOFTMMU)
+    int stack_adjust;
     uint8_t *label1_ptr, *label2_ptr;
 #endif
 #if TARGET_LONG_BITS == 64
@@ -940,10 +947,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_mov(s, TCG_REG_EDX, data_reg);
         tcg_out_mov(s, TCG_REG_ECX, data_reg2);
         tcg_out_pushi(s, mem_index);
-        tcg_out8(s, 0xe8);
-        tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
-                  (tcg_target_long)s->code_ptr - 4);
-        tcg_out_addi(s, TCG_REG_ESP, 4);
+        stack_adjust = 4;
     } else {
         switch(opc) {
         case 0:
@@ -957,9 +961,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
             break;
         }
         tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, mem_index);
-        tcg_out8(s, 0xe8);
-        tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
-                  (tcg_target_long)s->code_ptr - 4);
+        stack_adjust = 0;
     }
 #else
     if (opc == 3) {
@@ -967,10 +969,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
         tcg_out_pushi(s, mem_index);
         tcg_out_push(s, data_reg2);
         tcg_out_push(s, data_reg);
-        tcg_out8(s, 0xe8);
-        tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
-                  (tcg_target_long)s->code_ptr - 4);
-        tcg_out_addi(s, TCG_REG_ESP, 12);
+        stack_adjust = 12;
     } else {
         tcg_out_mov(s, TCG_REG_EDX, addr_reg2);
         switch(opc) {
@@ -985,13 +984,19 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
             break;
         }
         tcg_out_pushi(s, mem_index);
-        tcg_out8(s, 0xe8);
-        tcg_out32(s, (tcg_target_long)qemu_st_helpers[s_bits] - 
-                  (tcg_target_long)s->code_ptr - 4);
-        tcg_out_addi(s, TCG_REG_ESP, 4);
+        stack_adjust = 4;
     }
 #endif
-    
+
+    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
+
+    if (stack_adjust == 4) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_ESP, stack_adjust);
+    }
+
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label2_ptr = s->code_ptr;
@@ -1085,10 +1090,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
     case INDEX_op_call:
         if (const_args[0]) {
-            tcg_out8(s, 0xe8);
-            tcg_out32(s, args[0] - (tcg_target_long)s->code_ptr - 4);
+            tcg_out_calli(s, args[0]);
         } else {
-            tcg_out_modrm(s, 0xff, 2, args[0]);
+            /* call *reg */
+            tcg_out_modrm(s, 0xff, EXT_CALLN_Ev, args[0]);
         }
         break;
     case INDEX_op_jmp:
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 15/21] tcg-i386: Tidy ret.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (13 preceding siblings ...)
  2010-04-14 18:02 ` [Qemu-devel] [PATCH 14/21] tcg-i386: Tidy calls Richard Henderson
@ 2010-04-14 18:04 ` Richard Henderson
  2010-04-14 18:07 ` [Qemu-devel] [PATCH 16/21] tcg-i386: Tidy setcc Richard Henderson
                   ` (5 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define and use OPC_RET.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index f6399ab..376ac7b 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -188,6 +188,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_PUSH_r32	(0x50)
 #define OPC_PUSH_Iv	(0x68)
 #define OPC_PUSH_Ib	(0x6a)
+#define OPC_RET		(0xc3)
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
@@ -1429,7 +1430,7 @@ void tcg_target_qemu_prologue(TCGContext *s)
     for(i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
     }
-    tcg_out8(s, 0xc3); /* ret */
+    tcg_out_opc(s, OPC_RET);
 }
 
 void tcg_target_init(TCGContext *s)
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 16/21] tcg-i386: Tidy setcc.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (14 preceding siblings ...)
  2010-04-14 18:04 ` [Qemu-devel] [PATCH 15/21] tcg-i386: Tidy ret Richard Henderson
@ 2010-04-14 18:07 ` Richard Henderson
  2010-04-14 18:22 ` [Qemu-devel] [PATCH 17/21] tcg-i386: Tidy unary arithmetic Richard Henderson
                   ` (4 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:07 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define and use OPC_SETCC.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 376ac7b..af051b0 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -189,6 +189,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_PUSH_Iv	(0x68)
 #define OPC_PUSH_Ib	(0x6a)
 #define OPC_RET		(0xc3)
+#define OPC_SETCC	(0x90 | P_EXT)	/* ... plus condition code */
 #define OPC_SHIFT_1	(0xd1)
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
@@ -603,8 +604,7 @@ static void tcg_out_setcond(TCGContext *s, TCGCond cond, TCGArg dest,
                             TCGArg arg1, TCGArg arg2, int const_arg2)
 {
     tcg_out_cmp(s, arg1, arg2, const_arg2);
-    /* setcc */
-    tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT, 0, dest);
+    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
     tcg_out_ext8u(s, dest, dest);
 }
 
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 17/21] tcg-i386: Tidy unary arithmetic.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (15 preceding siblings ...)
  2010-04-14 18:07 ` [Qemu-devel] [PATCH 16/21] tcg-i386: Tidy setcc Richard Henderson
@ 2010-04-14 18:22 ` Richard Henderson
  2010-04-14 18:29 ` [Qemu-devel] [PATCH 18/21] tcg-i386: Tidy multiply Richard Henderson
                   ` (3 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:22 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define OPC_GRP3 and EXT3_FOO to match.  Use them instead of
bare constants.

Define OPC_GRP5 and rename the existing EXT_BAR to EXT5_BAR to
make it clear which extension should be used with which opcode.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   38 +++++++++++++++++++++++++-------------
 1 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index af051b0..484e789 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -195,7 +195,11 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_cl	(0xd3)
 #define OPC_TESTL	(0x85)
 
-/* Group 1 opcode extensions for 0x80-0x83.  */
+#define OPC_GRP3_Ev	(0xf7)
+#define OPC_GRP5	(0xff)
+
+/* Group 1 opcode extensions for 0x80-0x83.
+   These are also used as modifiers for OPC_ARITH.  */
 #define ARITH_ADD 0
 #define ARITH_OR  1
 #define ARITH_ADC 2
@@ -212,9 +216,17 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define SHIFT_SHR 5
 #define SHIFT_SAR 7
 
-/* Group 5 opcode extensions for 0xff.  */
-#define EXT_CALLN_Ev	2
-#define EXT_JMPN_Ev	4
+/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
+#define EXT3_NOT   2
+#define EXT3_NEG   3
+#define EXT3_MUL   4
+#define EXT3_IMUL  5
+#define EXT3_DIV   6
+#define EXT3_IDIV  7
+
+/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
+#define EXT5_CALLN_Ev	2
+#define EXT5_JMPN_Ev	4
 
 /* Condition codes to be added to OPC_JCC_{long,short}.  */
 #define JCC_JMP (-1)
@@ -1084,7 +1096,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out32(s, 0);
         } else {
             /* indirect jump method */
-            tcg_out_modrm_offset(s, 0xff, EXT_JMPN_Ev, -1, 
+            tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1, 
                                  (tcg_target_long)(s->tb_next + args[0]));
         }
         s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf;
@@ -1094,7 +1106,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out_calli(s, args[0]);
         } else {
             /* call *reg */
-            tcg_out_modrm(s, 0xff, EXT_CALLN_Ev, args[0]);
+            tcg_out_modrm(s, OPC_GRP5, EXT5_CALLN_Ev, args[0]);
         }
         break;
     case INDEX_op_jmp:
@@ -1103,7 +1115,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             tcg_out32(s, args[0] - (tcg_target_long)s->code_ptr - 4);
         } else {
             /* jmp *reg */
-            tcg_out_modrm(s, 0xff, EXT_JMPN_Ev, args[0]);
+            tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, args[0]);
         }
         break;
     case INDEX_op_br:
@@ -1180,13 +1192,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
     case INDEX_op_mulu2_i32:
-        tcg_out_modrm(s, 0xf7, 4, args[3]);
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_MUL, args[3]);
         break;
     case INDEX_op_div2_i32:
-        tcg_out_modrm(s, 0xf7, 7, args[4]);
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_IDIV, args[4]);
         break;
     case INDEX_op_divu2_i32:
-        tcg_out_modrm(s, 0xf7, 6, args[4]);
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_DIV, args[4]);
         break;
     case INDEX_op_shl_i32:
         c = SHIFT_SHL;
@@ -1250,11 +1262,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_neg_i32:
-        tcg_out_modrm(s, 0xf7, 3, args[0]);
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, args[0]);
         break;
 
     case INDEX_op_not_i32:
-        tcg_out_modrm(s, 0xf7, 2, args[0]);
+        tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NOT, args[0]);
         break;
 
     case INDEX_op_ext8s_i32:
@@ -1422,7 +1434,7 @@ void tcg_target_qemu_prologue(TCGContext *s)
     stack_addend = frame_size - push_size;
     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
 
-    tcg_out_modrm(s, 0xff, EXT_JMPN_Ev, TCG_REG_EAX); /* jmp *%eax */
+    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_EAX); /* jmp *%eax */
     
     /* TB epilogue */
     tb_ret_addr = s->code_ptr;
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 18/21] tcg-i386: Tidy multiply.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (16 preceding siblings ...)
  2010-04-14 18:22 ` [Qemu-devel] [PATCH 17/21] tcg-i386: Tidy unary arithmetic Richard Henderson
@ 2010-04-14 18:29 ` Richard Henderson
  2010-04-14 18:32 ` [Qemu-devel] [PATCH 19/21] tcg-i386: Tidy xchg Richard Henderson
                   ` (2 subsequent siblings)
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:29 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define and use OPC_IMUL_GvEv{,Ib,Iz}.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 484e789..b806d18 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -184,6 +184,9 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_MOVZWL	(0xb7 | P_EXT)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
+#define OPC_IMUL_GvEv	(0xaf | P_EXT)
+#define OPC_IMUL_GvEvIb	(0x6b)
+#define OPC_IMUL_GvEvIz	(0x69)
 #define OPC_POP_r32	(0x58)
 #define OPC_PUSH_r32	(0x50)
 #define OPC_PUSH_Iv	(0x68)
@@ -1181,14 +1184,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
             int32_t val;
             val = args[2];
             if (val == (int8_t)val) {
-                tcg_out_modrm(s, 0x6b, args[0], args[0]);
+                tcg_out_modrm(s, OPC_IMUL_GvEvIb, args[0], args[0]);
                 tcg_out8(s, val);
             } else {
-                tcg_out_modrm(s, 0x69, args[0], args[0]);
+                tcg_out_modrm(s, OPC_IMUL_GvEvIz, args[0], args[0]);
                 tcg_out32(s, val);
             }
         } else {
-            tcg_out_modrm(s, 0xaf | P_EXT, args[0], args[2]);
+            tcg_out_modrm(s, OPC_IMUL_GvEv, args[0], args[2]);
         }
         break;
     case INDEX_op_mulu2_i32:
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 19/21] tcg-i386: Tidy xchg.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (17 preceding siblings ...)
  2010-04-14 18:29 ` [Qemu-devel] [PATCH 18/21] tcg-i386: Tidy multiply Richard Henderson
@ 2010-04-14 18:32 ` Richard Henderson
  2010-04-14 19:08 ` [Qemu-devel] [PATCH 20/21] tcg-i386: Tidy lea Richard Henderson
  2010-04-14 20:29 ` [Qemu-devel] [PATCH 21/21] tcg-i386: Use lea for three-operand add Richard Henderson
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 18:32 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Define and use OPC_XCHG_ax_r32.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index b806d18..d8367b0 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -197,6 +197,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_SHIFT_Ib	(0xc1)
 #define OPC_SHIFT_cl	(0xd3)
 #define OPC_TESTL	(0x85)
+#define OPC_XCHG_ax_r32	(0x90)
 
 #define OPC_GRP3_Ev	(0xf7)
 #define OPC_GRP5	(0xff)
@@ -796,7 +797,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
         break;
     case 3:
         if (data_reg == TCG_REG_EDX) {
-            tcg_out_opc(s, 0x90 + TCG_REG_EDX); /* xchg %edx, %eax */
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX);
             tcg_out_mov(s, data_reg2, TCG_REG_EAX);
         } else {
             tcg_out_mov(s, data_reg, TCG_REG_EAX);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 20/21] tcg-i386: Tidy lea.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (18 preceding siblings ...)
  2010-04-14 18:32 ` [Qemu-devel] [PATCH 19/21] tcg-i386: Tidy xchg Richard Henderson
@ 2010-04-14 19:08 ` Richard Henderson
  2010-04-14 20:29 ` [Qemu-devel] [PATCH 21/21] tcg-i386: Use lea for three-operand add Richard Henderson
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 19:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Implement full modrm+sib addressing mode processing.
Use that in qemu_ld/st to output the LEA.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   91 ++++++++++++++++++++++++++++++++-----------------
 1 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index d8367b0..4dec422 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -176,6 +176,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
 #define OPC_JCC_short	(0x70)		/* ... plus condition code */
 #define OPC_JMP_long	(0xe9)
 #define OPC_JMP_short	(0xeb)
+#define OPC_LEA         (0x8d)
 #define OPC_MOVB_EvGv	(0x88)		/* stores, more or less */
 #define OPC_MOVL_EvGv	(0x89)		/* stores, more or less */
 #define OPC_MOVL_GvEv	(0x8b)		/* loads, more or less */
@@ -277,40 +278,70 @@ static inline void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
     tcg_out8(s, 0xc0 | (r << 3) | rm);
 }
 
-/* rm == -1 means no register index */
-static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm, 
-                                        int32_t offset)
+/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.  
+   We handle either RM and INDEX missing with a -1 value.  */
+
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+                                     int index, int shift, int32_t offset)
 {
+    int mod, len;
+
+    if (index == -1 && rm == -1) {
+        /* Absolute address.  */
+        tcg_out_opc(s, opc);
+        tcg_out8(s, (r << 3) | 5);
+        tcg_out32(s, offset);
+        return;
+    }
+
     tcg_out_opc(s, opc);
+
+    /* Find the length of the immediate addend.  Note that the encoding
+       that would be used for (%ebp) indicates absolute addressing.  */
     if (rm == -1) {
-        tcg_out8(s, 0x05 | (r << 3));
-        tcg_out32(s, offset);
+        mod = 0, len = 4, rm = 5;
     } else if (offset == 0 && rm != TCG_REG_EBP) {
-        if (rm == TCG_REG_ESP) {
-            tcg_out8(s, 0x04 | (r << 3));
-            tcg_out8(s, 0x24);
-        } else {
-            tcg_out8(s, 0x00 | (r << 3) | rm);
-        }
-    } else if ((int8_t)offset == offset) {
-        if (rm == TCG_REG_ESP) {
-            tcg_out8(s, 0x44 | (r << 3));
-            tcg_out8(s, 0x24);
-        } else {
-            tcg_out8(s, 0x40 | (r << 3) | rm);
-        }
-        tcg_out8(s, offset);
+        mod = 0, len = 0;
+    } else if (offset == (int8_t)offset) {
+        mod = 0x40, len = 1;
     } else {
-        if (rm == TCG_REG_ESP) {
-            tcg_out8(s, 0x84 | (r << 3));
-            tcg_out8(s, 0x24);
+        mod = 0x80, len = 4;
+    }
+
+    /* Use a single byte MODRM format if possible.  Note that the encoding
+       that would be used for %esp is the escape to the two byte form.  */
+    if (index == -1 && rm != TCG_REG_ESP) {
+        /* Single byte MODRM format.  */
+        tcg_out8(s, mod | (r << 3) | rm);
+    } else {
+        /* Two byte MODRM+SIB format.  */
+
+        /* Note that the encoding that would place %esp into the index
+           field indicates no index register.  */
+        if (index == -1) {
+            index = 4;
         } else {
-            tcg_out8(s, 0x80 | (r << 3) | rm);
+            assert(index != TCG_REG_ESP);
         }
+
+        tcg_out8(s, mod | (r << 3) | 4);
+        tcg_out8(s, (shift << 6) | (index << 3) | rm);
+    }
+
+    if (len == 1) {
+        tcg_out8(s, offset);
+    } else if (len == 4) {
         tcg_out32(s, offset);
     }
 }
 
+/* rm == -1 means no register index */
+static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm, 
+                                        int32_t offset)
+{
+    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
+}
+
 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 {
@@ -736,10 +767,9 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tgen_arithi(s, ARITH_AND, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
     tgen_arithi(s, ARITH_AND, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
-    tcg_out_opc(s, 0x8d); /* lea offset(r1, %ebp), r1 */
-    tcg_out8(s, 0x80 | (r1 << 3) | 0x04);
-    tcg_out8(s, (5 << 3) | r1);
-    tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_read));
+    tcg_out_modrm_sib_offset(s, OPC_LEA, r1, TCG_AREG0, r1, 0,
+                             offsetof(CPUState,
+                                      tlb_table[mem_index][0].addr_read));
 
     /* cmp 0(r1), r0 */
     tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
@@ -926,10 +956,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tgen_arithi(s, ARITH_AND, r0, TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
     tgen_arithi(s, ARITH_AND, r1, (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
 
-    tcg_out_opc(s, 0x8d); /* lea offset(r1, %ebp), r1 */
-    tcg_out8(s, 0x80 | (r1 << 3) | 0x04);
-    tcg_out8(s, (5 << 3) | r1);
-    tcg_out32(s, offsetof(CPUState, tlb_table[mem_index][0].addr_write));
+    tcg_out_modrm_sib_offset(s, OPC_LEA, r1, TCG_AREG0, r1, 0,
+                             offsetof(CPUState,
+                                      tlb_table[mem_index][0].addr_write));
 
     /* cmp 0(r1), r0 */
     tcg_out_modrm_offset(s, OPC_CMP_GvEv, r0, r1, 0);
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 21/21] tcg-i386: Use lea for three-operand add.
  2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
                   ` (19 preceding siblings ...)
  2010-04-14 19:08 ` [Qemu-devel] [PATCH 20/21] tcg-i386: Tidy lea Richard Henderson
@ 2010-04-14 20:29 ` Richard Henderson
  20 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 20:29 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

The result is shorter than the mov+add that TCG would
otherwise generate for us.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |   23 ++++++++++++++++++++---
 1 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4dec422..46e4574 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1189,6 +1189,25 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_st_i32:
         tcg_out_st(s, TCG_TYPE_I32, args[0], args[1], args[2]);
         break;
+    case INDEX_op_add_i32:
+        /* For 3-operand addition, use LEA.  */
+        if (args[0] != args[1]) {
+            TCGArg a0 = args[0], a1 = args[1], a2 = args[2], c3 = 0;
+
+            if (const_args[2]) {
+                c3 = a2, a2 = -1;
+            } else if (a0 == a2) {
+                /* Watch out for dest = src + dest, since we've removed
+                   the matching constraint on the add.  */
+                tgen_arithr(s, ARITH_ADD, a0, a1);
+                break;
+            }
+
+            tcg_out_modrm_sib_offset(s, OPC_LEA, a0, a1, a2, 0, c3);
+            break;
+        }
+        c = ARITH_ADD;
+        goto gen_arith;
     case INDEX_op_sub_i32:
         c = ARITH_SUB;
         goto gen_arith;
@@ -1201,8 +1220,6 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_xor_i32:
         c = ARITH_XOR;
         goto gen_arith;
-    case INDEX_op_add_i32:
-        c = ARITH_ADD;
     gen_arith:
         if (const_args[2]) {
             tgen_arithi(s, c, args[0], args[2], 0);
@@ -1377,7 +1394,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_st16_i32, { "r", "r" } },
     { INDEX_op_st_i32, { "r", "r" } },
 
-    { INDEX_op_add_i32, { "r", "0", "ri" } },
+    { INDEX_op_add_i32, { "r", "r", "ri" } },
     { INDEX_op_sub_i32, { "r", "0", "ri" } },
     { INDEX_op_mul_i32, { "r", "0", "ri" } },
     { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } },
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement
@ 2010-04-14 20:35 Richard Henderson
  2010-04-13 22:23 ` [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first Richard Henderson
                   ` (20 more replies)
  0 siblings, 21 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-14 20:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: aurelien

Most of this patch series replaces the use of hard-coded constants
with symbolic definitions of the opcode space.  There are a few
changes to code generation scattered within, as I came across them.

After the review cycle on this sequence, I'll do the same for the
x86-64 port.  Although I do begin to wonder if we couldn't usefully
merge the two ports; the REX handling can't be that hard to have go
away under appropriate conditional compilation...


r~


Richard Henderson (21):
  tcg-i386: Allocate call-saved registers first.
  tcg-i386: Tidy initialization of tcg_target_call_clobber_regs.
  tcg-i386: Tidy ext8u and ext16u operations.
  tcg-i386: Tidy ext8s and ext16s operations.
  tcg-i386: Tidy bswap operations.
  tcg-i386: Tidy shift operations.
  tcg-i386: Tidy move operations.
  tcg-i386: Eliminate extra move from qemu_ld64.
  tcg-i386: Tidy jumps.
  tcg-i386: Tidy immediate arithmetic operations.
  tcg-i386: Tidy non-immediate arithmetic operations.
  tcg-i386: Tidy movi.
  tcg-i386: Tidy push/pop.
  tcg-i386: Tidy calls.
  tcg-i386: Tidy ret.
  tcg-i386: Tidy setcc.
  tcg-i386: Tidy unary arithmetic.
  tcg-i386: Tidy multiply.
  tcg-i386: Tidy xchg.
  tcg-i386: Tidy lea.
  tcg-i386: Use lea for three-operand add.

 tcg/i386/tcg-target.c |  708 +++++++++++++++++++++++++++++--------------------
 1 files changed, 427 insertions(+), 281 deletions(-)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
  2010-04-13 23:33 ` [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations Richard Henderson
@ 2010-04-18 22:13   ` Aurelien Jarno
  2010-04-19 13:56     ` Richard Henderson
  0 siblings, 1 reply; 26+ messages in thread
From: Aurelien Jarno @ 2010-04-18 22:13 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
> Define OPC_BSWAP.  Factor opcode emission to separate functions.
> Use bswap+shift to implement 16-bit swap instead of a rolw; this
> gets the proper zero-extension required by INDEX_op_bswap16_i32.

This is not required by INDEX_op_bswap16_i32. What is needed is that the
value in the input register has the 16 upper bits set to 0. Considering
that, the rolw instruction is faster than bswap + shift.

> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  tcg/i386/tcg-target.c |   53 +++++++++++++++++++++++++------------------------
>  1 files changed, 27 insertions(+), 26 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 75b9915..0bafd00 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -163,6 +163,7 @@ static inline int tcg_target_const_match(tcg_target_long val,
>  
>  #define P_EXT   0x100 /* 0x0f opcode prefix */
>  
> +#define OPC_BSWAP	(0xc8 | P_EXT)
>  #define OPC_MOVZBL	(0xb6 | P_EXT)
>  #define OPC_MOVZWL	(0xb7 | P_EXT)
>  #define OPC_MOVSBL	(0xbe | P_EXT)

> @@ -339,6 +340,22 @@ static inline void tcg_out_ext16s(TCGContext *s, int dest, int src)
>      tcg_out_modrm(s, OPC_MOVSWL, dest, src);
>  }
>  
> +static inline void tcg_out_bswap32(TCGContext *s, int reg)
> +{
> +    tcg_out_opc(s, OPC_BSWAP + reg);
> +}
> +
> +static inline void tcg_out_bswap16(TCGContext *s, int reg, int sign)
> +{
> +    /* This swap+shift combination guarantees that the high part contains
> +       the sign or zero extension required.  It also doesn't suffer the
> +       problem of partial register stalls that using rolw does.  */
> +    tcg_out_bswap32(s, reg);
> +    /* shr $16, dest */
> +    tcg_out_modrm(s, 0xc1, (sign ? SHIFT_SAR : SHIFT_SHR), reg);
> +    tcg_out8(s, 16);
> +}
> +
>  static inline void tgen_arithi(TCGContext *s, int c, int r0, int32_t val, int cf)
>  {
>      if (!cf && ((c == ARITH_ADD && val == 1) || (c == ARITH_SUB && val == -1))) {
> @@ -745,31 +762,21 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>          /* movzwl */
>          tcg_out_modrm_offset(s, OPC_MOVZWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> +            tcg_out_bswap16(s, data_reg, 0);
>          }
>          break;
>      case 1 | 4:
>          /* movswl */
>          tcg_out_modrm_offset(s, OPC_MOVSWL, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* rolw $8, data_reg */
> -            tcg_out8(s, 0x66); 
> -            tcg_out_modrm(s, 0xc1, 0, data_reg);
> -            tcg_out8(s, 8);
> -
> -            /* movswl data_reg, data_reg */
> -            tcg_out_modrm(s, OPC_MOVSWL, data_reg, data_reg);
> +            tcg_out_bswap16(s, data_reg, 1);
>          }
>          break;
>      case 2:
>          /* movl (r0), data_reg */
>          tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE);
>          if (bswap) {
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>          }
>          break;
>      case 3:
> @@ -786,11 +793,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x8b, data_reg, r0, GUEST_BASE + 4);
> -            tcg_out_opc(s, (0xc8 + data_reg) | P_EXT);
> +            tcg_out_bswap32(s, data_reg);
>  
>              tcg_out_modrm_offset(s, 0x8b, data_reg2, r0, GUEST_BASE);
> -            /* bswap */
> -            tcg_out_opc(s, (0xc8 + data_reg2) | P_EXT);
> +            tcg_out_bswap32(s, data_reg2);
>          }
>          break;
>      default:
> @@ -982,8 +988,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 2:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              data_reg = r1;
>          }
>          /* movl */
> @@ -992,12 +997,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
>      case 3:
>          if (bswap) {
>              tcg_out_mov(s, r1, data_reg2);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE);
>              tcg_out_mov(s, r1, data_reg);
> -            /* bswap data_reg */
> -            tcg_out_opc(s, (0xc8 + r1) | P_EXT);
> +            tcg_out_bswap32(s, r1);
>              tcg_out_modrm_offset(s, 0x89, r1, r0, GUEST_BASE + 4);
>          } else {
>              tcg_out_modrm_offset(s, 0x89, data_reg, r0, GUEST_BASE);
> @@ -1195,12 +1198,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_bswap16_i32:
> -        tcg_out8(s, 0x66);
> -        tcg_out_modrm(s, 0xc1, SHIFT_ROL, args[0]);
> -        tcg_out8(s, 8);
> +        tcg_out_bswap16(s, args[0], 0);
>          break;
>      case INDEX_op_bswap32_i32:
> -        tcg_out_opc(s, (0xc8 + args[0]) | P_EXT);
> +        tcg_out_bswap32(s, args[0]);
>          break;
>  
>      case INDEX_op_neg_i32:
> -- 
> 1.6.2.5
> 
> 
> 
> 

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
  2010-04-18 22:13   ` Aurelien Jarno
@ 2010-04-19 13:56     ` Richard Henderson
  2010-04-19 16:05       ` malc
  0 siblings, 1 reply; 26+ messages in thread
From: Richard Henderson @ 2010-04-19 13:56 UTC (permalink / raw)
  To: Aurelien Jarno; +Cc: qemu-devel

On 04/18/2010 05:13 PM, Aurelien Jarno wrote:
> On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
>> Define OPC_BSWAP.  Factor opcode emission to separate functions.
>> Use bswap+shift to implement 16-bit swap instead of a rolw; this
>> gets the proper zero-extension required by INDEX_op_bswap16_i32.
> 
> This is not required by INDEX_op_bswap16_i32. What is needed is that the
> value in the input register has the 16 upper bits set to 0.

Ah.

> Considering
> that, the rolw instruction is faster than bswap + shift.

Well, no, it isn't.

 /*
  * Time one pass of the "rolw" 16-bit byte-swap sequence with rdtsc.
  *
  * The timed region loads the halfword at *s zero-extended into a
  * register, rotates its low 16 bits by 8, then adds 1 to the full
  * 32-bit register (presumably so that a full-width consumer depends on
  * the 16-bit partial write).
  *
  * s: halfword to load; it is only read, never written.
  *
  * Returns the difference between the low 32 bits of the two rdtsc
  * readings.  The value is a raw tick count and varies between runs.
  */
 static inline int test_rolw(unsigned short *s)
 {
   int i, start, end;
   /*
    * "start" is written by the movl *before* the memory input %3 is read
    * by the movzwl, so it must be early-clobber ("=&r").  Otherwise the
    * compiler is free to place it in the register holding the address
    * of *s.  "end" is already early-clobber on %eax, and %edx (the other
    * rdtsc result register) is listed as clobbered.
    */
   asm volatile("rdtsc\n\t"
                "movl %%eax, %1\n\t"
                "movzwl %3,%2\n\t"
                "rolw $8, %w2\n\t"
                "addl $1,%2\n\t"
                "rdtsc"
                : "=&a"(end), "=&r"(start), "=r"(i) : "m"(*s) : "edx");
   return end - start;
 }
 
 /*
  * Time one pass of the "bswap + shift" 16-bit byte-swap sequence with
  * rdtsc.
  *
  * The timed region loads the halfword at *s zero-extended into a
  * register, byte-swaps the full 32-bit register, shifts the swapped
  * halfword back down into the low 16 bits, then adds 1 to the result.
  *
  * s: halfword to load; it is only read, never written.
  *
  * Returns the difference between the low 32 bits of the two rdtsc
  * readings.  The value is a raw tick count and varies between runs.
  */
 static inline int test_bswap(unsigned short *s)
 {
   int i, start, end;
   /*
    * After movzwl the register holds 0x0000HHLL; bswap turns that into
    * 0xLLHH0000, so the shift must be a *right* shift by 16 to obtain the
    * zero-extended swapped halfword 0x0000LLHH.  (A left shift would
    * discard the data and always leave 0.)
    *
    * "start" is written by the movl *before* the memory input %3 is read
    * by the movzwl, so it must be early-clobber ("=&r").  Otherwise the
    * compiler is free to place it in the register holding the address
    * of *s.  "end" is already early-clobber on %eax, and %edx (the other
    * rdtsc result register) is listed as clobbered.
    */
   asm volatile("rdtsc\n\t"
                "movl %%eax, %1\n\t"
                "movzwl %3,%2\n\t"
                "bswap %2\n\t"
                "shr $16,%2\n\t"
                "addl $1,%2\n\t"
                "rdtsc"
                : "=&a"(end), "=&r"(start), "=r"(i) : "m"(*s) : "edx");
   return end - start;
 }


model name	: Intel(R) Core(TM)2 Duo CPU     T7700  @ 2.40GHz
 rolw	   60   60   72   60   60   72   60   60   72   60
 bswap	   60   60   60   60   60   60   60   60   60   60

model name	: Dual-Core AMD Opteron(tm) Processor 1210
 rolw	    9   10    9    9    8    8    8    8    8    8
 bswap	    9    9    8    8    8    8    8    8    8    8

The rolw sequence isn't ever faster, and it's more unstable,
likely due to the partial register stall I mentioned.

I will grant that the rolw sequence is smaller, and I can 
adjust this patch to use that sequence if you wish.


r~

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
  2010-04-19 13:56     ` Richard Henderson
@ 2010-04-19 16:05       ` malc
  2010-04-19 19:19         ` Richard Henderson
  0 siblings, 1 reply; 26+ messages in thread
From: malc @ 2010-04-19 16:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Aurelien Jarno

On Mon, 19 Apr 2010, Richard Henderson wrote:

> On 04/18/2010 05:13 PM, Aurelien Jarno wrote:
> > On Tue, Apr 13, 2010 at 04:33:59PM -0700, Richard Henderson wrote:
> >> Define OPC_BSWAP.  Factor opcode emission to separate functions.
> >> Use bswap+shift to implement 16-bit swap instead of a rolw; this
> >> gets the proper zero-extension required by INDEX_op_bswap16_i32.
> > 
> > This is not required by INDEX_op_bswap16_i32. What is needed is that the
> > value in the input register has the 16 upper bits set to 0.
> 
> Ah.

Apparently I'm not the only one who misinterpreted this bit of bswap
documentation. How about:

diff --git a/tcg/README b/tcg/README
index 68d27ff..5b39a38 100644
--- a/tcg/README
+++ b/tcg/README
@@ -269,7 +269,7 @@ ext32u_i64 t0, t1
 * bswap16_i32/i64 t0, t1
 
 16 bit byte swap on a 32/64 bit value. It assumes that the two/six high order
-bytes are set to zero.
+bytes of t1 are set to zero.
 
 * bswap32_i32/i64 t0, t1
 

> 
> > Considering
> > that, the rolw instruction is faster than bswap + shift.
> 
> Well, no, it isn't.
> 
>  static inline int test_rolw(unsigned short *s)
>  {
>    int i, start, end;
>    asm volatile("rdtsc\n\t"
>                 "movl %%eax, %1\n\t"
>                 "movzwl %3,%2\n\t"
>                 "rolw $8, %w2\n\t"
>                 "addl $1,%2\n\t"
>                 "rdtsc"
>                 : "=&a"(end), "=r"(start), "=r"(i) : "m"(*s) : "edx");
>    return end - start;
>  }
>  
>  static inline int test_bswap(unsigned short *s)
>  {
>    int i, start, end;
>    asm volatile("rdtsc\n\t"
>                 "movl %%eax, %1\n\t"
>                 "movzwl %3,%2\n\t"
>                 "bswap %2\n\t"
>                 "shl $16,%2\n\t"
>                 "addl $1,%2\n\t"
>                 "rdtsc"
>                 : "=&a"(end), "=r"(start), "=r"(i) : "m"(*s) : "edx");
>    return end - start;
>  }
> 
> 
> model name	: Intel(R) Core(TM)2 Duo CPU     T7700  @ 2.40GHz
>  rolw	   60   60   72   60   60   72   60   60   72   60
>  bswap	   60   60   60   60   60   60   60   60   60   60
> 
> model name	: Dual-Core AMD Opteron(tm) Processor 1210
>  rolw	    9   10    9    9    8    8    8    8    8    8
>  bswap	    9    9    8    8    8    8    8    8    8    8
> 
> The rolw sequence isn't ever faster, and it's more unstable,
> likely due to the partial register stall I mentioned.
> 
> I will grant that the rolw sequence is smaller, and I can 
> adjust this patch to use that sequence if you wish.
> 
> 
> r~
> 
> 

-- 
mailto:av1474@comtv.ru

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations.
  2010-04-19 16:05       ` malc
@ 2010-04-19 19:19         ` Richard Henderson
  0 siblings, 0 replies; 26+ messages in thread
From: Richard Henderson @ 2010-04-19 19:19 UTC (permalink / raw)
  To: malc; +Cc: qemu-devel, Aurelien Jarno

On 04/19/2010 11:05 AM, malc wrote:
> Apparently I'm not the only one who misinterpreted this bit of bswap
> documentation. How about:
> 
> diff --git a/tcg/README b/tcg/README
> index 68d27ff..5b39a38 100644
> --- a/tcg/README
> +++ b/tcg/README
> @@ -269,7 +269,7 @@ ext32u_i64 t0, t1
>  * bswap16_i32/i64 t0, t1
>  
>  16 bit byte swap on a 32/64 bit value. It assumes that the two/six high order
> -bytes are set to zero.
> +bytes of t1 are set to zero.

Ok by me.  You should also adjust the bswap32 documentation just below.


r~

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2010-04-19 19:19 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-14 20:35 [Qemu-devel] [PATCH 00/21] tcg-i386 cleanup and improvement Richard Henderson
2010-04-13 22:23 ` [Qemu-devel] [PATCH 01/21] tcg-i386: Allocate call-saved registers first Richard Henderson
2010-04-13 22:26 ` [Qemu-devel] [PATCH 02/21] tcg-i386: Tidy initialization of tcg_target_call_clobber_regs Richard Henderson
2010-04-13 22:59 ` [Qemu-devel] [PATCH 03/21] tcg-i386: Tidy ext8u and ext16u operations Richard Henderson
2010-04-13 23:13 ` [Qemu-devel] [PATCH 04/21] tcg-i386: Tidy ext8s and ext16s operations Richard Henderson
2010-04-13 23:33 ` [Qemu-devel] [PATCH 05/21] tcg-i386: Tidy bswap operations Richard Henderson
2010-04-18 22:13   ` Aurelien Jarno
2010-04-19 13:56     ` Richard Henderson
2010-04-19 16:05       ` malc
2010-04-19 19:19         ` Richard Henderson
2010-04-13 23:44 ` [Qemu-devel] [PATCH 06/21] tcg-i386: Tidy shift operations Richard Henderson
2010-04-14 14:58 ` [Qemu-devel] [PATCH 07/21] tcg-i386: Tidy move operations Richard Henderson
2010-04-14 15:06 ` [Qemu-devel] [PATCH 08/21] tcg-i386: Eliminate extra move from qemu_ld64 Richard Henderson
2010-04-14 15:26 ` [Qemu-devel] [PATCH 09/21] tcg-i386: Tidy jumps Richard Henderson
2010-04-14 15:38 ` [Qemu-devel] [PATCH 10/21] tcg-i386: Tidy immediate arithmetic operations Richard Henderson
2010-04-14 17:16 ` [Qemu-devel] [PATCH 11/21] tcg-i386: Tidy non-immediate " Richard Henderson
2010-04-14 17:20 ` [Qemu-devel] [PATCH 12/21] tcg-i386: Tidy movi Richard Henderson
2010-04-14 17:59 ` [Qemu-devel] [PATCH 13/21] tcg-i386: Tidy push/pop Richard Henderson
2010-04-14 18:02 ` [Qemu-devel] [PATCH 14/21] tcg-i386: Tidy calls Richard Henderson
2010-04-14 18:04 ` [Qemu-devel] [PATCH 15/21] tcg-i386: Tidy ret Richard Henderson
2010-04-14 18:07 ` [Qemu-devel] [PATCH 16/21] tcg-i386: Tidy setcc Richard Henderson
2010-04-14 18:22 ` [Qemu-devel] [PATCH 17/21] tcg-i386: Tidy unary arithmetic Richard Henderson
2010-04-14 18:29 ` [Qemu-devel] [PATCH 18/21] tcg-i386: Tidy multiply Richard Henderson
2010-04-14 18:32 ` [Qemu-devel] [PATCH 19/21] tcg-i386: Tidy xchg Richard Henderson
2010-04-14 19:08 ` [Qemu-devel] [PATCH 20/21] tcg-i386: Tidy lea Richard Henderson
2010-04-14 20:29 ` [Qemu-devel] [PATCH 21/21] tcg-i386: Use lea for three-operand add Richard Henderson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.