Re: [Qemu-devel] [PATCH 16/25] tcg/i386: Handle ctz and clz opcodes

From: Richard Henderson <rth@twiddle.net>
To: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>,
	qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 16/25] tcg/i386: Handle ctz and clz opcodes
Date: Fri, 18 Nov 2016 00:03:08 +0100	[thread overview]
Message-ID: <f48d56ff-ab93-d3eb-97ed-d16c753c91f7@twiddle.net> (raw)
In-Reply-To: <0e841b99-5994-120d-bd4f-051b694d1ca4@mail.uni-paderborn.de>

[-- Attachment #1: Type: text/plain, Size: 1488 bytes --]

On 11/17/2016 11:09 PM, Bastian Koppelmann wrote:
> On 11/17/2016 08:59 PM, Richard Henderson wrote:
>> On 11/17/2016 08:53 PM, Richard Henderson wrote:
>>> On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
>>>> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>>>>> +
>>>>> +    OP_32_64(clz):
>>>>> +        if (const_args[2]) {
>>>>> +            tcg_debug_assert(have_bmi1);
>>>>> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>>>>> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>>>>> +        } else {
>>>>> +            /* ??? See above.  */
>>>>> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>>>>
>>>> The Intel ISA manual states that it find the bit index of the most
>>>> significant bit, where the least significant bit is index 0. So for the
>>>> input 0x2 this should return 1. However this is not the number of
>>>> leading zeros.
>>>
>>> Oh, of course you're right.  I thought I was testing this, but while
>>> alpha does
>>> have this operation, it turns out it isn't used much.
>>
>> Alternately, what I tested was on a haswell machine, which takes the
>> LZCNT path, which *does* produce the intended results.  Just the BSR
>> path doesn't.
>
> Luckily my old laptop is a Core 2 Duo without LZCNT :)

Heh.  Well, I've given it another few tests with LZCNT hacked off, and with 
i686 32-bit.  Here's an incremental update.  Wherein I also note that lzcnt 
isn't in the same cpuid flag as tzcnt.  Double whoops.


r~


[-- Attachment #2: 0001-fixup-tcg-i386.patch --]
[-- Type: text/x-patch, Size: 5865 bytes --]

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 3eeb58f..c3f7adc 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -139,6 +139,11 @@ static bool have_bmi2;
 #else
 # define have_bmi2 0
 #endif
+#if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
+static bool have_lzcnt;
+#else
+# define have_lzcnt 0
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -1148,6 +1153,76 @@ static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
 }
 #endif
 
+static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool a2const)
+{
+    if (a2const) {
+        tcg_debug_assert(have_bmi1);
+        tcg_debug_assert(arg2 == (rexw ? 64 : 32));
+        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
+    } else {
+        /* ??? The manual says that the output is undefined when the
+           input is zero, but real hardware leaves it unchanged.  As
+           noted in target-i386/translate.c, real programs depend on
+           this -- now we are one more of those.  */
+        /* ??? We could avoid this if TCG had an early clobber marking
+           for the output.  */
+        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
+        if (dest != arg2) {
+            tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
+        }
+    }
+}
+
+static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool a2const)
+{
+    TCGLabel *over;
+    TCGType type;
+    unsigned rev;
+
+    /* ??? All this would be easier (and would avoid the semi-undefined
+       behaviour) if TCG had an early clobber marking for the output.  */
+
+    if (have_lzcnt) {
+        if (a2const && arg2 == (rexw ? 64 : 32)) {
+            tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+            return;
+        }
+        if (!a2const && dest != arg2) {
+            tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
+            return;
+        }
+    }
+
+    over = gen_new_label();
+    type = rexw ? TCG_TYPE_I64: TCG_TYPE_I32;
+    rev = rexw ? 63 : 31;
+
+    tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
+
+    /* Recall that the output of BSR is the index not the count.
+       Therefore we must adjust the result by ^ (SIZE-1).  In some
+       cases below, we prefer an extra XOR to an extra JMP.  */
+    if (!a2const && dest == arg2) {
+        /* ??? See the comment in tcg_out_ctz re BSF.  */
+        tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_EQ], over, 1);
+        tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+        tcg_out_label(s, over, s->code_ptr);
+    } else {
+        tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_NE], over, 1);
+        if (a2const) {
+            tcg_out_movi(s, type, dest, arg2 ^ rev);
+        } else {
+            tcg_out_mov(s, type, dest, arg2);
+            tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+        }
+        tcg_out_label(s, over, s->code_ptr);
+        tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+    }
+}
+
 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
 {
     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
@@ -2024,34 +2099,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     OP_32_64(ctz):
-        if (const_args[2]) {
-            tcg_debug_assert(have_bmi1);
-            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
-            tcg_out_modrm(s, OPC_TZCNT + rexw, args[0], args[1]);
-        } else {
-            /* ??? The manual says that the output is undefined when the
-               input is zero, but real hardware leaves it unchanged.  As
-               noted in target-i386/translate.c, real programs depend on
-               this -- now we are one more of those.  */
-            tcg_out_modrm(s, OPC_BSF + rexw, args[0], args[1]);
-            if (args[0] != args[2]) {
-                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
-            }
-        }
+        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
         break;
-
     OP_32_64(clz):
-        if (const_args[2]) {
-            tcg_debug_assert(have_bmi1);
-            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
-            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
-        } else {
-            /* ??? See above.  */
-            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
-            if (args[0] != args[2]) {
-                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
-            }
-        }
+        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
         break;
 
     case INDEX_op_brcond_i32:
@@ -2281,7 +2332,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
-    { INDEX_op_clz_i32, { "r", "r", "rW" } },
+    { INDEX_op_clz_i32, { "r", "r", "ri" } },
     { INDEX_op_ctz_i32, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i32, { "r", "ri" } },
@@ -2344,7 +2395,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
-    { INDEX_op_clz_i64, { "r", "r", "rW" } },
+    { INDEX_op_clz_i64, { "r", "r", "re" } },
     { INDEX_op_ctz_i64, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i64, { "r", "re" } },
@@ -2498,6 +2549,10 @@ static void tcg_target_init(TCGContext *s)
            need to probe for it.  */
         have_movbe = (c & bit_MOVBE) != 0;
 #endif
+#ifndef have_lzcnt
+        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
+        have_lzcnt = (c & bit_LZCNT) != 0;
+#endif
     }
 
     if (max >= 7) {