All of lore.kernel.org
 help / color / mirror / Atom feed
From: Richard Henderson <rth@twiddle.net>
To: Bastian Koppelmann <kbastian@mail.uni-paderborn.de>,
	qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [PATCH 16/25] tcg/i386: Handle ctz and clz opcodes
Date: Fri, 18 Nov 2016 00:03:08 +0100	[thread overview]
Message-ID: <f48d56ff-ab93-d3eb-97ed-d16c753c91f7@twiddle.net> (raw)
In-Reply-To: <0e841b99-5994-120d-bd4f-051b694d1ca4@mail.uni-paderborn.de>

[-- Attachment #1: Type: text/plain, Size: 1488 bytes --]

On 11/17/2016 11:09 PM, Bastian Koppelmann wrote:
> On 11/17/2016 08:59 PM, Richard Henderson wrote:
>> On 11/17/2016 08:53 PM, Richard Henderson wrote:
>>> On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
>>>> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>>>>> +
>>>>> +    OP_32_64(clz):
>>>>> +        if (const_args[2]) {
>>>>> +            tcg_debug_assert(have_bmi1);
>>>>> +            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>>>>> +            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>>>>> +        } else {
>>>>> +            /* ??? See above.  */
>>>>> +            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>>>>
>>>> The Intel ISA manual states that it find the bit index of the most
>>>> significant bit, where the least significant bit is index 0. So for the
>>>> input 0x2 this should return 1. However this is not the number of
>>>> leading zeros.
>>>
>>> Oh, of course you're right.  I thought I was testing this, but while
>>> alpha does
>>> have this operation, it turns out it isn't used much.
>>
>> Alternately, what I tested was on a haswell machine, which takes the
>> LZCNT path, which *does* produce the intended results.  Just the BSR
>> path doesn't.
>
> Luckily my old laptop is a Core 2 Duo without LZCNT :)

Heh.  Well, I've run a few more tests, both with LZCNT hacked off and on
32-bit i686.  Here's an incremental update, in which I also note that lzcnt
isn't covered by the same cpuid flag as tzcnt.  Double whoops.


r~


[-- Attachment #2: 0001-fixup-tcg-i386.patch --]
[-- Type: text/x-patch, Size: 5865 bytes --]

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 3eeb58f..c3f7adc 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -139,6 +139,11 @@ static bool have_bmi2;
 #else
 # define have_bmi2 0
 #endif
+#if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
+static bool have_lzcnt;
+#else
+# define have_lzcnt 0
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -1148,6 +1153,76 @@ static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
 }
 #endif
 
+static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool a2const)
+{
+    if (a2const) {
+        tcg_debug_assert(have_bmi1);
+        tcg_debug_assert(arg2 == (rexw ? 64 : 32));
+        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
+    } else {
+        /* ??? The manual says that the output is undefined when the
+           input is zero, but real hardware leaves it unchanged.  As
+           noted in target-i386/translate.c, real programs depend on
+           this -- now we are one more of those.  */
+        /* ??? We could avoid this if TCG had an early clobber marking
+           for the output.  */
+        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
+        if (dest != arg2) {
+            tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
+        }
+    }
+}
+
+static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
+                        TCGArg arg2, bool a2const)
+{
+    TCGLabel *over;
+    TCGType type;
+    unsigned rev;
+
+    /* ??? All this would be easier (and would avoid the semi-undefined
+       behaviour) if TCG had an early clobber marking for the output.  */
+
+    if (have_lzcnt) {
+        if (a2const && arg2 == (rexw ? 64 : 32)) {
+            tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+            return;
+        }
+        if (!a2const && dest != arg2) {
+            tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
+            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
+            return;
+        }
+    }
+
+    over = gen_new_label();
+    type = rexw ? TCG_TYPE_I64: TCG_TYPE_I32;
+    rev = rexw ? 63 : 31;
+
+    tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
+
+    /* Recall that the output of BSR is the index not the count.
+       Therefore we must adjust the result by ^ (SIZE-1).  In some
+       cases below, we prefer an extra XOR to an extra JMP.  */
+    if (!a2const && dest == arg2) {
+        /* ??? See the comment in tcg_out_ctz re BSF.  */
+        tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_EQ], over, 1);
+        tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+        tcg_out_label(s, over, s->code_ptr);
+    } else {
+        tcg_out_jxx(s, tcg_cond_to_jcc[TCG_COND_NE], over, 1);
+        if (a2const) {
+            tcg_out_movi(s, type, dest, arg2 ^ rev);
+        } else {
+            tcg_out_mov(s, type, dest, arg2);
+            tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+        }
+        tcg_out_label(s, over, s->code_ptr);
+        tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
+    }
+}
+
 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
 {
     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
@@ -2024,34 +2099,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     OP_32_64(ctz):
-        if (const_args[2]) {
-            tcg_debug_assert(have_bmi1);
-            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
-            tcg_out_modrm(s, OPC_TZCNT + rexw, args[0], args[1]);
-        } else {
-            /* ??? The manual says that the output is undefined when the
-               input is zero, but real hardware leaves it unchanged.  As
-               noted in target-i386/translate.c, real programs depend on
-               this -- now we are one more of those.  */
-            tcg_out_modrm(s, OPC_BSF + rexw, args[0], args[1]);
-            if (args[0] != args[2]) {
-                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
-            }
-        }
+        tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
         break;
-
     OP_32_64(clz):
-        if (const_args[2]) {
-            tcg_debug_assert(have_bmi1);
-            tcg_debug_assert(args[2] == (rexw ? 64 : 32));
-            tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
-        } else {
-            /* ??? See above.  */
-            tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
-            if (args[0] != args[2]) {
-                tcg_out_cmov(s, TCG_COND_EQ, rexw, args[0], args[2]);
-            }
-        }
+        tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
         break;
 
     case INDEX_op_brcond_i32:
@@ -2281,7 +2332,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i32, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i32, { "r", "0", "ci" } },
     { INDEX_op_rotr_i32, { "r", "0", "ci" } },
-    { INDEX_op_clz_i32, { "r", "r", "rW" } },
+    { INDEX_op_clz_i32, { "r", "r", "ri" } },
     { INDEX_op_ctz_i32, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i32, { "r", "ri" } },
@@ -2344,7 +2395,7 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_sar_i64, { "r", "0", "Ci" } },
     { INDEX_op_rotl_i64, { "r", "0", "ci" } },
     { INDEX_op_rotr_i64, { "r", "0", "ci" } },
-    { INDEX_op_clz_i64, { "r", "r", "rW" } },
+    { INDEX_op_clz_i64, { "r", "r", "re" } },
     { INDEX_op_ctz_i64, { "r", "r", "rW" } },
 
     { INDEX_op_brcond_i64, { "r", "re" } },
@@ -2498,6 +2549,10 @@ static void tcg_target_init(TCGContext *s)
            need to probe for it.  */
         have_movbe = (c & bit_MOVBE) != 0;
 #endif
+#ifndef have_lzcnt
+        /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
+        have_lzcnt = (c & bit_LZCNT) != 0;
+#endif
     }
 
     if (max >= 7) {

  reply	other threads:[~2016-11-17 23:03 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-11-16 19:25 [Qemu-devel] [PATCH 00/25] tcg: Handle clz, ctz, and clrsb generically Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 01/25] tcg: Add clz and ctz opcodes Richard Henderson
2016-11-21 15:11   ` Alex Bennée
2016-11-21 16:05     ` Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 02/25] target-alpha: Use the ctz and clz opcodes Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 03/25] target-cris: Use clz opcode Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 04/25] target-microblaze: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 05/25] target-mips: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 06/25] target-openrisc: Use clz and ctz opcodes Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 07/25] target-ppc: " Richard Henderson
2016-11-17  3:09   ` David Gibson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 08/25] target-s390x: Use clz opcode Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 09/25] target-tilegx: Use clz and ctz opcodes Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 10/25] target-tricore: Use clz opcode Richard Henderson
2016-11-17 14:42   ` Bastian Koppelmann
2016-11-17 15:47     ` Bastian Koppelmann
2016-11-16 19:25 ` [Qemu-devel] [PATCH 11/25] target-unicore32: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 12/25] target-xtensa: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 13/25] target-arm: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 14/25] target-i386: Use clz and ctz opcodes Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 15/25] disas/i386.c: Handle tzcnt Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 16/25] tcg/i386: Handle ctz and clz opcodes Richard Henderson
2016-11-17 16:50   ` Bastian Koppelmann
2016-11-17 19:53     ` Richard Henderson
2016-11-17 19:59       ` Richard Henderson
2016-11-17 22:09         ` Bastian Koppelmann
2016-11-17 23:03           ` Richard Henderson [this message]
2016-11-18 12:48             ` Bastian Koppelmann
2016-11-21 10:37               ` Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 17/25] tcg/ppc: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 18/25] tcg/aarch64: " Richard Henderson
2016-11-17 11:53   ` Richard Henderson
2016-11-22 10:41     ` Alex Bennée
2016-11-16 19:25 ` [Qemu-devel] [PATCH 19/25] tcg/arm: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 20/25] tcg/mips: Handle clz opcode Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 21/25] tcg/s390: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 22/25] tcg: Add helpers for clrsb Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 23/25] target-arm: Use clrsb helper Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 24/25] target-tricore: " Richard Henderson
2016-11-16 19:25 ` [Qemu-devel] [PATCH 25/25] target-xtensa: " Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=f48d56ff-ab93-d3eb-97ed-d16c753c91f7@twiddle.net \
    --to=rth@twiddle.net \
    --cc=kbastian@mail.uni-paderborn.de \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.