From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: "Luis Pires" <luis.pires@eldorado.org.br>,
	"Alex Bennée" <alex.bennee@linaro.org>
Subject: [PULL 52/56] tcg/optimize: Optimize sign extensions
Date: Wed, 27 Oct 2021 19:41:27 -0700
Message-ID: <20211028024131.1492790-53-richard.henderson@linaro.org>
In-Reply-To: <20211028024131.1492790-1-richard.henderson@linaro.org>

Certain targets, like riscv, produce sign-extended 32-bit results.
This can lead to many redundant sign extensions as values are
manipulated.

Begin by tracking only the obvious sign extensions and
converting them to simple copies when possible.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Luis Pires <luis.pires@eldorado.org.br>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 123 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 102 insertions(+), 21 deletions(-)
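
Before the diff, some context on the new representation (a standalone
sketch, not part of the patch): s_mask is a left-aligned mask covering
the high bits guaranteed to equal the sign bit, where clrsb counts the
leading redundant sign bits, while z_mask marks the bits known to be
zero.  The C sketch below mirrors the two new helpers, with portable
stand-ins assumed for QEMU's clz64/clrsb64 from host-utils:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed stand-ins for QEMU's clz64/clrsb64 (host-utils),
       so the sketch compiles on its own with GCC or Clang. */
    static int clz64(uint64_t v)
    {
        return v ? __builtin_clzll(v) : 64;
    }

    static int clrsb64(uint64_t v)
    {
        /* Bits that differ from the sign bit; zero if none do. */
        uint64_t x = v ^ (uint64_t)((int64_t)v >> 63);
        return x ? clz64(x) - 1 : 63;
    }

    /* As in the patch: a left-aligned mask of clrsb(value) bits. */
    static uint64_t smask_from_value(uint64_t value)
    {
        return ~(~0ull >> clrsb64(value));
    }

    /* As in the patch: known zeros on the left make the remainder
       an unsigned field, so the signed field is one bit larger. */
    static uint64_t smask_from_zmask(uint64_t zmask)
    {
        int rep = clz64(zmask);
        return rep ? ~(~0ull >> (rep - 1)) : 0;
    }

    int main(void)
    {
        /* 0xffffffff80000000 has 32 redundant sign bits, so the
           s_mask covers bits 63..32. */
        assert(smask_from_value(0xffffffff80000000ull)
               == 0xffffffff00000000ull);

        /* An 8-bit zero-extended value (z_mask 0xff): bits 63..9
           must equal the known-zero sign bit, i.e. 55 bits. */
        assert(smask_from_zmask(0xff) == ~(~0ull >> 55));
        return 0;
    }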

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 7ac63c9231..ef202abbcb 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -43,6 +43,7 @@ typedef struct TempOptInfo {
     TCGTemp *next_copy;
     uint64_t val;
     uint64_t z_mask;  /* mask bit is 0 if and only if value bit is 0 */
+    uint64_t s_mask;  /* a left-aligned mask of clrsb(value) bits. */
 } TempOptInfo;
 
 typedef struct OptContext {
@@ -53,9 +54,37 @@ typedef struct OptContext {
     /* In flight values from optimization. */
     uint64_t a_mask;  /* mask bit is 0 iff value identical to first input */
     uint64_t z_mask;  /* mask bit is 0 iff value bit is 0 */
+    uint64_t s_mask;  /* mask of clrsb(value) bits */
     TCGType type;
 } OptContext;
 
+/* Calculate the smask for a specific value. */
+static uint64_t smask_from_value(uint64_t value)
+{
+    int rep = clrsb64(value);
+    return ~(~0ull >> rep);
+}
+
+/*
+ * Calculate the smask for a given set of known-zeros.
+ * If there are lots of zeros on the left, we can consider the remainder
+ * an unsigned field, and thus the corresponding signed field is one bit
+ * larger.
+ */
+static uint64_t smask_from_zmask(uint64_t zmask)
+{
+    /*
+     * Only the 0 bits are significant for zmask, thus the msb itself
+     * must be zero, else we have no sign information.
+     */
+    int rep = clz64(zmask);
+    if (rep == 0) {
+        return 0;
+    }
+    rep -= 1;
+    return ~(~0ull >> rep);
+}
+
 static inline TempOptInfo *ts_info(TCGTemp *ts)
 {
     return ts->state_ptr;
@@ -94,6 +123,7 @@ static void reset_ts(TCGTemp *ts)
     ti->prev_copy = ts;
     ti->is_const = false;
     ti->z_mask = -1;
+    ti->s_mask = 0;
 }
 
 static void reset_temp(TCGArg arg)
@@ -124,9 +154,11 @@ static void init_ts_info(OptContext *ctx, TCGTemp *ts)
         ti->is_const = true;
         ti->val = ts->val;
         ti->z_mask = ts->val;
+        ti->s_mask = smask_from_value(ts->val);
     } else {
         ti->is_const = false;
         ti->z_mask = -1;
+        ti->s_mask = 0;
     }
 }
 
@@ -220,6 +252,7 @@ static bool tcg_opt_gen_mov(OptContext *ctx, TCGOp *op, TCGArg dst, TCGArg src)
     op->args[1] = src;
 
     di->z_mask = si->z_mask;
+    di->s_mask = si->s_mask;
 
     if (src_ts->type == dst_ts->type) {
         TempOptInfo *ni = ts_info(si->next_copy);
@@ -658,13 +691,15 @@ static void finish_folding(OptContext *ctx, TCGOp *op)
 
     nb_oargs = def->nb_oargs;
     for (i = 0; i < nb_oargs; i++) {
-        reset_temp(op->args[i]);
+        TCGTemp *ts = arg_temp(op->args[i]);
+        reset_ts(ts);
         /*
-         * Save the corresponding known-zero bits mask for the
+         * Save the corresponding known-zero/sign bits mask for the
          * first output argument (only one supported so far).
          */
         if (i == 0) {
-            arg_info(op->args[i])->z_mask = ctx->z_mask;
+            ts_info(ts)->z_mask = ctx->z_mask;
+            ts_info(ts)->s_mask = ctx->s_mask;
         }
     }
 }
@@ -714,6 +749,7 @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
 {
     uint64_t a_mask = ctx->a_mask;
     uint64_t z_mask = ctx->z_mask;
+    uint64_t s_mask = ctx->s_mask;
 
     /*
      * 32-bit ops generate 32-bit results, which for the purpose of
@@ -725,7 +761,9 @@ static bool fold_masks(OptContext *ctx, TCGOp *op)
     if (ctx->type == TCG_TYPE_I32) {
         a_mask = (int32_t)a_mask;
         z_mask = (int32_t)z_mask;
+        s_mask |= MAKE_64BIT_MASK(32, 32);
         ctx->z_mask = z_mask;
+        ctx->s_mask = s_mask;
     }
 
     if (z_mask == 0) {
@@ -1072,7 +1110,7 @@ static bool fold_brcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_bswap(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask, sign;
+    uint64_t z_mask, s_mask, sign;
 
     if (arg_is_const(op->args[1])) {
         uint64_t t = arg_info(op->args[1])->val;
@@ -1082,6 +1120,7 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     }
 
     z_mask = arg_info(op->args[1])->z_mask;
+
     switch (op->opc) {
     case INDEX_op_bswap16_i32:
     case INDEX_op_bswap16_i64:
@@ -1100,6 +1139,7 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
     default:
         g_assert_not_reached();
     }
+    s_mask = smask_from_zmask(z_mask);
 
     switch (op->args[2] & (TCG_BSWAP_OZ | TCG_BSWAP_OS)) {
     case TCG_BSWAP_OZ:
@@ -1108,14 +1148,17 @@ static bool fold_bswap(OptContext *ctx, TCGOp *op)
         /* If the sign bit may be 1, force all the bits above to 1. */
         if (z_mask & sign) {
             z_mask |= sign;
+            s_mask = sign << 1;
         }
         break;
     default:
         /* The high bits are undefined: force all bits above the sign to 1. */
         z_mask |= sign << 1;
+        s_mask = 0;
         break;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
 
     return fold_masks(ctx, op);
 }
@@ -1263,21 +1306,24 @@ static bool fold_eqv(OptContext *ctx, TCGOp *op)
 static bool fold_extract(OptContext *ctx, TCGOp *op)
 {
     uint64_t z_mask_old, z_mask;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = extract64(t, op->args[2], op->args[3]);
+        t = extract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
     z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = extract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0) {
+    z_mask = extract64(z_mask_old, pos, len);
+    if (pos == 0) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
 
     return fold_masks(ctx, op);
 }
@@ -1303,14 +1349,16 @@ static bool fold_extract2(OptContext *ctx, TCGOp *op)
 
 static bool fold_exts(OptContext *ctx, TCGOp *op)
 {
-    uint64_t z_mask_old, z_mask, sign;
+    uint64_t s_mask_old, s_mask, z_mask, sign;
     bool type_change = false;
 
     if (fold_const1(ctx, op)) {
         return true;
     }
 
-    z_mask_old = z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = arg_info(op->args[1])->z_mask;
+    s_mask = arg_info(op->args[1])->s_mask;
+    s_mask_old = s_mask;
 
     switch (op->opc) {
     CASE_OP_32_64(ext8s):
@@ -1334,10 +1382,14 @@ static bool fold_exts(OptContext *ctx, TCGOp *op)
 
     if (z_mask & sign) {
         z_mask |= sign;
-    } else if (!type_change) {
-        ctx->a_mask = z_mask_old ^ z_mask;
     }
+    s_mask |= sign << 1;
+
     ctx->z_mask = z_mask;
+    ctx->s_mask = s_mask;
+    if (!type_change) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
 
     return fold_masks(ctx, op);
 }
@@ -1376,6 +1428,7 @@ static bool fold_extu(OptContext *ctx, TCGOp *op)
     }
 
     ctx->z_mask = z_mask;
+    ctx->s_mask = smask_from_zmask(z_mask);
     if (!type_change) {
         ctx->a_mask = z_mask_old ^ z_mask;
     }
@@ -1606,8 +1659,12 @@ static bool fold_qemu_ld(OptContext *ctx, TCGOp *op)
     MemOp mop = get_memop(oi);
     int width = 8 * memop_size(mop);
 
-    if (!(mop & MO_SIGN) && width < 64) {
-        ctx->z_mask = MAKE_64BIT_MASK(0, width);
+    if (width < 64) {
+        ctx->s_mask = MAKE_64BIT_MASK(width, 64 - width);
+        if (!(mop & MO_SIGN)) {
+            ctx->z_mask = MAKE_64BIT_MASK(0, width);
+            ctx->s_mask <<= 1;
+        }
     }
 
     /* Opcodes that touch guest memory stop the mb optimization.  */
@@ -1726,23 +1783,31 @@ static bool fold_setcond2(OptContext *ctx, TCGOp *op)
 
 static bool fold_sextract(OptContext *ctx, TCGOp *op)
 {
-    int64_t z_mask_old, z_mask;
+    uint64_t z_mask, s_mask, s_mask_old;
+    int pos = op->args[2];
+    int len = op->args[3];
 
     if (arg_is_const(op->args[1])) {
         uint64_t t;
 
         t = arg_info(op->args[1])->val;
-        t = sextract64(t, op->args[2], op->args[3]);
+        t = sextract64(t, pos, len);
         return tcg_opt_gen_movi(ctx, op, op->args[0], t);
     }
 
-    z_mask_old = arg_info(op->args[1])->z_mask;
-    z_mask = sextract64(z_mask_old, op->args[2], op->args[3]);
-    if (op->args[2] == 0 && z_mask >= 0) {
-        ctx->a_mask = z_mask_old ^ z_mask;
-    }
+    z_mask = arg_info(op->args[1])->z_mask;
+    z_mask = sextract64(z_mask, pos, len);
     ctx->z_mask = z_mask;
 
+    s_mask_old = arg_info(op->args[1])->s_mask;
+    s_mask = sextract64(s_mask_old, pos, len);
+    s_mask |= MAKE_64BIT_MASK(len, 64 - len);
+    ctx->s_mask = s_mask;
+
+    if (pos == 0) {
+        ctx->a_mask = s_mask & ~s_mask_old;
+    }
+
     return fold_masks(ctx, op);
 }
 
@@ -1819,14 +1884,26 @@ static bool fold_tcg_ld(OptContext *ctx, TCGOp *op)
 {
     /* We can't do any folding with a load, but we can record bits. */
     switch (op->opc) {
+    CASE_OP_32_64(ld8s):
+        ctx->s_mask = MAKE_64BIT_MASK(8, 56);
+        break;
     CASE_OP_32_64(ld8u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 8);
+        ctx->s_mask = MAKE_64BIT_MASK(9, 55);
+        break;
+    CASE_OP_32_64(ld16s):
+        ctx->s_mask = MAKE_64BIT_MASK(16, 48);
         break;
     CASE_OP_32_64(ld16u):
         ctx->z_mask = MAKE_64BIT_MASK(0, 16);
+        ctx->s_mask = MAKE_64BIT_MASK(17, 47);
+        break;
+    case INDEX_op_ld32s_i64:
+        ctx->s_mask = MAKE_64BIT_MASK(32, 32);
         break;
     case INDEX_op_ld32u_i64:
         ctx->z_mask = MAKE_64BIT_MASK(0, 32);
+        ctx->s_mask = MAKE_64BIT_MASK(33, 31);
         break;
     default:
         g_assert_not_reached();
@@ -1889,9 +1966,10 @@ void tcg_optimize(TCGContext *s)
             ctx.type = TCG_TYPE_I32;
         }
 
-        /* Assume all bits affected, and no bits known zero. */
+        /* Assume all bits affected, no bits known zero, no sign reps. */
         ctx.a_mask = -1;
         ctx.z_mask = -1;
+        ctx.s_mask = 0;
 
         /*
          * Process each opcode.
@@ -1964,8 +2042,11 @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_extrh_i64_i32:
             done = fold_extu(&ctx, op);
             break;
+        CASE_OP_32_64(ld8s):
         CASE_OP_32_64(ld8u):
+        CASE_OP_32_64(ld16s):
         CASE_OP_32_64(ld16u):
+        case INDEX_op_ld32s_i64:
         case INDEX_op_ld32u_i64:
             done = fold_tcg_ld(&ctx, op);
             break;
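
As a worked illustration of the payoff (again a sketch, not part of
the patch): feed a value that is already known sign-extended from 32
bits, such as an ld32s_i64 result, into ext32s_i64.  Assuming
fold_exts sets sign = INT32_MIN for ext32s_i64 (the assignment is in
the elided context above), no bit of the result can differ from the
input, so a_mask collapses to zero and fold_masks turns the extension
into a plain mov:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* ld32s_i64 (fold_tcg_ld above): bits 63..32 mirror bit 31. */
        uint64_t s_mask_old = 0xffffffff00000000ull;

        /* fold_exts for ext32s_i64: sign = INT32_MIN (assumed). */
        uint64_t sign = (uint64_t)(int64_t)INT32_MIN;
        uint64_t s_mask = s_mask_old | (sign << 1);

        /* a_mask covers bits the extension could newly force; it is
           zero here, so the op is replaced by a copy. */
        assert((s_mask & ~s_mask_old) == 0);
        return 0;
    }

The qemu_ld masks follow the same arithmetic; a quick check, with a
local stand-in assumed for QEMU's MAKE_64BIT_MASK(shift, length):

    #include <assert.h>
    #include <stdint.h>

    #define MAKE_64BIT_MASK(s, n) ((~0ull >> (64 - (n))) << (s))

    int main(void)
    {
        int width = 8;  /* an 8-bit load */

        /* Signed load: bits 63..8 replicate bit 7. */
        uint64_t s_mask = MAKE_64BIT_MASK(width, 64 - width);
        assert(s_mask == 0xffffffffffffff00ull);

        /* Unsigned load: bits 63..8 are known zero, and the sign
           field gains one bit, matching smask_from_zmask(0xff). */
        uint64_t z_mask = MAKE_64BIT_MASK(0, width);
        s_mask <<= 1;
        assert(z_mask == 0xff && s_mask == 0xfffffffffffffe00ull);
        return 0;
    }
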
-- 
2.25.1




Thread overview: 58+ messages
2021-10-28  2:40 [PULL 00/56] tcg patch queue Richard Henderson
2021-10-28  2:40 ` [PULL 01/56] qemu/int128: Add int128_{not,xor} Richard Henderson
2021-10-28  2:40 ` [PULL 02/56] host-utils: move checks out of divu128/divs128 Richard Henderson
2021-10-28  2:40 ` [PULL 03/56] host-utils: move udiv_qrnnd() to host-utils Richard Henderson
2021-10-28  2:40 ` [PULL 04/56] host-utils: add 128-bit quotient support to divu128/divs128 Richard Henderson
2021-10-28  2:40 ` [PULL 05/56] host-utils: add unit tests for divu128/divs128 Richard Henderson
2021-10-28  2:40 ` [PULL 06/56] tcg/optimize: Rename "mask" to "z_mask" Richard Henderson
2021-10-28  2:40 ` [PULL 07/56] tcg/optimize: Split out OptContext Richard Henderson
2021-10-28  2:40 ` [PULL 08/56] tcg/optimize: Remove do_default label Richard Henderson
2021-10-28  2:40 ` [PULL 09/56] tcg/optimize: Change tcg_opt_gen_{mov,movi} interface Richard Henderson
2021-10-28  2:40 ` [PULL 10/56] tcg/optimize: Move prev_mb into OptContext Richard Henderson
2021-10-28  2:40 ` [PULL 11/56] tcg/optimize: Split out init_arguments Richard Henderson
2021-10-28  2:40 ` [PULL 12/56] tcg/optimize: Split out copy_propagate Richard Henderson
2021-10-28  2:40 ` [PULL 13/56] tcg/optimize: Split out fold_call Richard Henderson
2021-10-28  2:40 ` [PULL 14/56] tcg/optimize: Drop nb_oargs, nb_iargs locals Richard Henderson
2021-10-28  2:40 ` [PULL 15/56] tcg/optimize: Change fail return for do_constant_folding_cond* Richard Henderson
2021-10-28  2:40 ` [PULL 16/56] tcg/optimize: Return true from tcg_opt_gen_{mov,movi} Richard Henderson
2021-10-28  2:40 ` [PULL 17/56] tcg/optimize: Split out finish_folding Richard Henderson
2021-10-28  2:40 ` [PULL 18/56] tcg/optimize: Use a boolean to avoid a mass of continues Richard Henderson
2021-10-28  2:40 ` [PULL 19/56] tcg/optimize: Split out fold_mb, fold_qemu_{ld,st} Richard Henderson
2021-10-28  2:40 ` [PULL 20/56] tcg/optimize: Split out fold_const{1,2} Richard Henderson
2021-10-28  2:40 ` [PULL 21/56] tcg/optimize: Split out fold_setcond2 Richard Henderson
2021-10-28  2:40 ` [PULL 22/56] tcg/optimize: Split out fold_brcond2 Richard Henderson
2021-10-28  2:40 ` [PULL 23/56] tcg/optimize: Split out fold_brcond Richard Henderson
2021-10-28  2:40 ` [PULL 24/56] tcg/optimize: Split out fold_setcond Richard Henderson
2021-10-28  2:41 ` [PULL 25/56] tcg/optimize: Split out fold_mulu2_i32 Richard Henderson
2021-10-28  2:41 ` [PULL 26/56] tcg/optimize: Split out fold_addsub2_i32 Richard Henderson
2021-10-28  2:41 ` [PULL 27/56] tcg/optimize: Split out fold_movcond Richard Henderson
2021-10-28  2:41 ` [PULL 28/56] tcg/optimize: Split out fold_extract2 Richard Henderson
2021-10-28  2:41 ` [PULL 29/56] tcg/optimize: Split out fold_extract, fold_sextract Richard Henderson
2021-10-28  2:41 ` [PULL 30/56] tcg/optimize: Split out fold_deposit Richard Henderson
2021-10-28  2:41 ` [PULL 31/56] tcg/optimize: Split out fold_count_zeros Richard Henderson
2021-10-28  2:41 ` [PULL 32/56] tcg/optimize: Split out fold_bswap Richard Henderson
2021-10-28  2:41 ` [PULL 33/56] tcg/optimize: Split out fold_dup, fold_dup2 Richard Henderson
2021-10-28  2:41 ` [PULL 34/56] tcg/optimize: Split out fold_mov Richard Henderson
2021-10-28  2:41 ` [PULL 35/56] tcg/optimize: Split out fold_xx_to_i Richard Henderson
2021-10-28  2:41 ` [PULL 36/56] tcg/optimize: Split out fold_xx_to_x Richard Henderson
2021-10-28  2:41 ` [PULL 37/56] tcg/optimize: Split out fold_xi_to_i Richard Henderson
2021-10-28  2:41 ` [PULL 38/56] tcg/optimize: Add type to OptContext Richard Henderson
2021-10-28  2:41 ` [PULL 39/56] tcg/optimize: Split out fold_to_not Richard Henderson
2021-10-28  2:41 ` [PULL 40/56] tcg/optimize: Split out fold_sub_to_neg Richard Henderson
2021-10-28  2:41 ` [PULL 41/56] tcg/optimize: Split out fold_xi_to_x Richard Henderson
2021-10-28  2:41 ` [PULL 42/56] tcg/optimize: Split out fold_ix_to_i Richard Henderson
2021-10-28  2:41 ` [PULL 43/56] tcg/optimize: Split out fold_masks Richard Henderson
2021-10-28  2:41 ` [PULL 44/56] tcg/optimize: Expand fold_mulu2_i32 to all 4-arg multiplies Richard Henderson
2021-10-28  2:41 ` [PULL 45/56] tcg/optimize: Expand fold_addsub2_i32 to 64-bit ops Richard Henderson
2021-10-28  2:41 ` [PULL 46/56] tcg/optimize: Sink commutative operand swapping into fold functions Richard Henderson
2021-10-28  2:41 ` [PULL 47/56] tcg/optimize: Stop forcing z_mask to "garbage" for 32-bit values Richard Henderson
2021-10-28  2:41 ` [PULL 48/56] tcg/optimize: Use fold_xx_to_i for orc Richard Henderson
2021-10-28  2:41 ` [PULL 49/56] tcg/optimize: Use fold_xi_to_x for mul Richard Henderson
2021-10-28  2:41 ` [PULL 50/56] tcg/optimize: Use fold_xi_to_x for div Richard Henderson
2021-10-28  2:41 ` [PULL 51/56] tcg/optimize: Use fold_xx_to_i for rem Richard Henderson
2021-10-28  2:41 ` Richard Henderson [this message]
2021-10-28  2:41 ` [PULL 53/56] tcg/optimize: Propagate sign info for logical operations Richard Henderson
2021-10-28  2:41 ` [PULL 54/56] tcg/optimize: Propagate sign info for setcond Richard Henderson
2021-10-28  2:41 ` [PULL 55/56] tcg/optimize: Propagate sign info for bit counting Richard Henderson
2021-10-28  2:41 ` [PULL 56/56] tcg/optimize: Propagate sign info for shifting Richard Henderson
2021-10-28 14:51 ` [PULL 00/56] tcg patch queue Richard Henderson
