Re: [PATCH v3 09/30] tcg/loongarch64: Implement tcg_out_mov and tcg_out_movi

From: Richard Henderson <richard.henderson@linaro.org>
To: WANG Xuerui <git@xen0n.name>, qemu-devel@nongnu.org
Cc: "Peter Maydell" <peter.maydell@linaro.org>,
	"Philippe Mathieu-Daudé" <f4bug@amsat.org>,
	"Laurent Vivier" <laurent@vivier.eu>
Subject: Re: [PATCH v3 09/30] tcg/loongarch64: Implement tcg_out_mov and tcg_out_movi
Date: Thu, 23 Sep 2021 09:50:46 -0700	[thread overview]
Message-ID: <5ace7b10-b7de-46e2-2021-01129024ffe2@linaro.org> (raw)
In-Reply-To: <20210922180927.666273-10-git@xen0n.name>

On 9/22/21 11:09 AM, WANG Xuerui wrote:

Following up on my previous message, I suggest:

> +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
> +                         tcg_target_long val)
> +{
> +    if (type == TCG_TYPE_I32) {
> +        val = (int32_t)val;
> +    }
> +
> +    /* Single-instruction cases.  */
> +    tcg_target_long low = sextreg(val, 0, 12);
> +    if (low == val) {
> +        /* val fits in simm12: addi.w rd, zero, val */
> +        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
> +        return;
> +    }
> +    if (0x800 <= val && val <= 0xfff) {
> +        /* val fits in uimm12: ori rd, zero, val */
> +        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
> +        return;
> +    }

> +    /* Test for PC-relative values that can be loaded faster.  */
> +    intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
> +    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) {
> +        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
> +        return;
> +    }

     /* Handle all 32-bit constants. */
     if (val == (int32_t)val) {
         tcg_out_opc_lu12i_w(s, rd, val >> 12);
         if (low) {
             tcg_out_opc_ori(s, rd, rd, val & 0xfff);
         }
         return;
     }

     /* Handle pc-relative values requiring 2 instructions. */
     intptr_t pc_lo = sextract64(pc_offset, 0, 12);
     intptr_t pc_hi = pc_offset - pc_lo;
     if (pc_hi == (int32_t)pc_hi) {
         tcg_out_opc_pcaddu12i(s, rd, pc_hi >> 12);
         tcg_out_opc_addi_d(s, rd, rd, pc_lo);
         return;
     }

     /*
      * Choose signed low part if bit 13 is also set,
      * which gives us a chance of making more zeros.
      * Otherwise, let low be unsigned.
      */
     if ((val & 0x1800) != 0x1800) {
         low = val & 0xfff;
     }
     val -= low;

     tcg_target_long hi20 = sextract64(val, 12, 20);
     tcg_target_long hi32 = sextract64(val, 32, 20);
     tcg_target_long hi52 = sextract64(val, 52, 12);

     /*
      * If we can use the sign-extension of a previous
      * operation, suppress higher -1.
      */
     if (hi32 < 0 && hi52 == -1) {
         hi52 = 0;
     }
     if (hi20 < 0 && hi32 == -1) {
         hi32 = 0;
     }

     /* Initialize RD with the least non-zero component. */
     if (hi20) {
         tcg_out_opc_lu12i_w(s, rd, hi20 >> 12);
     } else if (hi32) {
         /* CU32I_D modifies RD in place, so RD must be initialized. */
         if (low < 0) {
             tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, low);
         } else {
             tcg_out_opc_ori(s, rd, TCG_REG_ZERO, low);
         }
         low = 0;
     } else {
         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
         hi52 = 0;
     }

     /* Assume that lu12i + ori are fusable */
     if (low > 0) {
         tcg_out_opc_ori(s, rd, rd, low);
     }

     /* Set the high 32 bits */
     if (hi32) {
         tcg_out_opc_cu32i_d(s, rd, hi32);
     }
     if (hi52) {
         tcg_out_opc_cu52i_d(s, rd, rd, hi52);
     }

     /*
      * Note that any subtraction must come last,
      * because cu32i and cu52i overwrite high bits,
      * and we have computed them as val - low.
      */
     if (low < 0) {
         tcg_out_opc_addi_d(s, rd, rd, low);
     }

Untested, and all bugs are mine, of course.

Try "qemu-system-ppc64 -D z -d in_asm,op_opt,out_asm".
You should see some masking constants like

  ---- 000000001daf2898
  and_i64 CA,r9,$0x7fffffffffffffff        dead: 2  pref=0xffff

   cu52i.d rd, zero, 0x800
   addi.d  rd, rd, -1

  ---- 000000001db0775c
  mov_i64 r26,$0x300000002                 sync: 0  dead: 0 1  pref=0xffff

   ori     rd, zero, 2
   cu32i.d rd, 3

r~