All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Alex Bennée" <alex.bennee@linaro.org>
To: qemu-devel@nongnu.org
Cc: Alistair.Francis@wdc.com
Subject: Re: [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers
Date: Fri, 30 Nov 2018 16:16:00 +0000	[thread overview]
Message-ID: <87tvjy35j3.fsf@linaro.org> (raw)
In-Reply-To: <20181123144558.5048-9-richard.henderson@linaro.org>


Richard Henderson <richard.henderson@linaro.org> writes:

> This is an incremental step toward moving the qemu_ld/st
> code sequence out of line.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.inc.c | 203 +++++++++++++++++++++++++++++++-------
>  1 file changed, 169 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 07df4b2b12..50e5dc31b3 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -171,6 +171,80 @@ static bool have_lzcnt;
>
>  static tcg_insn_unit *tb_ret_addr;
>
> +typedef enum {
> +    ARG_ADDR,
> +    ARG_STVAL,
> +    ARG_LDVAL,
> +} QemuMemArgType;
> +
> +#ifdef CONFIG_SOFTMMU
> +/*
> + * Constraint to choose a particular register.  This is used for softmmu
> + * loads and stores.  Registers with no assignment get an empty string.
> + */
> +static const char * const one_reg_constraint[TCG_TARGET_NB_REGS] = {
> +    [TCG_REG_EAX] = "a",
> +    [TCG_REG_EBX] = "b",
> +    [TCG_REG_ECX] = "c",
> +    [TCG_REG_EDX] = "d",
> +    [TCG_REG_ESI] = "S",
> +    [TCG_REG_EDI] = "D",
> +#if TCG_TARGET_REG_BITS == 64
> +    [TCG_REG_R8]  = "E",
> +    [TCG_REG_R9]  = "N",
> +#endif
> +};
> +
> +/*
> + * Calling convention for the softmmu load and store thunks.
> + *
> + * For 64-bit, we mostly use the host calling convention, therefore the
> + * real first argument is reserved for the ENV parameter that is passed
> + * on to the slow path helpers.
> + *
> + * For 32-bit, the host calling convention is stack based; we invent a
> + * private convention that uses 4 of the 6 available host registers.
> + * We reserve EAX and EDX as temporaries for use by the thunk, we require
> + * INDEX_op_qemu_st_i32 to have a 'q' register from which to store, and
> + * further complicate this last by wanting a call-clobbered for that store.
> + * The 'q' requirement allows MO_8 stores at all; the call-clobbered part
> + * allows bswap to operate in-place, clobbering the input.
> + */
> +static TCGReg softmmu_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    switch (type) {
> +    case ARG_ADDR:
> +        tcg_debug_assert(!hi || TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[1];
> +        } else {
> +            return hi ? TCG_REG_EDI : TCG_REG_ESI;
> +        }
> +    case ARG_STVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[2];
> +        } else {
> +            return hi ? TCG_REG_EBX : TCG_REG_ECX;
> +        }
> +    case ARG_LDVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        return tcg_target_call_oarg_regs[hi];
> +    }
> +    g_assert_not_reached();
> +}
> +
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return one_reg_constraint[softmmu_arg(type, is_64, hi)];
> +}
> +#else
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return "L";
> +}
> +#endif /* CONFIG_SOFTMMU */
> +
>  static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
>                          intptr_t value, intptr_t addend)
>  {
> @@ -1680,11 +1754,15 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>         copies the entire guest address for the slow path, while truncation
>         for the 32-bit host happens with the fastpath ADDL below.  */
>      if (TCG_TARGET_REG_BITS == 64) {
> -        base = tcg_target_call_iarg_regs[1];
> +        tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
> +        if (TARGET_LONG_BITS == 32) {
> +            tcg_out_ext32u(s, addrlo, addrlo);
> +        }
> +        base = addrlo;
>      } else {
>          base = r1;
> +        tcg_out_mov(s, ttype, base, addrlo);
>      }
> -    tcg_out_mov(s, ttype, base, addrlo);
>
>      /* jne slow_path */
>      tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> @@ -2009,16 +2087,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>     common. */
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;
>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];

Swapping to i = -1 and pre-increment indexing seems a little unnatural here
compared to the more normal i = 0 and i++, unless there was a specific
reason to have i in the range of 2-4?

> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2027,6 +2111,15 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>          tcg_insn_unit *label_ptr[2];
>          TCGReg base;
>
> +        tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
> +        if (TCG_TARGET_REG_BITS == 32 && is64) {
> +            tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
> +        }
> +        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +        }
> +
>          base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                                  label_ptr, offsetof(CPUTLBEntry, addr_read));
>
> @@ -2149,16 +2242,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>
>  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;

And again here

>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2167,6 +2266,15 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>          tcg_insn_unit *label_ptr[2];
>          TCGReg base;
>
> +        tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
> +        if (TCG_TARGET_REG_BITS == 32 && is64) {
> +            tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
> +        }
> +        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +        }
> +
>          base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                                  label_ptr, offsetof(CPUTLBEntry, addr_write));
>
> @@ -2836,15 +2944,6 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
>      static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
>      static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
> -    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
> -    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
> -    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
> -    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L_L
> -        = { .args_ct_str = { "r", "r", "L", "L" } };
> -    static const TCGTargetOpDef L_L_L_L
> -        = { .args_ct_str = { "L", "L", "L", "L" } };
>      static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
>      static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
>      static const TCGTargetOpDef x_x_x_x
> @@ -3026,17 +3125,53 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>          }
>
>      case INDEX_op_qemu_ld_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
> -    case INDEX_op_qemu_st_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
> +        {
> +            static TCGTargetOpDef ld32;
> +            int i;
> +
> +            ld32.args_ct_str[0] = constrain_memop_arg(ARG_LDVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS;
>      ++i) {

This formulation is a bit weird to follow; at first I thought it was
going to overflow TCG_MAX_OP_ARGS until I realised what it was doing.
Maybe a helper function would be a little clearer:

               for (src_reg = 0; src_reg < tcg_target_regs_for(TARGET_LONG_BITS); ++src_reg)

Same comment applies below.

> +                ld32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld32;
> +        }
>      case INDEX_op_qemu_ld_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
> -                : &r_r_L_L);
> +        {
> +            static TCGTargetOpDef ld64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_LDVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld64;
> +        }
> +    case INDEX_op_qemu_st_i32:
> +        {
> +            static TCGTargetOpDef st32;
> +            int i;
> +
> +            st32.args_ct_str[0] = constrain_memop_arg(ARG_STVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st32;
> +        }
>      case INDEX_op_qemu_st_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &L_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
> -                : &L_L_L_L);
> +        {
> +            static TCGTargetOpDef st64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_STVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st64;
> +        }
>
>      case INDEX_op_brcond2_i32:
>          {


--
Alex Bennée

  reply	other threads:[~2018-11-30 16:16 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-23 14:45 [Qemu-devel] [PATCH for-4.0 v2 00/37] tcg: Assorted cleanups Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 01/37] tcg/i386: Always use %ebp for TCG_AREG0 Richard Henderson
2018-11-29 12:52   ` Alex Bennée
2018-11-29 14:55     ` Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 02/37] tcg/i386: Move TCG_REG_CALL_STACK from define to enum Richard Henderson
2018-11-29 12:52   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 03/37] tcg: Return success from patch_reloc Richard Henderson
2018-11-29 14:47   ` Alex Bennée
2018-11-29 17:35     ` Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 04/37] tcg: Add TCG_TARGET_NEED_LDST_OOL_LABELS Richard Henderson
2018-11-26  0:31   ` Emilio G. Cota
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 05/37] tcg/i386: Add constraints for r8 and r9 Richard Henderson
2018-11-29 15:00   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 06/37] tcg/i386: Return a base register from tcg_out_tlb_load Richard Henderson
2018-11-29 16:34   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 07/37] tcg/i386: Change TCG_REG_L[01] to not overlap function arguments Richard Henderson
2018-11-29 17:13   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers Richard Henderson
2018-11-30 16:16   ` Alex Bennée [this message]
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 09/37] tcg/i386: Use TCG_TARGET_NEED_LDST_OOL_LABELS Richard Henderson
2018-11-30 17:22   ` Alex Bennée
2018-11-30 17:37     ` Richard Henderson
2018-11-30 17:52       ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 10/37] tcg/aarch64: Add constraints for x0, x1, x2 Richard Henderson
2018-11-30 17:25   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 11/37] tcg/aarch64: Parameterize the temps for tcg_out_tlb_read Richard Henderson
2018-11-30 17:50   ` Alex Bennée
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 12/37] tcg/aarch64: Parameterize the temp for tcg_out_goto_long Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 13/37] tcg/aarch64: Use B not BL " Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 14/37] tcg/aarch64: Use TCG_TARGET_NEED_LDST_OOL_LABELS Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 15/37] tcg/arm: Parameterize the temps for tcg_out_tlb_read Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 16/37] tcg/arm: Add constraints for R0-R5 Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 17/37] tcg/arm: Reduce the number of temps for tcg_out_tlb_read Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 18/37] tcg/arm: Force qemu_ld/st arguments into fixed registers Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 19/37] tcg/arm: Use TCG_TARGET_NEED_LDST_OOL_LABELS Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 20/37] tcg/ppc: Parameterize the temps for tcg_out_tlb_read Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 21/37] tcg/ppc: Split out tcg_out_call_int Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 22/37] tcg/ppc: Add constraints for R7-R8 Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 23/37] tcg/ppc: Change TCG_TARGET_CALL_ALIGN_ARGS to bool Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 24/37] tcg/ppc: Force qemu_ld/st arguments into fixed registers Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 25/37] tcg/ppc: Use TCG_TARGET_NEED_LDST_OOL_LABELS Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 26/37] tcg: Clean up generic bswap32 Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 27/37] tcg: Clean up generic bswap64 Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 28/37] tcg/optimize: Optimize bswap Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 29/37] tcg: Add TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 30/37] tcg/i386: Adjust TCG_TARGET_HAS_MEMORY_BSWAP Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 31/37] tcg/aarch64: Set TCG_TARGET_HAS_MEMORY_BSWAP to false Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 32/37] tcg/arm: Set TCG_TARGET_HAS_MEMORY_BSWAP to false for user-only Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 33/37] tcg/i386: Propagate is64 to tcg_out_qemu_ld_direct Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 34/37] tcg/i386: Restrict user-only qemu_st_i32 values to q-regs Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 35/37] tcg/i386: Add setup_guest_base_seg for FreeBSD Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 36/37] tcg/i386: Require segment syscalls to succeed Richard Henderson
2018-11-23 14:45 ` [Qemu-devel] [PATCH for-4.0 v2 37/37] tcg/i386: Remove L constraint Richard Henderson
2018-11-23 21:04 ` [Qemu-devel] [PATCH for-4.0 v2 00/37] tcg: Assorted cleanups no-reply
2018-11-26  0:30 ` Emilio G. Cota

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87tvjy35j3.fsf@linaro.org \
    --to=alex.bennee@linaro.org \
    --cc=Alistair.Francis@wdc.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.