From: Alex Bennée
Subject: Re: [Qemu-devel] [PATCH for-4.0 v2 08/37] tcg/i386: Force qemu_ld/st arguments into fixed registers
Date: Fri, 30 Nov 2018 16:16:00 +0000
Message-ID: <87tvjy35j3.fsf@linaro.org>
In-reply-to: <20181123144558.5048-9-richard.henderson@linaro.org>
References: <20181123144558.5048-1-richard.henderson@linaro.org> <20181123144558.5048-9-richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: Alistair.Francis@wdc.com

Richard Henderson writes:

> This is an incremental step toward moving the qemu_ld/st
> code sequence out of line.
>
> Signed-off-by: Richard Henderson
> ---
>  tcg/i386/tcg-target.inc.c | 203 +++++++++++++++++++++++++++++++-------
>  1 file changed, 169 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 07df4b2b12..50e5dc31b3 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -171,6 +171,80 @@ static bool have_lzcnt;
>
>  static tcg_insn_unit *tb_ret_addr;
>
> +typedef enum {
> +    ARG_ADDR,
> +    ARG_STVAL,
> +    ARG_LDVAL,
> +} QemuMemArgType;
> +
> +#ifdef CONFIG_SOFTMMU
> +/*
> + * Constraint to choose a particular register.  This is used for softmmu
> + * loads and stores.  Registers with no assignment get an empty string.
> + */
> +static const char * const one_reg_constraint[TCG_TARGET_NB_REGS] = {
> +    [TCG_REG_EAX] = "a",
> +    [TCG_REG_EBX] = "b",
> +    [TCG_REG_ECX] = "c",
> +    [TCG_REG_EDX] = "d",
> +    [TCG_REG_ESI] = "S",
> +    [TCG_REG_EDI] = "D",
> +#if TCG_TARGET_REG_BITS == 64
> +    [TCG_REG_R8]  = "E",
> +    [TCG_REG_R9]  = "N",
> +#endif
> +};
> +
> +/*
> + * Calling convention for the softmmu load and store thunks.
> + *
> + * For 64-bit, we mostly use the host calling convention, therefore the
> + * real first argument is reserved for the ENV parameter that is passed
> + * on to the slow path helpers.
> + *
> + * For 32-bit, the host calling convention is stack based; we invent a
> + * private convention that uses 4 of the 6 available host registers.
> + * We reserve EAX and EDX as temporaries for use by the thunk, we require
> + * INDEX_op_qemu_st_i32 to have a 'q' register from which to store, and
> + * further complicate this last by wanting a call-clobbered for that store.
> + * The 'q' requirement allows MO_8 stores at all; the call-clobbered part
> + * allows bswap to operate in-place, clobbering the input.
> + */
> +static TCGReg softmmu_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    switch (type) {
> +    case ARG_ADDR:
> +        tcg_debug_assert(!hi || TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[1];
> +        } else {
> +            return hi ? TCG_REG_EDI : TCG_REG_ESI;
> +        }
> +    case ARG_STVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[2];
> +        } else {
> +            return hi ? TCG_REG_EBX : TCG_REG_ECX;
> +        }
> +    case ARG_LDVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        return tcg_target_call_oarg_regs[hi];
> +    }
> +    g_assert_not_reached();
> +}
> +
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return one_reg_constraint[softmmu_arg(type, is_64, hi)];
> +}
> +#else
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return "L";
> +}
> +#endif /* CONFIG_SOFTMMU */
> +
>  static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
>                          intptr_t value, intptr_t addend)
>  {
> @@ -1680,11 +1754,15 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>         copies the entire guest address for the slow path, while truncation
>         for the 32-bit host happens with the fastpath ADDL below.  */
>      if (TCG_TARGET_REG_BITS == 64) {
> -        base = tcg_target_call_iarg_regs[1];
> +        tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
> +        if (TARGET_LONG_BITS == 32) {
> +            tcg_out_ext32u(s, addrlo, addrlo);
> +        }
> +        base = addrlo;
>      } else {
>          base = r1;
> +        tcg_out_mov(s, ttype, base, addrlo);
>      }
> -    tcg_out_mov(s, ttype, base, addrlo);
>
>      /* jne slow_path */
>      tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> @@ -2009,16 +2087,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>     common. */
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;
>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];

Swapping to i = -1 and pre-increment indexing seems a little unnatural
here compared to the more usual i = 0 and i++, unless there was a
specific reason to have i end up in the range 2-4?
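i.e. something like this (untested, just the same assignments with
post-increment):

    int i = 0;

    datalo = args[i++];
    if (TCG_TARGET_REG_BITS == 32 && is64) {
        datahi = args[i++];
    }
    addrlo = args[i++];
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        addrhi = args[i++];
    }
    oi = args[i++];

which leaves i as the number of arguments consumed rather than the
index of the last one.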
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2027,6 +2111,15 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>      tcg_insn_unit *label_ptr[2];
>      TCGReg base;
>
> +    tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
> +    }
> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +    }
> +
>      base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                              label_ptr, offsetof(CPUTLBEntry, addr_read));
>
> @@ -2149,16 +2242,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>
>  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;

And again here.

>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2167,6 +2266,15 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>      tcg_insn_unit *label_ptr[2];
>      TCGReg base;
>
> +    tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
> +    }
> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +    }
> +
>      base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                              label_ptr, offsetof(CPUTLBEntry, addr_write));
>
> @@ -2836,15 +2944,6 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
>      static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
>      static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
> -    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
> -    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
> -    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
> -    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L_L
> -        = { .args_ct_str = { "r", "r", "L", "L" } };
> -    static const TCGTargetOpDef L_L_L_L
> -        = { .args_ct_str = { "L", "L", "L", "L" } };
>      static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
>      static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
>      static const TCGTargetOpDef x_x_x_x
>          = { .args_ct_str = { "x", "x", "x", "x" } };
> @@ -3026,17 +3125,53 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      }
>
>      case INDEX_op_qemu_ld_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
> -    case INDEX_op_qemu_st_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
> +        {
> +            static TCGTargetOpDef ld32;
> +            int i;
> +
> +            ld32.args_ct_str[0] = constrain_memop_arg(ARG_LDVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {

This formulation is a bit weird to follow; at first I thought it was
going to overflow TCG_MAX_OP_ARGS until I realised what it was doing.
Maybe a helper function would be a little clearer:

    for (src_reg = 0; src_reg < tcg_target_regs_for(TARGET_LONG_BITS); ++src_reg)
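where tcg_target_regs_for() would just be the ceiling division,
something like (untested, name and implementation only a sketch):

    /* Number of host registers needed to hold a value of the given bit
       width, i.e. ceil(bits / TCG_TARGET_REG_BITS).  */
    static inline int tcg_target_regs_for(int bits)
    {
        return (bits + TCG_TARGET_REG_BITS - 1) / TCG_TARGET_REG_BITS;
    }

which also makes it obvious each loop runs at most twice, nowhere near
TCG_MAX_OP_ARGS.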
Same comment applies below.

> +                ld32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld32;
> +        }
>      case INDEX_op_qemu_ld_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
> -                : &r_r_L_L);
> +        {
> +            static TCGTargetOpDef ld64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_LDVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld64;
> +        }
> +    case INDEX_op_qemu_st_i32:
> +        {
> +            static TCGTargetOpDef st32;
> +            int i;
> +
> +            st32.args_ct_str[0] = constrain_memop_arg(ARG_STVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st32;
> +        }
>      case INDEX_op_qemu_st_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &L_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
> -                : &L_L_L_L);
> +        {
> +            static TCGTargetOpDef st64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_STVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st64;
> +        }
>
>      case INDEX_op_brcond2_i32:
>          {

--
Alex Bennée