Hi Richard, On Tue, Feb 09, 2016 at 09:40:02PM +1100, Richard Henderson wrote: > Using compact branches, when possible, avoids a delay slot nop. > > Signed-off-by: Richard Henderson > --- > include/elf.h | 4 + > tcg/mips/tcg-target.c | 216 +++++++++++++++++++++++++++++++++++++++----------- > 2 files changed, 172 insertions(+), 48 deletions(-) > > diff --git a/include/elf.h b/include/elf.h > index 1098d21..6e52ba0 100644 > --- a/include/elf.h > +++ b/include/elf.h > @@ -352,6 +352,10 @@ typedef struct { > #define R_MIPS_CALLHI16 30 > #define R_MIPS_CALLLO16 31 > /* > + * Incomplete list of MIPS R6 relocation types. > + */ > +#define R_MIPS_PC26_S2 61 > +/* > * This range is reserved for vendor specific relocations. > */ > #define R_MIPS_LOVENDOR 100 > diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c > index e0972ba..06e15d4 100644 > --- a/tcg/mips/tcg-target.c > +++ b/tcg/mips/tcg-target.c > @@ -152,6 +152,19 @@ static inline void reloc_pc16(tcg_insn_unit *pc, tcg_insn_unit *target) > *pc = deposit32(*pc, 0, 16, reloc_pc16_val(pc, target)); > } > > +static inline uint32_t reloc_pc26_val(tcg_insn_unit *pc, tcg_insn_unit *target) > +{ > + /* Let the compiler perform the right-shift as part of the arithmetic. */ > + ptrdiff_t disp = target - (pc + 1); > + assert(disp == sextract32(disp, 0, 26)); > + return disp & 0x1ffffff; > +} > + > +static inline void reloc_pc26(tcg_insn_unit *pc, tcg_insn_unit *target) > +{ > + *pc = deposit32(*pc, 0, 26, reloc_pc16_val(pc, target)); > +} > + > static inline uint32_t reloc_26_val(tcg_insn_unit *pc, tcg_insn_unit *target) > { > assert((((uintptr_t)pc ^ (uintptr_t)target) & 0xf0000000) == 0); > @@ -166,9 +179,17 @@ static inline void reloc_26(tcg_insn_unit *pc, tcg_insn_unit *target) > static void patch_reloc(tcg_insn_unit *code_ptr, int type, > intptr_t value, intptr_t addend) > { > - assert(type == R_MIPS_PC16); > assert(addend == 0); > - reloc_pc16(code_ptr, (tcg_insn_unit *)value); > + switch (type) { > + case R_MIPS_PC16: > + reloc_pc16(code_ptr, (tcg_insn_unit *)value); > + break; > + case R_MIPS_PC26_S2: > + reloc_pc26(code_ptr, (tcg_insn_unit *)value); > + break; > + default: > + tcg_abort(); > + } > } > > #define TCG_CT_CONST_ZERO 0x100 > @@ -309,7 +330,10 @@ typedef enum { > OPC_BEQ = 004 << 26, > OPC_BNE = 005 << 26, > OPC_BLEZ = 006 << 26, > + OPC_BGEUC = OPC_BLEZ, /* R6: rs != 0, rt != 0, rs != rt */ > OPC_BGTZ = 007 << 26, > + OPC_BLTUC = OPC_BGTZ, /* R6: rs != 0, rt != 0, rs != rt */ > + OPC_BEQC = 010 << 26, /* R6: rs > rt */ > OPC_ADDIU = 011 << 26, > OPC_SLTI = 012 << 26, > OPC_SLTIU = 013 << 26, > @@ -318,6 +342,9 @@ typedef enum { > OPC_XORI = 016 << 26, > OPC_LUI = 017 << 26, > OPC_AUI = OPC_LUI, > + OPC_BGEC = 026 << 26, > + OPC_BLTC = 027 << 26, > + OPC_BNEC = 030 << 26, /* R6: rs > rt */ > OPC_DADDIU = 031 << 26, > OPC_DAUI = 035 << 26, > OPC_LB = 040 << 26, > @@ -329,6 +356,7 @@ typedef enum { > OPC_SB = 050 << 26, > OPC_SH = 051 << 26, > OPC_SW = 053 << 26, > + OPC_BC = 062 << 26, > OPC_LD = 067 << 26, > OPC_SD = 077 << 26, > > @@ -527,6 +555,17 @@ static inline void tcg_out_opc_br(TCGContext *s, MIPSInsn opc, > tcg_out_opc_imm(s, opc, rt, rs, offset); > } > > +static void tcg_out_opc_br_pc16(TCGContext *s, MIPSInsn opc, > + TCGReg arg1, TCGReg arg2, TCGLabel *l) > +{ > + tcg_out_opc_br(s, opc, arg1, arg2); > + if (l->has_value) { > + reloc_pc16(s->code_ptr - 1, l->u.value_ptr); > + } else { > + tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0); > + } > +} > + > /* > * Type sa > */ > @@ -1002,59 +1041,129 @@ static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1, > [TCG_COND_GE] = OPC_BGEZ, > }; > > - MIPSInsn s_opc = OPC_SLTU; > - MIPSInsn b_opc; > - int cmp_map; > + MIPSInsn b_opc = 0; > + bool compact = false; > + int cmp_map, t; > + > + /* We shouldn't expect to have arg1 == arg2, as the TCG optimizer > + should have eliminated all such. However, the R6 encodings do > + not allow this situation, so e.g. if the optimizer is disabled > + we must fall back to normal compares. */ > + if (use_mips32r6_instructions && arg1 != arg2) { > + switch (cond) { > + case TCG_COND_EQ: > + case TCG_COND_NE: > + if (arg1 < arg2) { > + t = arg1, arg1 = arg2, arg2 = t; > + } > + b_opc = cond == TCG_COND_EQ ? OPC_BEQC : OPC_BNEC; > + compact = true; > + break; > > - switch (cond) { > - case TCG_COND_EQ: > - b_opc = OPC_BEQ; > - break; > - case TCG_COND_NE: > - b_opc = OPC_BNE; > - break; > + case TCG_COND_LE: > + case TCG_COND_GT: > + if (arg1 == TCG_REG_ZERO) { > + break; > + } > + /* Swap arguments to turn LE to GE or GT to LT. > + This also produces BLEZC/BGTZC when arg2 = 0. */ > + t = arg1, arg1 = arg2, arg2 = t; > + b_opc = cond == TCG_COND_LE ? OPC_BGEC : OPC_BLTC; > + compact = true; > + break; > > - case TCG_COND_LT: > - case TCG_COND_GT: > - case TCG_COND_LE: > - case TCG_COND_GE: > - if (arg2 == 0) { > - b_opc = b_zero[cond]; > - arg2 = arg1; > - arg1 = 0; > + case TCG_COND_GE: > + case TCG_COND_LT: > + if (arg1 == TCG_REG_ZERO) { > + break; > + } > + /* The encoding of BGEZC/BLTZC requires rs = rt. */ > + if (arg2 == TCG_REG_ZERO) { > + arg2 = arg1; > + } > + b_opc = cond == TCG_COND_GE ? OPC_BGEC : OPC_BLTC; > + compact = true; > break; > - } > - s_opc = OPC_SLT; > - /* FALLTHRU */ > > - case TCG_COND_LTU: > - case TCG_COND_GTU: > - case TCG_COND_LEU: > - case TCG_COND_GEU: > - cmp_map = mips_cmp_map[cond]; > - if (cmp_map & MIPS_CMP_SWAP) { > - TCGReg t = arg1; > - arg1 = arg2; > - arg2 = t; > + case TCG_COND_LEU: > + /* Swap arguments to turn LE to GE. */ > + t = arg1, arg1 = arg2, arg2 = t; > + /* FALLTHRU */ > + case TCG_COND_GEU: > + b_opc = OPC_BGEUC; > + compact = true; > + break; > + > + case TCG_COND_GTU: > + /* Swap arguments to turn GT to LT. */ > + t = arg1, arg1 = arg2, arg2 = t; > + /* FALLTHRU */ > + case TCG_COND_LTU: > + b_opc = OPC_BLTUC; > + compact = true; > + break; > + > + default: > + tcg_abort(); > + break; > } > - tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2); > - b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE); > - arg1 = TCG_TMP0; > - arg2 = TCG_REG_ZERO; > - break; > + } > > - default: > - tcg_abort(); > - break; > + if (b_opc == 0) { > + MIPSInsn s_opc = OPC_SLTU; > + > + switch (cond) { > + case TCG_COND_EQ: > + b_opc = OPC_BEQ; > + break; > + case TCG_COND_NE: > + b_opc = OPC_BNE; > + break; > + > + case TCG_COND_LT: > + case TCG_COND_GT: > + case TCG_COND_LE: > + case TCG_COND_GE: > + if (arg2 == 0) { > + b_opc = b_zero[cond]; > + arg2 = arg1; > + arg1 = 0; > + break; > + } > + s_opc = OPC_SLT; > + /* FALLTHRU */ > + > + case TCG_COND_LTU: > + case TCG_COND_GTU: > + case TCG_COND_LEU: > + case TCG_COND_GEU: > + cmp_map = mips_cmp_map[cond]; > + if (cmp_map & MIPS_CMP_SWAP) { > + TCGReg t = arg1; > + arg1 = arg2; > + arg2 = t; > + } > + tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2); > + if (use_mips32r6_instructions) { > + b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQC : OPC_BNEC); > + compact = true; > + } else { > + b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE); > + } > + arg1 = TCG_TMP0; > + arg2 = TCG_REG_ZERO; > + break; > + > + default: > + tcg_abort(); > + break; > + } > } > > - tcg_out_opc_br(s, b_opc, arg1, arg2); > - if (l->has_value) { > - reloc_pc16(s->code_ptr - 1, l->u.value_ptr); > - } else { > - tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0); > + tcg_out_opc_br_pc16(s, b_opc, arg1, arg2, l); > + if (!compact) { > + tcg_out_nop(s); Unfortunately this isn't quite right. As far as I understand them, conditional compact branches have a forbidden slot after them which isn't permitted to contain a control transfer instruction (CTI). Executing a conditional compact branch with a CTI in the forbidden slot is required to signal a reserved instruction, but only if the branch is not taken (giving user process a SIGILL). E.g. Program received signal SIGILL, Illegal instruction. [Switching to Thread 0xfff1c32e00 (LWP 204)] 0x000000fff30b0068 in code_gen_buffer () (gdb) disas/r Dump of assembler code for function code_gen_buffer: 0x000000fff30b0064 <+0>: f8 ff 11 8e lw s1,-8(s0) => 0x000000fff30b0068 <+4>: 08 00 11 60 bnezalc s1,0xfff30b008c 0x000000fff30b006c <+8>: 1d c0 c2 08 j 0xfff30b0074 0x000000fff30b0070 <+12>: 00 00 00 00 nop (gdb) set *0x000000fff30b006c=0 (gdb) disas/r Dump of assembler code for function code_gen_buffer: 0x000000fff30b0064 <+0>: f8 ff 11 8e lw s1,-8(s0) => 0x000000fff30b0068 <+4>: 08 00 11 60 bnezalc s1,0xfff30b008c 0x000000fff30b006c <+8>: 00 00 00 00 nop 0x000000fff30b0070 <+12>: 00 00 00 00 nop (gdb) stepi 0x000000fff30b0070 in code_gen_buffer () (gdb) disas/r Dump of assembler code for function code_gen_buffer: 0x000000fff30b0064 <+0>: f8 ff 11 8e lw s1,-8(s0) 0x000000fff30b0068 <+4>: 08 00 11 60 bnezalc s1,0xfff30b008c 0x000000fff30b006c <+8>: 00 00 00 00 nop => 0x000000fff30b0070 <+12>: 00 00 00 00 nop So to be correct + efficient, it should only put the nop in if the next generated instruction is a CTI. I imagine that would be a bit messy / fragile, but maybe doable? I haven't looked too deeply. Cheers James > } > - tcg_out_nop(s); > } > > static TCGReg tcg_out_reduce_eq2(TCGContext *s, TCGReg tmp0, TCGReg tmp1, > @@ -1826,8 +1935,19 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, > s->tb_next_offset[a0] = tcg_current_code_size(s); > break; > case INDEX_op_br: > - tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO, > - arg_label(a0)); > + { > + TCGLabel *l = arg_label(a0); > + if (use_mips32r6_instructions) { > + tcg_out32(s, OPC_BC); > + if (l->has_value) { > + reloc_pc26(s->code_ptr - 1, l->u.value_ptr); > + } else { > + tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC26_S2, l, 0); > + } > + } else { > + tcg_out_opc_br_pc16(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, l); > + } > + } > break; > > case INDEX_op_ld8u_i32: > -- > 2.5.0 >