All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions
@ 2023-08-04 21:33 Richard Henderson
  2023-08-04 21:33 ` [PATCH 1/7] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
                   ` (7 more replies)
  0 siblings, 8 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

Emit one 64-bit instruction for large constants and pc-relatives.
With pc-relative addressing, we don't need REG_TB, which means we
can re-enable direct branching for goto_tb.


r~


Richard Henderson (7):
  tcg/ppc: Untabify tcg-target.c.inc
  tcg/ppc: Use PADDI in tcg_out_movi
  tcg/ppc: Use prefixed instructions in tcg_out_mem_long
  tcg/ppc: Use PLD in tcg_out_movi for constant pool
  tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec
  tcg/ppc: Disable USE_REG_TB for Power v3.1
  tcg/ppc: Use prefixed instructions for tcg_out_goto_tb

 tcg/ppc/tcg-target.c.inc | 233 +++++++++++++++++++++++++++++++++++----
 1 file changed, 211 insertions(+), 22 deletions(-)

-- 
2.34.1



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/7] tcg/ppc: Untabify tcg-target.c.inc
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-04 21:33 ` [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 511e14b180..642d0fd128 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -221,7 +221,7 @@ static inline bool in_range_b(tcg_target_long target)
 }
 
 static uint32_t reloc_pc24_val(const tcg_insn_unit *pc,
-			       const tcg_insn_unit *target)
+                               const tcg_insn_unit *target)
 {
     ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
     tcg_debug_assert(in_range_b(disp));
@@ -241,7 +241,7 @@ static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
 }
 
 static uint16_t reloc_pc14_val(const tcg_insn_unit *pc,
-			       const tcg_insn_unit *target)
+                               const tcg_insn_unit *target)
 {
     ptrdiff_t disp = tcg_ptr_byte_diff(target, pc);
     tcg_debug_assert(disp == (int16_t) disp);
@@ -3587,7 +3587,7 @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
                   tcgv_vec_arg(t1), tcgv_vec_arg(t2));
         vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0),
                   tcgv_vec_arg(v0), tcgv_vec_arg(t1));
-	break;
+        break;
 
     case MO_32:
         tcg_debug_assert(!have_isa_2_07);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
  2023-08-04 21:33 ` [PATCH 1/7] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-07  3:53   ` Jordan Niethe
  2023-08-04 21:33 ` [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

PADDI can load 34-bit immediates and 34-bit pc-relative addresses.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 47 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 642d0fd128..7fa2a2500b 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -707,6 +707,33 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
     return true;
 }
 
+/* Ensure that the prefixed instruction does not cross a 64-byte boundary. */
+static bool tcg_out_need_prefix_align(TCGContext *s)
+{
+    return ((uintptr_t)s->code_ptr & 0x3f) == 0x3c;
+}
+
+static void tcg_out_prefix_align(TCGContext *s)
+{
+    if (tcg_out_need_prefix_align(s)) {
+        tcg_out32(s, NOP);
+    }
+}
+
+/* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
+static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
+                          unsigned ra, tcg_target_long imm, bool r)
+{
+    tcg_insn_unit p, i;
+
+    p = OPCD(1) | (2 << 24) | (r << 20) | ((imm >> 16) & 0x3ffff);
+    i = opc | TAI(rt, ra, imm);
+
+    tcg_out_prefix_align(s);
+    tcg_out32(s, p);
+    tcg_out32(s, i);
+}
+
 static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
                              TCGReg base, tcg_target_long offset);
 
@@ -992,6 +1019,26 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
         return;
     }
 
+    /*
+     * Load values up to 34 bits, and pc-relative addresses,
+     * with one prefixed insn.
+     */
+    if (have_isa_3_10) {
+        if (arg == sextract64(arg, 0, 34)) {
+            /* pli ret,value = paddi ret,0,value,0 */
+            tcg_out_mls_d(s, ADDI, ret, 0, arg, 0);
+            return;
+        }
+
+        tmp = tcg_out_need_prefix_align(s) * 4;
+        tmp = tcg_pcrel_diff(s, (void *)arg) - tmp;
+        if (tmp == sextract64(tmp, 0, 34)) {
+            /* pla ret,value = paddi ret,0,value,1 */
+            tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1);
+            return;
+        }
+    }
+
     /* Load 32-bit immediates with two insns.  Note that we've already
        eliminated bare ADDIS, so we know both insns are required.  */
     if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) {
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
  2023-08-04 21:33 ` [PATCH 1/7] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
  2023-08-04 21:33 ` [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-07  3:51   ` Jordan Niethe
  2023-08-04 21:33 ` [PATCH 4/7] tcg/ppc: Use PLD in tcg_out_movi for constant pool Richard Henderson
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

When the offset is out of range of the non-prefixed insn, but
fits the 34-bit immediate of the prefixed insn, use that.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 66 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 7fa2a2500b..d41c499b7d 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -323,6 +323,15 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define STDX   XO31(149)
 #define STQ    XO62(  2)
 
+#define PLWA   OPCD( 41)
+#define PLD    OPCD( 57)
+#define PLXSD  OPCD( 42)
+#define PLXV   OPCD(25 * 2 + 1)  /* force tx=1 */
+
+#define PSTD   OPCD( 61)
+#define PSTXSD OPCD( 46)
+#define PSTXV  OPCD(27 * 2 + 1)  /* force tx=1 */
+
 #define ADDIC  OPCD( 12)
 #define ADDI   OPCD( 14)
 #define ADDIS  OPCD( 15)
@@ -720,6 +729,20 @@ static void tcg_out_prefix_align(TCGContext *s)
     }
 }
 
+/* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */
+static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
+                          unsigned ra, tcg_target_long imm, bool r)
+{
+    tcg_insn_unit p, i;
+
+    p = OPCD(1) | (r << 20) | ((imm >> 16) & 0x3ffff);
+    i = opc | TAI(rt, ra, imm);
+
+    tcg_out_prefix_align(s);
+    tcg_out32(s, p);
+    tcg_out32(s, i);
+}
+
 /* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
 static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
                           unsigned ra, tcg_target_long imm, bool r)
@@ -1364,6 +1387,49 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
         break;
     }
 
+    /* For unaligned or large offsets, use the prefixed form. */
+    if (have_isa_3_10
+        && (offset != (int16_t)offset || (offset & align))
+        && offset == sextract64(offset, 0, 34)) {
+        /*
+         * Note that the MLS:D insns retain their un-prefixed opcode,
+         * while the 8LS:D insns use a different opcode space.
+         */
+        switch (opi) {
+        case LBZ:
+        case LHZ:
+        case LHA:
+        case LWZ:
+        case STB:
+        case STH:
+        case STW:
+        case ADDI:
+            tcg_out_mls_d(s, opi, rt, base, offset, 0);
+            return;
+        case LWA:
+            tcg_out_8ls_d(s, PLWA, rt, base, offset, 0);
+            return;
+        case LD:
+            tcg_out_8ls_d(s, PLD, rt, base, offset, 0);
+            return;
+        case STD:
+            tcg_out_8ls_d(s, PSTD, rt, base, offset, 0);
+            return;
+        case LXSD:
+            tcg_out_8ls_d(s, PLXSD, rt & 31, base, offset, 0);
+            return;
+        case STXSD:
+            tcg_out_8ls_d(s, PSTXSD, rt & 31, base, offset, 0);
+            return;
+        case LXV:
+            tcg_out_8ls_d(s, PLXV, rt & 31, base, offset, 0);
+            return;
+        case STXV:
+            tcg_out_8ls_d(s, PSTXV, rt & 31, base, offset, 0);
+            return;
+        }
+    }
+
     /* For unaligned, or very large offsets, use the indexed form.  */
     if (offset & align || offset != (int32_t)offset || opi == 0) {
         if (rs == base) {
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/7] tcg/ppc: Use PLD in tcg_out_movi for constant pool
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
                   ` (2 preceding siblings ...)
  2023-08-04 21:33 ` [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-04 21:33 ` [PATCH 5/7] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec Richard Henderson
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

The prefixed instruction has a pc-relative form to use here.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index d41c499b7d..a9e48a51c8 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -101,6 +101,10 @@
 #define ALL_GENERAL_REGS  0xffffffffu
 #define ALL_VECTOR_REGS   0xffffffff00000000ull
 
+#ifndef R_PPC64_PCREL34
+#define R_PPC64_PCREL34  132
+#endif
+
 #define have_isel  (cpuinfo & CPUINFO_ISEL)
 
 #ifndef CONFIG_SOFTMMU
@@ -260,6 +264,19 @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
     return false;
 }
 
+static bool reloc_pc34(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
+{
+    const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
+    ptrdiff_t disp = tcg_ptr_byte_diff(target, src_rx);
+
+    if (disp == sextract64(disp, 0, 34)) {
+        src_rw[0] = (src_rw[0] & ~0x3ffff) | ((disp >> 16) & 0x3ffff);
+        src_rw[1] = (src_rw[1] & ~0xffff) | (disp & 0xffff);
+        return true;
+    }
+    return false;
+}
+
 /* test if a constant matches the constraint */
 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 {
@@ -684,6 +701,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
         return reloc_pc14(code_ptr, target);
     case R_PPC_REL24:
         return reloc_pc24(code_ptr, target);
+    case R_PPC64_PCREL34:
+        return reloc_pc34(code_ptr, target);
     case R_PPC_ADDR16:
         /*
          * We are (slightly) abusing this relocation type.  In particular,
@@ -1107,6 +1126,11 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     }
 
     /* Use the constant pool, if possible.  */
+    if (have_isa_3_10) {
+        tcg_out_8ls_d(s, PLD, ret, 0, 0, 1);
+        new_pool_label(s, arg, R_PPC64_PCREL34, s->code_ptr - 2, 0);
+        return;
+    }
     if (!in_prologue && USE_REG_TB) {
         new_pool_label(s, arg, R_PPC_ADDR16, s->code_ptr,
                        tcg_tbrel_diff(s, NULL));
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/7] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
                   ` (3 preceding siblings ...)
  2023-08-04 21:33 ` [PATCH 4/7] tcg/ppc: Use PLD in tcg_out_movi for constant pool Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-04 21:33 ` [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1 Richard Henderson
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

The prefixed instructions have a pc-relative form to use here.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index a9e48a51c8..e8eced7cf3 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -1191,6 +1191,18 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
     /*
      * Otherwise we must load the value from the constant pool.
      */
+
+    if (have_isa_3_10) {
+        if (type == TCG_TYPE_V64) {
+            tcg_out_8ls_d(s, PLXSD, ret & 31, 0, 0, 1);
+            new_pool_label(s, val, R_PPC64_PCREL34, s->code_ptr - 2, 0);
+        } else {
+            tcg_out_8ls_d(s, PLXV, ret & 31, 0, 0, 1);
+            new_pool_l2(s, R_PPC64_PCREL34, s->code_ptr - 2, 0, val, val);
+        }
+        return;
+    }
+
     if (USE_REG_TB) {
         rel = R_PPC_ADDR16;
         add = tcg_tbrel_diff(s, NULL);
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
                   ` (4 preceding siblings ...)
  2023-08-04 21:33 ` [PATCH 5/7] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-06 11:58   ` Nicholas Piggin
  2023-08-04 21:33 ` [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb Richard Henderson
  2023-08-06 11:55 ` [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Nicholas Piggin
  7 siblings, 1 reply; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

With Power v3.1, we have pc-relative addressing and so
do not require a register holding the current TB.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index e8eced7cf3..5b243b2353 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -83,7 +83,7 @@
 #define TCG_VEC_TMP2    TCG_REG_V1
 
 #define TCG_REG_TB     TCG_REG_R31
-#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
+#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64 && !have_isa_3_10)
 
 /* Shorthand for size of a pointer.  Avoid promotion to unsigned.  */
 #define SZP  ((int)sizeof(void *))
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
                   ` (5 preceding siblings ...)
  2023-08-04 21:33 ` [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1 Richard Henderson
@ 2023-08-04 21:33 ` Richard Henderson
  2023-08-06 12:55   ` Nicholas Piggin
  2023-08-07  4:08   ` Jordan Niethe
  2023-08-06 11:55 ` [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Nicholas Piggin
  7 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-04 21:33 UTC (permalink / raw)
  To: qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

When a direct branch is out of range, we can load the destination for
the indirect branch using PLA (for 16GB worth of buffer) and PLD from
the TranslationBlock for everything larger.

This means the patch affects exactly one instruction: B (plus filler),
PLA or PLD.  Which means we can update and execute the patch atomically.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 5b243b2353..47c71bb5f2 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
     uintptr_t ptr = get_jmp_target_addr(s, which);
 
     if (USE_REG_TB) {
+        /*
+         * With REG_TB, we must always use indirect branching,
+         * so that the branch destination and TCG_REG_TB match.
+         */
         ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
         tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
-    
-        /* TODO: Use direct branches when possible. */
-        set_jmp_insn_offset(s, which);
         tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
-
         tcg_out32(s, BCCTR | BO_ALWAYS);
 
         /* For the unlinked case, need to reset TCG_REG_TB.  */
         set_jmp_reset_offset(s, which);
         tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
                          -tcg_current_code_size(s));
+        return;
+    }
+
+    if (have_isa_3_10) {
+        /* Align, so that we can patch 8 bytes atomically. */
+        if ((uintptr_t)s->code_ptr & 7) {
+            tcg_out32(s, NOP);
+        }
+        set_jmp_insn_offset(s, which);
+        /* Direct branch will be patched by tb_target_set_jmp_target. */
+        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
     } else {
         /* Direct branch will be patched by tb_target_set_jmp_target. */
-        set_jmp_insn_offset(s, which);
-        tcg_out32(s, NOP);
-
+        tcg_out32(s, B);
         /* When branch is out of range, fall through to indirect. */
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
-        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
-        tcg_out32(s, BCCTR | BO_ALWAYS);
-        set_jmp_reset_offset(s, which);
     }
+
+    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
+    tcg_out32(s, BCCTR | BO_ALWAYS);
+    set_jmp_reset_offset(s, which);
 }
 
 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
@@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
 {
     uintptr_t addr = tb->jmp_target_addr[n];
     intptr_t diff = addr - jmp_rx;
-    tcg_insn_unit insn;
 
     if (USE_REG_TB) {
         return;
     }
 
-    if (in_range_b(diff)) {
-        insn = B | (diff & 0x3fffffc);
-    } else {
-        insn = NOP;
-    }
+    if (have_isa_3_10) {
+        tcg_insn_unit insn1, insn2;
+        uint64_t pair;
 
-    qatomic_set((uint32_t *)jmp_rw, insn);
-    flush_idcache_range(jmp_rx, jmp_rw, 4);
+        if (in_range_b(diff)) {
+            insn1 = B | (diff & 0x3fffffc);
+            insn2 = NOP;
+        } else if (diff == sextract64(diff, 0, 34)) {
+            /* PLA tmp1, diff */
+            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
+            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
+        } else {
+            addr = (uintptr_t)&tb->jmp_target_addr[n];
+            diff = addr - jmp_rx;
+            tcg_debug_assert(diff == sextract64(diff, 0, 34));
+            /* PLD tmp1, diff */
+            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
+            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
+        }
+
+        if (HOST_BIG_ENDIAN) {
+            pair = ((uint64_t)insn1) << 32 | insn2;
+        } else {
+            pair = ((uint64_t)insn2) << 32 | insn1;
+        }
+
+        qatomic_set((uint64_t *)jmp_rw, pair);
+        flush_idcache_range(jmp_rx, jmp_rw, 8);
+    } else {
+        tcg_insn_unit insn;
+
+        if (in_range_b(diff)) {
+            insn = B | (diff & 0x3fffffc);
+        } else {
+            insn = NOP;
+        }
+        qatomic_set((uint32_t *)jmp_rw, insn);
+        flush_idcache_range(jmp_rx, jmp_rw, 4);
+    }
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
-- 
2.34.1



^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions
  2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
                   ` (6 preceding siblings ...)
  2023-08-04 21:33 ` [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb Richard Henderson
@ 2023-08-06 11:55 ` Nicholas Piggin
  2023-08-06 14:22   ` Richard Henderson
  7 siblings, 1 reply; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-06 11:55 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> Emit one 64-bit instruction for large constants and pc-relatives.
> With pc-relative addressing, we don't need REG_TB, which means we
> can re-enable direct branching for goto_tb.

Very cool. I tested this on a POWER10 and it has been stable so
far. Using mttcg with SMP 8 machine, with this patch, a parallel
gcc compile job in a Linux distro is 11.5% faster. Is such a big
improvement expected?

For the series,

Tested-by: Nicholas Piggin <npiggin@gmail.com>

Thanks,
Nick


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1
  2023-08-04 21:33 ` [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1 Richard Henderson
@ 2023-08-06 11:58   ` Nicholas Piggin
  2023-08-06 13:45     ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-06 11:58 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

Patch 7 is not required for this?

Thanks,
Nick

On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> With Power v3.1, we have pc-relative addressing and so
> do not require a register holding the current TB.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index e8eced7cf3..5b243b2353 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -83,7 +83,7 @@
>  #define TCG_VEC_TMP2    TCG_REG_V1
>  
>  #define TCG_REG_TB     TCG_REG_R31
> -#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
> +#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64 && !have_isa_3_10)
>  
>  /* Shorthand for size of a pointer.  Avoid promotion to unsigned.  */
>  #define SZP  ((int)sizeof(void *))



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-04 21:33 ` [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb Richard Henderson
@ 2023-08-06 12:55   ` Nicholas Piggin
  2023-08-06 14:13     ` Richard Henderson
  2023-08-07  4:08   ` Jordan Niethe
  1 sibling, 1 reply; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-06 12:55 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> When a direct branch is out of range, we can load the destination for
> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> the TranslationBlock for everything larger.
>
> This means the patch affects exactly one instruction: B (plus filler),
> PLA or PLD.  Which means we can update and execute the patch atomically.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>  1 file changed, 58 insertions(+), 18 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 5b243b2353..47c71bb5f2 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>      uintptr_t ptr = get_jmp_target_addr(s, which);
>  
>      if (USE_REG_TB) {
> +        /*
> +         * With REG_TB, we must always use indirect branching,
> +         * so that the branch destination and TCG_REG_TB match.
> +         */
>          ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>          tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> -    
> -        /* TODO: Use direct branches when possible. */
> -        set_jmp_insn_offset(s, which);
>          tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> -
>          tcg_out32(s, BCCTR | BO_ALWAYS);
>  
>          /* For the unlinked case, need to reset TCG_REG_TB.  */
>          set_jmp_reset_offset(s, which);
>          tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>                           -tcg_current_code_size(s));
> +        return;
> +    }
> +
> +    if (have_isa_3_10) {
> +        /* Align, so that we can patch 8 bytes atomically. */
> +        if ((uintptr_t)s->code_ptr & 7) {
> +            tcg_out32(s, NOP);
> +        }
> +        set_jmp_insn_offset(s, which);
> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>      } else {
>          /* Direct branch will be patched by tb_target_set_jmp_target. */
> -        set_jmp_insn_offset(s, which);
> -        tcg_out32(s, NOP);
> -
> +        tcg_out32(s, B);
>          /* When branch is out of range, fall through to indirect. */
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>          tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> -        set_jmp_reset_offset(s, which);
>      }
> +
> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> +    set_jmp_reset_offset(s, which);
>  }
>  
>  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>  {
>      uintptr_t addr = tb->jmp_target_addr[n];
>      intptr_t diff = addr - jmp_rx;
> -    tcg_insn_unit insn;
>  
>      if (USE_REG_TB) {
>          return;
>      }
>  
> -    if (in_range_b(diff)) {
> -        insn = B | (diff & 0x3fffffc);
> -    } else {
> -        insn = NOP;
> -    }
> +    if (have_isa_3_10) {
> +        tcg_insn_unit insn1, insn2;
> +        uint64_t pair;
>  
> -    qatomic_set((uint32_t *)jmp_rw, insn);
> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> +        if (in_range_b(diff)) {
> +            insn1 = B | (diff & 0x3fffffc);
> +            insn2 = NOP;
> +        } else if (diff == sextract64(diff, 0, 34)) {
> +            /* PLA tmp1, diff */
> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> +        } else {
> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> +            diff = addr - jmp_rx;
> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> +            /* PLD tmp1, diff */
> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> +        }

B is a "patch class" word instruction as per CMODX in the ISA, which may
be patched to/from other instructions without a flush+isync sequence
between. So that part is okay, at least if you were just patching the B
word. But patching between the PLA and PLD I don't think is kosher per
ISA.

I struggle a bit with this part of the ISA, particularly with prefix
instructions (it only talks about patching 4 bytes at a time).

If we patch something it has to go through a patch instruction, which
is a direct branch, trap, or nop. I think that makes this non-trivial.

It could work if you only patched between B and PLD. B->PLD would have
to patch the suffix word first, possibly with an interleaving sync, and
then the prefix. PLD->B could just patch the B word.

How much would losing the PLA hurt?

Thanks,
Nick

> +
> +        if (HOST_BIG_ENDIAN) {
> +            pair = ((uint64_t)insn1) << 32 | insn2;
> +        } else {
> +            pair = ((uint64_t)insn2) << 32 | insn1;
> +        }
> +
> +        qatomic_set((uint64_t *)jmp_rw, pair);
> +        flush_idcache_range(jmp_rx, jmp_rw, 8);
> +    } else {
> +        tcg_insn_unit insn;
> +
> +        if (in_range_b(diff)) {
> +            insn = B | (diff & 0x3fffffc);
> +        } else {
> +            insn = NOP;
> +        }
> +        qatomic_set((uint32_t *)jmp_rw, insn);
> +        flush_idcache_range(jmp_rx, jmp_rw, 4);
> +    }
>  }
>  
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1
  2023-08-06 11:58   ` Nicholas Piggin
@ 2023-08-06 13:45     ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-06 13:45 UTC (permalink / raw)
  To: Nicholas Piggin, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On 8/6/23 04:58, Nicholas Piggin wrote:
> Patch 7 is not reqired for this?

No, USE_REG_TB has never been *required*.

The fallback path through tcg_out_movi will generate the 5 insn sequence for a full 64-bit 
constant load.  Mitigating that here is that patch 1 will emit PADDI for most of those 
constants.


r~

> 
> Thanks,
> Nick
> 
> On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
>> With Power v3.1, we have pc-relative addressing and so
>> do not require a register holding the current TB.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/ppc/tcg-target.c.inc | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
>> index e8eced7cf3..5b243b2353 100644
>> --- a/tcg/ppc/tcg-target.c.inc
>> +++ b/tcg/ppc/tcg-target.c.inc
>> @@ -83,7 +83,7 @@
>>   #define TCG_VEC_TMP2    TCG_REG_V1
>>   
>>   #define TCG_REG_TB     TCG_REG_R31
>> -#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
>> +#define USE_REG_TB     (TCG_TARGET_REG_BITS == 64 && !have_isa_3_10)
>>   
>>   /* Shorthand for size of a pointer.  Avoid promotion to unsigned.  */
>>   #define SZP  ((int)sizeof(void *))
> 



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-06 12:55   ` Nicholas Piggin
@ 2023-08-06 14:13     ` Richard Henderson
  2023-08-07  1:51       ` Nicholas Piggin
  2023-08-07  7:29       ` Nicholas Piggin
  0 siblings, 2 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-06 14:13 UTC (permalink / raw)
  To: Nicholas Piggin, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On 8/6/23 05:55, Nicholas Piggin wrote:
> On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
>> When a direct branch is out of range, we can load the destination for
>> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
>> the TranslationBlock for everything larger.
>>
>> This means the patch affects exactly one instruction: B (plus filler),
>> PLA or PLD.  Which means we can update and execute the patch atomically.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>>   1 file changed, 58 insertions(+), 18 deletions(-)
>>
>> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
>> index 5b243b2353..47c71bb5f2 100644
>> --- a/tcg/ppc/tcg-target.c.inc
>> +++ b/tcg/ppc/tcg-target.c.inc
>> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>>       uintptr_t ptr = get_jmp_target_addr(s, which);
>>   
>>       if (USE_REG_TB) {
>> +        /*
>> +         * With REG_TB, we must always use indirect branching,
>> +         * so that the branch destination and TCG_REG_TB match.
>> +         */
>>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
>> -
>> -        /* TODO: Use direct branches when possible. */
>> -        set_jmp_insn_offset(s, which);
>>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
>> -
>>           tcg_out32(s, BCCTR | BO_ALWAYS);
>>   
>>           /* For the unlinked case, need to reset TCG_REG_TB.  */
>>           set_jmp_reset_offset(s, which);
>>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>>                            -tcg_current_code_size(s));
>> +        return;
>> +    }
>> +
>> +    if (have_isa_3_10) {
>> +        /* Align, so that we can patch 8 bytes atomically. */
>> +        if ((uintptr_t)s->code_ptr & 7) {
>> +            tcg_out32(s, NOP);
>> +        }
>> +        set_jmp_insn_offset(s, which);
>> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
>> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>>       } else {
>>           /* Direct branch will be patched by tb_target_set_jmp_target. */
>> -        set_jmp_insn_offset(s, which);
>> -        tcg_out32(s, NOP);
>> -
>> +        tcg_out32(s, B);
>>           /* When branch is out of range, fall through to indirect. */
>>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
>> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>> -        tcg_out32(s, BCCTR | BO_ALWAYS);
>> -        set_jmp_reset_offset(s, which);
>>       }
>> +
>> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
>> +    tcg_out32(s, BCCTR | BO_ALWAYS);
>> +    set_jmp_reset_offset(s, which);
>>   }
>>   
>>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>>   {
>>       uintptr_t addr = tb->jmp_target_addr[n];
>>       intptr_t diff = addr - jmp_rx;
>> -    tcg_insn_unit insn;
>>   
>>       if (USE_REG_TB) {
>>           return;
>>       }
>>   
>> -    if (in_range_b(diff)) {
>> -        insn = B | (diff & 0x3fffffc);
>> -    } else {
>> -        insn = NOP;
>> -    }
>> +    if (have_isa_3_10) {
>> +        tcg_insn_unit insn1, insn2;
>> +        uint64_t pair;
>>   
>> -    qatomic_set((uint32_t *)jmp_rw, insn);
>> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
>> +        if (in_range_b(diff)) {
>> +            insn1 = B | (diff & 0x3fffffc);
>> +            insn2 = NOP;
>> +        } else if (diff == sextract64(diff, 0, 34)) {
>> +            /* PLA tmp1, diff */
>> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
>> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
>> +        } else {
>> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
>> +            diff = addr - jmp_rx;
>> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
>> +            /* PLD tmp1, diff */
>> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
>> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
>> +        }
> 
> B is a "patch class" word instruction as per CMODX in the ISA, which may
> be patched to/from other instructions without a flush+isync sequence
> between. So that part is okay, at least if you were just patching the B
> word. But patching between the PLA and PLD I don't think is kosher per
> ISA.
> 
> I struggle a bit with this part of the ISA, particularly with prefix
> instructions (it only talks about patching 4 bytes at a time).
> 
> If we patch something it has to go through a patch instruction, which
> is a direct branch, trap, or nop. I think that makes this non-trivial.
> 
> It could work if you only patched between B and PLD. B->PLD would have
> to patch the suffix word first, possibly with an interleaving sync, and
> then the prefix. PLD->B could just patch the B word.
> 
> How much would losing the PLA hurt?

Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
they are not?).

But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
leave the PLD alone on the fall-through path, and drop the PLA.


r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions
  2023-08-06 11:55 ` [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Nicholas Piggin
@ 2023-08-06 14:22   ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-06 14:22 UTC (permalink / raw)
  To: Nicholas Piggin, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On 8/6/23 04:55, Nicholas Piggin wrote:
> On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
>> Emit one 64-bit instruction for large constants and pc-relatives.
>> With pc-relative addressing, we don't need REG_TB, which means we
>> can re-enable direct branching for goto_tb.
> 
> Very cool. I tested this on a POWER10 and it has been stable so
> far. Using mttcg with SMP 8 machine, with this patch, a parallel
> gcc compile job in a Linux distro is 11.5% faster. Is such a big
> improvment expected?

Possibly, given that we had to disable direct branch patching for USE_REG_TB.


r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-06 14:13     ` Richard Henderson
@ 2023-08-07  1:51       ` Nicholas Piggin
  2023-08-07  7:29       ` Nicholas Piggin
  1 sibling, 0 replies; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-07  1:51 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> On 8/6/23 05:55, Nicholas Piggin wrote:
> > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> >> When a direct branch is out of range, we can load the destination for
> >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> >> the TranslationBlock for everything larger.
> >>
> >> This means the patch affects exactly one instruction: B (plus filler),
> >> PLA or PLD.  Which means we can update and execute the patch atomically.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> >>   1 file changed, 58 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> >> index 5b243b2353..47c71bb5f2 100644
> >> --- a/tcg/ppc/tcg-target.c.inc
> >> +++ b/tcg/ppc/tcg-target.c.inc
> >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> >>   
> >>       if (USE_REG_TB) {
> >> +        /*
> >> +         * With REG_TB, we must always use indirect branching,
> >> +         * so that the branch destination and TCG_REG_TB match.
> >> +         */
> >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> >> -
> >> -        /* TODO: Use direct branches when possible. */
> >> -        set_jmp_insn_offset(s, which);
> >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> >> -
> >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> >>   
> >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> >>           set_jmp_reset_offset(s, which);
> >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> >>                            -tcg_current_code_size(s));
> >> +        return;
> >> +    }
> >> +
> >> +    if (have_isa_3_10) {
> >> +        /* Align, so that we can patch 8 bytes atomically. */
> >> +        if ((uintptr_t)s->code_ptr & 7) {
> >> +            tcg_out32(s, NOP);
> >> +        }
> >> +        set_jmp_insn_offset(s, which);
> >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> >>       } else {
> >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> -        set_jmp_insn_offset(s, which);
> >> -        tcg_out32(s, NOP);
> >> -
> >> +        tcg_out32(s, B);
> >>           /* When branch is out of range, fall through to indirect. */
> >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> >> -        set_jmp_reset_offset(s, which);
> >>       }
> >> +
> >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> >> +    set_jmp_reset_offset(s, which);
> >>   }
> >>   
> >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >>   {
> >>       uintptr_t addr = tb->jmp_target_addr[n];
> >>       intptr_t diff = addr - jmp_rx;
> >> -    tcg_insn_unit insn;
> >>   
> >>       if (USE_REG_TB) {
> >>           return;
> >>       }
> >>   
> >> -    if (in_range_b(diff)) {
> >> -        insn = B | (diff & 0x3fffffc);
> >> -    } else {
> >> -        insn = NOP;
> >> -    }
> >> +    if (have_isa_3_10) {
> >> +        tcg_insn_unit insn1, insn2;
> >> +        uint64_t pair;
> >>   
> >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> >> +        if (in_range_b(diff)) {
> >> +            insn1 = B | (diff & 0x3fffffc);
> >> +            insn2 = NOP;
> >> +        } else if (diff == sextract64(diff, 0, 34)) {
> >> +            /* PLA tmp1, diff */
> >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> >> +        } else {
> >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> >> +            diff = addr - jmp_rx;
> >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> >> +            /* PLD tmp1, diff */
> >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> >> +        }
> > 
> > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > be patched to/from other instructions without a flush+isync sequence
> > between. So that part is okay, at least if you were just patching the B
> > word. But patching between the PLA and PLD I don't think is kosher per
> > ISA.
> > 
> > I struggle a bit with this part of the ISA, particularly with prefix
> > instructions (it only talks about patching 4 bytes at a time).
> > 
> > If we patch something it has to go through a patch instruction, which
> > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > 
> > It could work if you only patched between B and PLD. B->PLD would have
> > to patch the suffix word first, possibly with an interleaving sync, and
> > then the prefix. PLD->B could just patch the B word.
> > 
> > How much would losing the PLA hurt?
>
> Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> they are not?).

Good question, that might just be a case of not wanting to define and
verify it. Right now I think the way to CMODX a suffix is to patch the
prefix such that the suffix word can no longer be concurrently executed,
(e.g., with a branch or trap), then patch the suffix word, then patch a
the desired prefix word back in. I'd be almost certain it would work
correctly to patch an aligned dword insn at once, but better to go by
the book.

But there is an additional issue which is not just about torn write,
implementations that fetch an instruction more than once. The same
restriction exists for 4 byte instructions, you aren't meant to patch
an ld to an add, for example.

I don't know how much of that is historical, or whether it's just to
reduce verification space. ISTR there are some real concerns around
pipeline recovery corner cases.

> But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> leave the PLD alone on the fall-through path, and drop the PLA.

For now if you do that, maybe just leaving a comment that we can't
patch PLA due to CMODX restrictions, I would be less nervous about it.

I might ask about this internally, I did a few months ago have some
similar questions about problems with patching prefixes but didn't
have a good example.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long
  2023-08-04 21:33 ` [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
@ 2023-08-07  3:51   ` Jordan Niethe
  0 siblings, 0 replies; 21+ messages in thread
From: Jordan Niethe @ 2023-08-07  3:51 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, qemu-ppc, bgray

On Sat, Aug 5, 2023 at 7:33 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> When the offset is out of range of the non-prefixed insn, but
> fits the 34-bit immediate of the prefixed insn, use that.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 66 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 66 insertions(+)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 7fa2a2500b..d41c499b7d 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -323,6 +323,15 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define STDX   XO31(149)
>  #define STQ    XO62(  2)
>
> +#define PLWA   OPCD( 41)
> +#define PLD    OPCD( 57)
> +#define PLXSD  OPCD( 42)
> +#define PLXV   OPCD(25 * 2 + 1)  /* force tx=1 */
> +
> +#define PSTD   OPCD( 61)
> +#define PSTXSD OPCD( 46)
> +#define PSTXV  OPCD(27 * 2 + 1)  /* force tx=1 */

PSTXV calls it sx not tx


> +
>  #define ADDIC  OPCD( 12)
>  #define ADDI   OPCD( 14)
>  #define ADDIS  OPCD( 15)
> @@ -720,6 +729,20 @@ static void tcg_out_prefix_align(TCGContext *s)
>      }
>  }
>
> +/* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */
> +static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
> +                          unsigned ra, tcg_target_long imm, bool r)
> +{
> +    tcg_insn_unit p, i;
> +
> +    p = OPCD(1) | (r << 20) | ((imm >> 16) & 0x3ffff);
> +    i = opc | TAI(rt, ra, imm);
> +
> +    tcg_out_prefix_align(s);
> +    tcg_out32(s, p);
> +    tcg_out32(s, i);
> +}
> +
>  /* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
>  static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
>                            unsigned ra, tcg_target_long imm, bool r)
> @@ -1364,6 +1387,49 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
>          break;
>      }
>
> +    /* For unaligned or large offsets, use the prefixed form. */
> +    if (have_isa_3_10
> +        && (offset != (int16_t)offset || (offset & align))
> +        && offset == sextract64(offset, 0, 34)) {
> +        /*
> +         * Note that the MLS:D insns retain their un-prefixed opcode,
> +         * while the 8LS:D insns use a different opcode space.
> +         */
> +        switch (opi) {
> +        case LBZ:
> +        case LHZ:
> +        case LHA:
> +        case LWZ:
> +        case STB:
> +        case STH:
> +        case STW:
> +        case ADDI:
> +            tcg_out_mls_d(s, opi, rt, base, offset, 0);
> +            return;
> +        case LWA:
> +            tcg_out_8ls_d(s, PLWA, rt, base, offset, 0);
> +            return;
> +        case LD:
> +            tcg_out_8ls_d(s, PLD, rt, base, offset, 0);
> +            return;
> +        case STD:
> +            tcg_out_8ls_d(s, PSTD, rt, base, offset, 0);
> +            return;
> +        case LXSD:
> +            tcg_out_8ls_d(s, PLXSD, rt & 31, base, offset, 0);
> +            return;
> +        case STXSD:
> +            tcg_out_8ls_d(s, PSTXSD, rt & 31, base, offset, 0);
> +            return;
> +        case LXV:
> +            tcg_out_8ls_d(s, PLXV, rt & 31, base, offset, 0);
> +            return;
> +        case STXV:
> +            tcg_out_8ls_d(s, PSTXV, rt & 31, base, offset, 0);
> +            return;
> +        }
> +    }
> +
>      /* For unaligned, or very large offsets, use the indexed form.  */
>      if (offset & align || offset != (int32_t)offset || opi == 0) {
>          if (rs == base) {
> --
> 2.34.1
>

Reviewed-by: Jordan Niethe <jniethe5@gmail.com>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi
  2023-08-04 21:33 ` [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
@ 2023-08-07  3:53   ` Jordan Niethe
  2023-08-07 21:26     ` Richard Henderson
  0 siblings, 1 reply; 21+ messages in thread
From: Jordan Niethe @ 2023-08-07  3:53 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, qemu-ppc, bgray

On Sat, Aug 5, 2023 at 7:33 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> PADDI can load 34-bit immediates and 34-bit pc-relative addresses.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 47 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 642d0fd128..7fa2a2500b 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -707,6 +707,33 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
>      return true;
>  }
>
> +/* Ensure that the prefixed instruction does not cross a 64-byte boundary. */
> +static bool tcg_out_need_prefix_align(TCGContext *s)
> +{
> +    return ((uintptr_t)s->code_ptr & 0x3f) == 0x3c;
> +}
> +
> +static void tcg_out_prefix_align(TCGContext *s)
> +{
> +    if (tcg_out_need_prefix_align(s)) {
> +        tcg_out32(s, NOP);
> +    }
> +}
> +
> +/* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */
> +static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
> +                          unsigned ra, tcg_target_long imm, bool r)
> +{
> +    tcg_insn_unit p, i;
> +
> +    p = OPCD(1) | (2 << 24) | (r << 20) | ((imm >> 16) & 0x3ffff);
> +    i = opc | TAI(rt, ra, imm);
> +
> +    tcg_out_prefix_align(s);
> +    tcg_out32(s, p);
> +    tcg_out32(s, i);
> +}
> +
>  static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
>                               TCGReg base, tcg_target_long offset);
>
> @@ -992,6 +1019,26 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>          return;
>      }
>
> +    /*
> +     * Load values up to 34 bits, and pc-relative addresses,
> +     * with one prefixed insn.
> +     */
> +    if (have_isa_3_10) {
> +        if (arg == sextract64(arg, 0, 34)) {
> +            /* pli ret,value = paddi ret,0,value,0 */
> +            tcg_out_mls_d(s, ADDI, ret, 0, arg, 0);
> +            return;
> +        }
> +
> +        tmp = tcg_out_need_prefix_align(s) * 4;

tcg_out_need_prefix_align() returns a bool, optionally might prefer

tmp = tcg_out_need_prefix_align(s) ? 4 : 0;


> +        tmp = tcg_pcrel_diff(s, (void *)arg) - tmp;
> +        if (tmp == sextract64(tmp, 0, 34)) {
> +            /* pla ret,value = paddi ret,0,value,1 */
> +            tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1);
> +            return;
> +        }
> +    }
> +
>      /* Load 32-bit immediates with two insns.  Note that we've already
>         eliminated bare ADDIS, so we know both insns are required.  */
>      if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) {
> --
> 2.34.1
>

Reviewed-by: Jordan Niethe <jniethe5@gmail.com>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-04 21:33 ` [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb Richard Henderson
  2023-08-06 12:55   ` Nicholas Piggin
@ 2023-08-07  4:08   ` Jordan Niethe
  1 sibling, 0 replies; 21+ messages in thread
From: Jordan Niethe @ 2023-08-07  4:08 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, qemu-ppc, bgray

On Sat, Aug 5, 2023 at 7:34 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> When a direct branch is out of range, we can load the destination for
> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> the TranslationBlock for everything larger.
>
> This means the patch affects exactly one instruction: B (plus filler),
> PLA or PLD.  Which means we can update and execute the patch atomically.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
>  1 file changed, 58 insertions(+), 18 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> index 5b243b2353..47c71bb5f2 100644
> --- a/tcg/ppc/tcg-target.c.inc
> +++ b/tcg/ppc/tcg-target.c.inc
> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
>      uintptr_t ptr = get_jmp_target_addr(s, which);
>
>      if (USE_REG_TB) {
> +        /*
> +         * With REG_TB, we must always use indirect branching,
> +         * so that the branch destination and TCG_REG_TB match.
> +         */
>          ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
>          tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> -
> -        /* TODO: Use direct branches when possible. */
> -        set_jmp_insn_offset(s, which);
>          tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> -
>          tcg_out32(s, BCCTR | BO_ALWAYS);
>
>          /* For the unlinked case, need to reset TCG_REG_TB.  */
>          set_jmp_reset_offset(s, which);
>          tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
>                           -tcg_current_code_size(s));
> +        return;
> +    }
> +
> +    if (have_isa_3_10) {
> +        /* Align, so that we can patch 8 bytes atomically. */
> +        if ((uintptr_t)s->code_ptr & 7) {
> +            tcg_out32(s, NOP);
> +        }
> +        set_jmp_insn_offset(s, which);
> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
>      } else {
>          /* Direct branch will be patched by tb_target_set_jmp_target. */
> -        set_jmp_insn_offset(s, which);

It looks like 32bit loses its set_jmp_insn_offset(), is that intended?

> -        tcg_out32(s, NOP);
> -
> +        tcg_out32(s, B);
>          /* When branch is out of range, fall through to indirect. */
>          tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
>          tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> -        set_jmp_reset_offset(s, which);
>      }
> +
> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> +    set_jmp_reset_offset(s, which);
>  }
>
>  void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
>  {
>      uintptr_t addr = tb->jmp_target_addr[n];
>      intptr_t diff = addr - jmp_rx;
> -    tcg_insn_unit insn;
>
>      if (USE_REG_TB) {
>          return;
>      }
>
> -    if (in_range_b(diff)) {
> -        insn = B | (diff & 0x3fffffc);
> -    } else {
> -        insn = NOP;
> -    }
> +    if (have_isa_3_10) {
> +        tcg_insn_unit insn1, insn2;
> +        uint64_t pair;
>
> -    qatomic_set((uint32_t *)jmp_rw, insn);
> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> +        if (in_range_b(diff)) {
> +            insn1 = B | (diff & 0x3fffffc);
> +            insn2 = NOP;
> +        } else if (diff == sextract64(diff, 0, 34)) {
> +            /* PLA tmp1, diff */
> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> +        } else {
> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> +            diff = addr - jmp_rx;
> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> +            /* PLD tmp1, diff */
> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> +        }
> +
> +        if (HOST_BIG_ENDIAN) {
> +            pair = ((uint64_t)insn1) << 32 | insn2;
> +        } else {
> +            pair = ((uint64_t)insn2) << 32 | insn1;
> +        }
> +
> +        qatomic_set((uint64_t *)jmp_rw, pair);
> +        flush_idcache_range(jmp_rx, jmp_rw, 8);
> +    } else {
> +        tcg_insn_unit insn;
> +
> +        if (in_range_b(diff)) {
> +            insn = B | (diff & 0x3fffffc);
> +        } else {
> +            insn = NOP;
> +        }
> +        qatomic_set((uint32_t *)jmp_rw, insn);
> +        flush_idcache_range(jmp_rx, jmp_rw, 4);
> +    }
>  }
>
>  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
> --
> 2.34.1
>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-06 14:13     ` Richard Henderson
  2023-08-07  1:51       ` Nicholas Piggin
@ 2023-08-07  7:29       ` Nicholas Piggin
  2023-08-07  9:38         ` Nicholas Piggin
  1 sibling, 1 reply; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-07  7:29 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> On 8/6/23 05:55, Nicholas Piggin wrote:
> > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> >> When a direct branch is out of range, we can load the destination for
> >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> >> the TranslationBlock for everything larger.
> >>
> >> This means the patch affects exactly one instruction: B (plus filler),
> >> PLA or PLD.  Which means we can update and execute the patch atomically.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> >>   1 file changed, 58 insertions(+), 18 deletions(-)
> >>
> >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> >> index 5b243b2353..47c71bb5f2 100644
> >> --- a/tcg/ppc/tcg-target.c.inc
> >> +++ b/tcg/ppc/tcg-target.c.inc
> >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> >>   
> >>       if (USE_REG_TB) {
> >> +        /*
> >> +         * With REG_TB, we must always use indirect branching,
> >> +         * so that the branch destination and TCG_REG_TB match.
> >> +         */
> >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> >> -
> >> -        /* TODO: Use direct branches when possible. */
> >> -        set_jmp_insn_offset(s, which);
> >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> >> -
> >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> >>   
> >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> >>           set_jmp_reset_offset(s, which);
> >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> >>                            -tcg_current_code_size(s));
> >> +        return;
> >> +    }
> >> +
> >> +    if (have_isa_3_10) {
> >> +        /* Align, so that we can patch 8 bytes atomically. */
> >> +        if ((uintptr_t)s->code_ptr & 7) {
> >> +            tcg_out32(s, NOP);
> >> +        }
> >> +        set_jmp_insn_offset(s, which);
> >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> >>       } else {
> >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> >> -        set_jmp_insn_offset(s, which);
> >> -        tcg_out32(s, NOP);
> >> -
> >> +        tcg_out32(s, B);
> >>           /* When branch is out of range, fall through to indirect. */
> >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> >> -        set_jmp_reset_offset(s, which);
> >>       }
> >> +
> >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> >> +    set_jmp_reset_offset(s, which);
> >>   }
> >>   
> >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> >>   {
> >>       uintptr_t addr = tb->jmp_target_addr[n];
> >>       intptr_t diff = addr - jmp_rx;
> >> -    tcg_insn_unit insn;
> >>   
> >>       if (USE_REG_TB) {
> >>           return;
> >>       }
> >>   
> >> -    if (in_range_b(diff)) {
> >> -        insn = B | (diff & 0x3fffffc);
> >> -    } else {
> >> -        insn = NOP;
> >> -    }
> >> +    if (have_isa_3_10) {
> >> +        tcg_insn_unit insn1, insn2;
> >> +        uint64_t pair;
> >>   
> >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> >> +        if (in_range_b(diff)) {
> >> +            insn1 = B | (diff & 0x3fffffc);
> >> +            insn2 = NOP;
> >> +        } else if (diff == sextract64(diff, 0, 34)) {
> >> +            /* PLA tmp1, diff */
> >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> >> +        } else {
> >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> >> +            diff = addr - jmp_rx;
> >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> >> +            /* PLD tmp1, diff */
> >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> >> +        }
> > 
> > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > be patched to/from other instructions without a flush+isync sequence
> > between. So that part is okay, at least if you were just patching the B
> > word. But patching between the PLA and PLD I don't think is kosher per
> > ISA.
> > 
> > I struggle a bit with this part of the ISA, particularly with prefix
> > instructions (it only talks about patching 4 bytes at a time).
> > 
> > If we patch something it has to go through a patch instruction, which
> > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > 
> > It could work if you only patched between B and PLD. B->PLD would have
> > to patch the suffix word first, possibly with an interleaving sync, and
> > then the prefix. PLD->B could just patch the B word.
> > 
> > How much would losing the PLA hurt?
>
> Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> they are not?).
>
> But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> leave the PLD alone on the fall-through path, and drop the PLA.

Hmm, even patching two different offset B instructions in some sequence
with a PLD would go against the ISA, because PLD is not a patch class
instruction. The only case you can patch those has to have a sequence of
exactly two instruction values. So I think you need the B first, and you
can patch that between any offset of B or a NOP to fall through to PLD.
NOP and B are both patch class, so any sequence of them is allowed.

Thanks,
Nick


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb
  2023-08-07  7:29       ` Nicholas Piggin
@ 2023-08-07  9:38         ` Nicholas Piggin
  0 siblings, 0 replies; 21+ messages in thread
From: Nicholas Piggin @ 2023-08-07  9:38 UTC (permalink / raw)
  To: Nicholas Piggin, Richard Henderson, qemu-devel; +Cc: jniethe5, qemu-ppc, bgray

On Mon Aug 7, 2023 at 5:29 PM AEST, Nicholas Piggin wrote:
> On Mon Aug 7, 2023 at 12:13 AM AEST, Richard Henderson wrote:
> > On 8/6/23 05:55, Nicholas Piggin wrote:
> > > On Sat Aug 5, 2023 at 7:33 AM AEST, Richard Henderson wrote:
> > >> When a direct branch is out of range, we can load the destination for
> > >> the indirect branch using PLA (for 16GB worth of buffer) and PLD from
> > >> the TranslationBlock for everything larger.
> > >>
> > >> This means the patch affects exactly one instruction: B (plus filler),
> > >> PLA or PLD.  Which means we can update and execute the patch atomically.
> > >>
> > >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> > >> ---
> > >>   tcg/ppc/tcg-target.c.inc | 76 ++++++++++++++++++++++++++++++----------
> > >>   1 file changed, 58 insertions(+), 18 deletions(-)
> > >>
> > >> diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
> > >> index 5b243b2353..47c71bb5f2 100644
> > >> --- a/tcg/ppc/tcg-target.c.inc
> > >> +++ b/tcg/ppc/tcg-target.c.inc
> > >> @@ -2642,31 +2642,41 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
> > >>       uintptr_t ptr = get_jmp_target_addr(s, which);
> > >>   
> > >>       if (USE_REG_TB) {
> > >> +        /*
> > >> +         * With REG_TB, we must always use indirect branching,
> > >> +         * so that the branch destination and TCG_REG_TB match.
> > >> +         */
> > >>           ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
> > >>           tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
> > >> -
> > >> -        /* TODO: Use direct branches when possible. */
> > >> -        set_jmp_insn_offset(s, which);
> > >>           tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
> > >> -
> > >>           tcg_out32(s, BCCTR | BO_ALWAYS);
> > >>   
> > >>           /* For the unlinked case, need to reset TCG_REG_TB.  */
> > >>           set_jmp_reset_offset(s, which);
> > >>           tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
> > >>                            -tcg_current_code_size(s));
> > >> +        return;
> > >> +    }
> > >> +
> > >> +    if (have_isa_3_10) {
> > >> +        /* Align, so that we can patch 8 bytes atomically. */
> > >> +        if ((uintptr_t)s->code_ptr & 7) {
> > >> +            tcg_out32(s, NOP);
> > >> +        }
> > >> +        set_jmp_insn_offset(s, which);
> > >> +        /* Direct branch will be patched by tb_target_set_jmp_target. */
> > >> +        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
> > >>       } else {
> > >>           /* Direct branch will be patched by tb_target_set_jmp_target. */
> > >> -        set_jmp_insn_offset(s, which);
> > >> -        tcg_out32(s, NOP);
> > >> -
> > >> +        tcg_out32(s, B);
> > >>           /* When branch is out of range, fall through to indirect. */
> > >>           tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
> > >>           tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
> > >> -        tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> > >> -        tcg_out32(s, BCCTR | BO_ALWAYS);
> > >> -        set_jmp_reset_offset(s, which);
> > >>       }
> > >> +
> > >> +    tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR);
> > >> +    tcg_out32(s, BCCTR | BO_ALWAYS);
> > >> +    set_jmp_reset_offset(s, which);
> > >>   }
> > >>   
> > >>   void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> > >> @@ -2674,20 +2684,50 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
> > >>   {
> > >>       uintptr_t addr = tb->jmp_target_addr[n];
> > >>       intptr_t diff = addr - jmp_rx;
> > >> -    tcg_insn_unit insn;
> > >>   
> > >>       if (USE_REG_TB) {
> > >>           return;
> > >>       }
> > >>   
> > >> -    if (in_range_b(diff)) {
> > >> -        insn = B | (diff & 0x3fffffc);
> > >> -    } else {
> > >> -        insn = NOP;
> > >> -    }
> > >> +    if (have_isa_3_10) {
> > >> +        tcg_insn_unit insn1, insn2;
> > >> +        uint64_t pair;
> > >>   
> > >> -    qatomic_set((uint32_t *)jmp_rw, insn);
> > >> -    flush_idcache_range(jmp_rx, jmp_rw, 4);
> > >> +        if (in_range_b(diff)) {
> > >> +            insn1 = B | (diff & 0x3fffffc);
> > >> +            insn2 = NOP;
> > >> +        } else if (diff == sextract64(diff, 0, 34)) {
> > >> +            /* PLA tmp1, diff */
> > >> +            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> > >> +            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
> > >> +        } else {
> > >> +            addr = (uintptr_t)&tb->jmp_target_addr[n];
> > >> +            diff = addr - jmp_rx;
> > >> +            tcg_debug_assert(diff == sextract64(diff, 0, 34));
> > >> +            /* PLD tmp1, diff */
> > >> +            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
> > >> +            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
> > >> +        }
> > > 
> > > B is a "patch class" word instruction as per CMODX in the ISA, which may
> > > be patched to/from other instructions without a flush+isync sequence
> > > between. So that part is okay, at least if you were just patching the B
> > > word. But patching between the PLA and PLD I don't think is kosher per
> > > ISA.
> > > 
> > > I struggle a bit with this part of the ISA, particularly with prefix
> > > instructions (it only talks about patching 4 bytes at a time).
> > > 
> > > If we patch something it has to go through a patch instruction, which
> > > is a direct branch, trap, or nop. I think that makes this non-trivial.
> > > 
> > > It could work if you only patched between B and PLD. B->PLD would have
> > > to patch the suffix word first, possibly with an interleaving sync, and
> > > then the prefix. PLD->B could just patch the B word.
> > > 
> > > How much would losing the PLA hurt?
> >
> > Really?  I can't imagine how some icache would see a torn prefixed insn given an atomic 
> > store (CMODX talks about prefixed instructions which "may be unaligned" -- but what if 
> > they are not?).
> >
> > But if patching an aligned prefixed insn isn't allowed, I would patch between B and NOP, 
> > leave the PLD alone on the fall-through path, and drop the PLA.
>
> Hmm, even patching two different offset B instructions in some sequence
> with a PLD would go against the ISA, because PLD is not a patch class
> instruction. The only case you can patch those has to have a sequence of
> exactly two instruction values. So I think you need the B first, and you
> can patch that between any offset of B or a NOP to fall through to PLD.
> NOP and B are both patch class, so any sequence of them is allowed.

Here is an incremental diff that hopefully solves those issues. Sadly,
it removes the nice PLA patching :( But it seems to be around or close
to the original performance in my testing.

Feel free to squash it in or take inspiration (or dispute it).

Thanks,
Nick

---
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 47c71bb5f2..569c2e3647 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -748,6 +748,11 @@ static void tcg_out_prefix_align(TCGContext *s)
     }
 }
 
+static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target)
+{
+    return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0);
+}
+
 /* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */
 static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt,
                           unsigned ra, tcg_target_long imm, bool r)
@@ -1072,8 +1077,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
             return;
         }
 
-        tmp = tcg_out_need_prefix_align(s) * 4;
-        tmp = tcg_pcrel_diff(s, (void *)arg) - tmp;
+        tmp = tcg_pcrel_diff_for_prefix(s, (void *)arg);
         if (tmp == sextract64(tmp, 0, 34)) {
             /* pla ret,value = paddi ret,0,value,1 */
             tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1);
@@ -2658,18 +2662,19 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
         return;
     }
 
+    /* Direct branch will be patched by tb_target_set_jmp_target. */
+    set_jmp_insn_offset(s, which);
+    tcg_out32(s, NOP);
+
+    /* When branch is out of range, fall through to indirect. */
     if (have_isa_3_10) {
-        /* Align, so that we can patch 8 bytes atomically. */
-        if ((uintptr_t)s->code_ptr & 7) {
-            tcg_out32(s, NOP);
-        }
-        set_jmp_insn_offset(s, which);
-        /* Direct branch will be patched by tb_target_set_jmp_target. */
-        tcg_out_mls_d(s, ADDI, TCG_REG_TMP1, 0, 0, 1);
+        ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr);
+        /*
+         * Would be nice to use PLA if offset is in range,
+         * but CMODX rules make that difficult.
+         */
+        tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1);
     } else {
-        /* Direct branch will be patched by tb_target_set_jmp_target. */
-        tcg_out32(s, B);
-        /* When branch is out of range, fall through to indirect. */
         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
     }
@@ -2684,50 +2689,19 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
 {
     uintptr_t addr = tb->jmp_target_addr[n];
     intptr_t diff = addr - jmp_rx;
+    tcg_insn_unit insn;
 
     if (USE_REG_TB) {
         return;
     }
 
-    if (have_isa_3_10) {
-        tcg_insn_unit insn1, insn2;
-        uint64_t pair;
-
-        if (in_range_b(diff)) {
-            insn1 = B | (diff & 0x3fffffc);
-            insn2 = NOP;
-        } else if (diff == sextract64(diff, 0, 34)) {
-            /* PLA tmp1, diff */
-            insn1 = OPCD(1) | (2 << 24) | (1 << 20) | ((diff >> 16) & 0x3ffff);
-            insn2 = ADDI | TAI(TCG_REG_TMP1, 0, diff);
-        } else {
-            addr = (uintptr_t)&tb->jmp_target_addr[n];
-            diff = addr - jmp_rx;
-            tcg_debug_assert(diff == sextract64(diff, 0, 34));
-            /* PLD tmp1, diff */
-            insn1 = OPCD(1) | (1 << 20) | ((diff >> 16) & 0x3ffff);
-            insn2 = PLD | TAI(TCG_REG_TMP1, 0, diff);
-        }
-
-        if (HOST_BIG_ENDIAN) {
-            pair = ((uint64_t)insn1) << 32 | insn2;
-        } else {
-            pair = ((uint64_t)insn2) << 32 | insn1;
-        }
-
-        qatomic_set((uint64_t *)jmp_rw, pair);
-        flush_idcache_range(jmp_rx, jmp_rw, 8);
+    if (in_range_b(diff)) {
+        insn = B | (diff & 0x3fffffc);
     } else {
-        tcg_insn_unit insn;
-
-        if (in_range_b(diff)) {
-            insn = B | (diff & 0x3fffffc);
-        } else {
-            insn = NOP;
-        }
-        qatomic_set((uint32_t *)jmp_rw, insn);
-        flush_idcache_range(jmp_rx, jmp_rw, 4);
+        insn = NOP;
     }
+    qatomic_set((uint32_t *)jmp_rw, insn);
+    flush_idcache_range(jmp_rx, jmp_rw, 4);
 }
 
 static void tcg_out_op(TCGContext *s, TCGOpcode opc,


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi
  2023-08-07  3:53   ` Jordan Niethe
@ 2023-08-07 21:26     ` Richard Henderson
  0 siblings, 0 replies; 21+ messages in thread
From: Richard Henderson @ 2023-08-07 21:26 UTC (permalink / raw)
  To: Jordan Niethe; +Cc: qemu-devel, qemu-ppc, bgray, Nicholas Piggin

On 8/6/23 20:53, Jordan Niethe wrote:
>> +        tmp = tcg_out_need_prefix_align(s) * 4;
> 
> tcg_out_need_prefix_align() returns a bool, optionally might prefer
> 
> tmp = tcg_out_need_prefix_align(s) ? 4 : 0;

I suppose.  C type promotion rules make the multiplication just the same though.

That said, I've merged back Nick's tcg_pcrel_diff_for_prefix function using ?:.


r~


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2023-08-07 21:27 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-04 21:33 [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Richard Henderson
2023-08-04 21:33 ` [PATCH 1/7] tcg/ppc: Untabify tcg-target.c.inc Richard Henderson
2023-08-04 21:33 ` [PATCH 2/7] tcg/ppc: Use PADDI in tcg_out_movi Richard Henderson
2023-08-07  3:53   ` Jordan Niethe
2023-08-07 21:26     ` Richard Henderson
2023-08-04 21:33 ` [PATCH 3/7] tcg/ppc: Use prefixed instructions in tcg_out_mem_long Richard Henderson
2023-08-07  3:51   ` Jordan Niethe
2023-08-04 21:33 ` [PATCH 4/7] tcg/ppc: Use PLD in tcg_out_movi for constant pool Richard Henderson
2023-08-04 21:33 ` [PATCH 5/7] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec Richard Henderson
2023-08-04 21:33 ` [PATCH 6/7] tcg/ppc: Disable USE_REG_TB for Power v3.1 Richard Henderson
2023-08-06 11:58   ` Nicholas Piggin
2023-08-06 13:45     ` Richard Henderson
2023-08-04 21:33 ` [PATCH 7/7] tcg/ppc: Use prefixed instructions for tcg_out_goto_tb Richard Henderson
2023-08-06 12:55   ` Nicholas Piggin
2023-08-06 14:13     ` Richard Henderson
2023-08-07  1:51       ` Nicholas Piggin
2023-08-07  7:29       ` Nicholas Piggin
2023-08-07  9:38         ` Nicholas Piggin
2023-08-07  4:08   ` Jordan Niethe
2023-08-06 11:55 ` [PATCH for-8.2 0/7] tcg/ppc: Support power10 prefixed instructions Nicholas Piggin
2023-08-06 14:22   ` Richard Henderson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.