qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
@ 2019-05-19  4:15 Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 1/7] tcg/ppc: Initial backend support for Altivec Richard Henderson
                   ` (7 more replies)
  0 siblings, 8 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

Based-on: <20190518190157.21255-1-richard.henderson@linaro.org>
Aka "tcg: misc gvec improvements".

Version 3 was last posted in March,
https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg05859.html

Changes since v3:
  * Add support for bitsel, with the vsx xxsel insn.
  * Rely on the new relocation overflow handling, so
    we don't require 3 insns for a vector load.

Changes since v2:
  * Several generic tcg patches to improve dup vs dupi vs dupm.
    In particular, if a global temp (like guest r10) is not in
    a host register, we should duplicate from memory instead of
    loading to an integer register, spilling to stack, loading
    to a vector register, and then duplicating.
  * I have more confidence that 32-bit ppc host should work
    this time around.  No testing on that front yet, but I've
    unified some code sequences with 64-bit ppc host.
  * Base altivec now supports V128 only.  Moved V64 support to
    Power7 (v2.06), which has 64-bit load/store.
  * Dropped support for 64-bit vector multiply using Power8.
    The expansion was too large compared to using integer regs.


r~


Richard Henderson (7):
  tcg/ppc: Initial backend support for Altivec
  tcg/ppc: Support vector shift by immediate
  tcg/ppc: Support vector multiply
  tcg/ppc: Support vector dup2
  tcg/ppc: Update vector support to v2.06
  tcg/ppc: Update vector support to v2.07
  tcg/ppc: Update vector support to v3.00

 tcg/ppc/tcg-target.h     |   39 +-
 tcg/ppc/tcg-target.opc.h |   11 +
 tcg/ppc/tcg-target.inc.c | 1077 +++++++++++++++++++++++++++++++++++---
 3 files changed, 1063 insertions(+), 64 deletions(-)
 create mode 100644 tcg/ppc/tcg-target.opc.h

-- 
2.17.1



^ permalink raw reply	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 1/7] tcg/ppc: Initial backend support for Altivec
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 2/7] tcg/ppc: Support vector shift by immediate Richard Henderson
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

A few operations are still missing, such as expansion of
multiply and shifts, but this patch provides move, load, store,
and basic arithmetic.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |  36 +-
 tcg/ppc/tcg-target.opc.h |   3 +
 tcg/ppc/tcg-target.inc.c | 707 +++++++++++++++++++++++++++++++++++----
 3 files changed, 685 insertions(+), 61 deletions(-)
 create mode 100644 tcg/ppc/tcg-target.opc.h

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 7627fb62d3..368c250c6a 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -31,7 +31,7 @@
 # define TCG_TARGET_REG_BITS  32
 #endif
 
-#define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_NB_REGS 64
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
 
@@ -45,10 +45,20 @@ typedef enum {
     TCG_REG_R24, TCG_REG_R25, TCG_REG_R26, TCG_REG_R27,
     TCG_REG_R28, TCG_REG_R29, TCG_REG_R30, TCG_REG_R31,
 
+    TCG_REG_V0,  TCG_REG_V1,  TCG_REG_V2,  TCG_REG_V3,
+    TCG_REG_V4,  TCG_REG_V5,  TCG_REG_V6,  TCG_REG_V7,
+    TCG_REG_V8,  TCG_REG_V9,  TCG_REG_V10, TCG_REG_V11,
+    TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+    TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+    TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
+
     TCG_REG_CALL_STACK = TCG_REG_R1,
     TCG_AREG0 = TCG_REG_R27
 } TCGReg;
 
+extern bool have_isa_altivec;
 extern bool have_isa_2_06;
 extern bool have_isa_3_00;
 
@@ -126,6 +136,30 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_mulsh_i64        1
 #endif
 
+/*
+ * While technically Altivec could support V64, it has no 64-bit store
+ * instruction and substituting two 32-bit stores makes the generated
+ * code quite large.
+ */
+#define TCG_TARGET_HAS_v64              0
+#define TCG_TARGET_HAS_v128             have_isa_altivec
+#define TCG_TARGET_HAS_v256             0
+
+#define TCG_TARGET_HAS_andc_vec         1
+#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_not_vec          1
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_abs_vec          0
+#define TCG_TARGET_HAS_shi_vec          0
+#define TCG_TARGET_HAS_shs_vec          0
+#define TCG_TARGET_HAS_shv_vec          0
+#define TCG_TARGET_HAS_cmp_vec          1
+#define TCG_TARGET_HAS_mul_vec          0
+#define TCG_TARGET_HAS_sat_vec          1
+#define TCG_TARGET_HAS_minmax_vec       1
+#define TCG_TARGET_HAS_bitsel_vec       0
+#define TCG_TARGET_HAS_cmpsel_vec       0
+
 void flush_icache_range(uintptr_t start, uintptr_t stop);
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t);
 
diff --git a/tcg/ppc/tcg-target.opc.h b/tcg/ppc/tcg-target.opc.h
new file mode 100644
index 0000000000..4816a6c3d4
--- /dev/null
+++ b/tcg/ppc/tcg-target.opc.h
@@ -0,0 +1,3 @@
+/* Target-specific opcodes for host vector expansion.  These will be
+   emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
+   consider these to be UNSPEC with names.  */
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 30c095d3d5..479e653da6 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -42,6 +42,9 @@
 # define TCG_REG_TMP1   TCG_REG_R12
 #endif
 
+#define TCG_VEC_TMP1    TCG_REG_V0
+#define TCG_VEC_TMP2    TCG_REG_V1
+
 #define TCG_REG_TB     TCG_REG_R31
 #define USE_REG_TB     (TCG_TARGET_REG_BITS == 64)
 
@@ -61,6 +64,7 @@
 
 static tcg_insn_unit *tb_ret_addr;
 
+bool have_isa_altivec;
 bool have_isa_2_06;
 bool have_isa_3_00;
 
@@ -72,39 +76,15 @@ bool have_isa_3_00;
 #endif
 
 #ifdef CONFIG_DEBUG_TCG
-static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-    "r0",
-    "r1",
-    "r2",
-    "r3",
-    "r4",
-    "r5",
-    "r6",
-    "r7",
-    "r8",
-    "r9",
-    "r10",
-    "r11",
-    "r12",
-    "r13",
-    "r14",
-    "r15",
-    "r16",
-    "r17",
-    "r18",
-    "r19",
-    "r20",
-    "r21",
-    "r22",
-    "r23",
-    "r24",
-    "r25",
-    "r26",
-    "r27",
-    "r28",
-    "r29",
-    "r30",
-    "r31"
+static const char tcg_target_reg_names[TCG_TARGET_NB_REGS][4] = {
+    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
+    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
+    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
+    "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",
+    "v8",  "v9",  "v10", "v11", "v12", "v13", "v14", "v15",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+    "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
 };
 #endif
 
@@ -139,6 +119,26 @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_R5,
     TCG_REG_R4,
     TCG_REG_R3,
+
+    /* V0 and V1 reserved as temporaries; V20 - V31 are call-saved */
+    TCG_REG_V2,   /* call clobbered, vectors */
+    TCG_REG_V3,
+    TCG_REG_V4,
+    TCG_REG_V5,
+    TCG_REG_V6,
+    TCG_REG_V7,
+    TCG_REG_V8,
+    TCG_REG_V9,
+    TCG_REG_V10,
+    TCG_REG_V11,
+    TCG_REG_V12,
+    TCG_REG_V13,
+    TCG_REG_V14,
+    TCG_REG_V15,
+    TCG_REG_V16,
+    TCG_REG_V17,
+    TCG_REG_V18,
+    TCG_REG_V19,
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -233,6 +233,10 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         ct->ct |= TCG_CT_REG;
         ct->u.regs = 0xffffffff;
         break;
+    case 'v':
+        ct->ct |= TCG_CT_REG;
+        ct->u.regs = 0xffffffff00000000ull;
+        break;
     case 'L':                   /* qemu_ld constraint */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = 0xffffffff;
@@ -320,6 +324,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define XO31(opc) (OPCD(31)|((opc)<<1))
 #define XO58(opc) (OPCD(58)|(opc))
 #define XO62(opc) (OPCD(62)|(opc))
+#define VX4(opc)  (OPCD(4)|(opc))
 
 #define B      OPCD( 18)
 #define BC     OPCD( 16)
@@ -461,6 +466,72 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 
 #define NOP    ORI  /* ori 0,0,0 */
 
+#define LVX        XO31(103)
+#define LVEBX      XO31(7)
+#define LVEHX      XO31(39)
+#define LVEWX      XO31(71)
+
+#define STVX       XO31(231)
+#define STVEWX     XO31(199)
+
+#define VADDSBS    VX4(768)
+#define VADDUBS    VX4(512)
+#define VADDUBM    VX4(0)
+#define VADDSHS    VX4(832)
+#define VADDUHS    VX4(576)
+#define VADDUHM    VX4(64)
+#define VADDSWS    VX4(896)
+#define VADDUWS    VX4(640)
+#define VADDUWM    VX4(128)
+
+#define VSUBSBS    VX4(1792)
+#define VSUBUBS    VX4(1536)
+#define VSUBUBM    VX4(1024)
+#define VSUBSHS    VX4(1856)
+#define VSUBUHS    VX4(1600)
+#define VSUBUHM    VX4(1088)
+#define VSUBSWS    VX4(1920)
+#define VSUBUWS    VX4(1664)
+#define VSUBUWM    VX4(1152)
+
+#define VMAXSB     VX4(258)
+#define VMAXSH     VX4(322)
+#define VMAXSW     VX4(386)
+#define VMAXUB     VX4(2)
+#define VMAXUH     VX4(66)
+#define VMAXUW     VX4(130)
+#define VMINSB     VX4(770)
+#define VMINSH     VX4(834)
+#define VMINSW     VX4(898)
+#define VMINUB     VX4(514)
+#define VMINUH     VX4(578)
+#define VMINUW     VX4(642)
+
+#define VCMPEQUB   VX4(6)
+#define VCMPEQUH   VX4(70)
+#define VCMPEQUW   VX4(134)
+#define VCMPGTSB   VX4(774)
+#define VCMPGTSH   VX4(838)
+#define VCMPGTSW   VX4(902)
+#define VCMPGTUB   VX4(518)
+#define VCMPGTUH   VX4(582)
+#define VCMPGTUW   VX4(646)
+
+#define VAND       VX4(1028)
+#define VANDC      VX4(1092)
+#define VNOR       VX4(1284)
+#define VOR        VX4(1156)
+#define VXOR       VX4(1220)
+
+#define VSPLTB     VX4(524)
+#define VSPLTH     VX4(588)
+#define VSPLTW     VX4(652)
+#define VSPLTISB   VX4(780)
+#define VSPLTISH   VX4(844)
+#define VSPLTISW   VX4(908)
+
+#define VSLDOI     VX4(44)
+
 #define RT(r) ((r)<<21)
 #define RS(r) ((r)<<21)
 #define RA(r) ((r)<<16)
@@ -473,6 +544,11 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define MB64(b) ((b)<<5)
 #define FXM(b) (1 << (19 - (b)))
 
+#define VRT(r)  (((r) & 31) << 21)
+#define VRA(r)  (((r) & 31) << 16)
+#define VRB(r)  (((r) & 31) << 11)
+#define VRC(r)  (((r) & 31) <<  6)
+
 #define LK    1
 
 #define TAB(t, a, b) (RT(t) | RA(a) | RB(b))
@@ -529,6 +605,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
     tcg_insn_unit *target;
+    int16_t lo;
+    int32_t hi;
 
     value += addend;
     target = (tcg_insn_unit *)value;
@@ -550,6 +628,20 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
         }
         *code_ptr = (*code_ptr & ~0xfffc) | (value & 0xfffc);
         break;
+    case R_PPC_ADDR32:
+        /*
+         * We are abusing this relocation type.  Again, this points to
+         * a pair of insns, lis + addi.  This is an absolute address
+         * relocation for PPC32 so the lis cannot be removed.
+         */
+        lo = value;
+        hi = value - lo;
+        if (hi + lo != value) {
+            return false;
+        }
+        code_ptr[0] = deposit32(code_ptr[0], 0, 16, hi >> 16);
+        code_ptr[1] = deposit32(code_ptr[1], 0, 16, lo);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -561,9 +653,29 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
 
 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
-    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
-    if (ret != arg) {
-        tcg_out32(s, OR | SAB(arg, ret, arg));
+    if (ret == arg) {
+        return true;
+    }
+    switch (type) {
+    case TCG_TYPE_I64:
+        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
+        /* fallthru */
+    case TCG_TYPE_I32:
+        if (ret < 32 && arg < 32) {
+            tcg_out32(s, OR | SAB(arg, ret, arg));
+            break;
+        } else if (ret < 32 || arg < 32) {
+            /* Altivec does not support vector/integer moves.  */
+            return false;
+        }
+        /* fallthru */
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 32 && arg >= 32);
+        tcg_out32(s, VOR | VRT(ret) | VRA(arg) | VRB(arg));
+        break;
+    default:
+        g_assert_not_reached();
     }
     return true;
 }
@@ -712,10 +824,76 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
     }
 }
 
-static inline void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
-                                tcg_target_long arg)
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
+                             tcg_target_long val)
 {
-    tcg_out_movi_int(s, type, ret, arg, false);
+    uint32_t load_insn;
+    int rel, low;
+    intptr_t add;
+
+    low = (int8_t)val;
+    if (low >= -16 && low < 16) {
+        if (val == (tcg_target_long)dup_const(MO_8, low)) {
+            tcg_out32(s, VSPLTISB | VRT(ret) | ((val & 31) << 16));
+            return;
+        }
+        if (val == (tcg_target_long)dup_const(MO_16, low)) {
+            tcg_out32(s, VSPLTISH | VRT(ret) | ((val & 31) << 16));
+            return;
+        }
+        if (val == (tcg_target_long)dup_const(MO_32, low)) {
+            tcg_out32(s, VSPLTISW | VRT(ret) | ((val & 31) << 16));
+            return;
+        }
+    }
+
+    /*
+     * Otherwise we must load the value from the constant pool.
+     */
+    if (USE_REG_TB) {
+        rel = R_PPC_ADDR16;
+        add = -(intptr_t)s->code_gen_ptr;
+    } else {
+        rel = R_PPC_ADDR32;
+        add = 0;
+    }
+
+    load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
+    if (TCG_TARGET_REG_BITS == 64) {
+        new_pool_l2(s, rel, s->code_ptr, add, val, val);
+    } else {
+        new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+    }
+
+    if (USE_REG_TB) {
+        tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, 0, 0));
+        load_insn |= RA(TCG_REG_TB);
+    } else {
+        tcg_out32(s, ADDIS | TAI(TCG_REG_TMP1, 0, 0));
+        tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, 0));
+    }
+    tcg_out32(s, load_insn);
+}
+
+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret,
+                         tcg_target_long arg)
+{
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        tcg_debug_assert(ret < 32);
+        tcg_out_movi_int(s, type, ret, arg, false);
+        break;
+
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 32);
+        tcg_out_dupi_vec(s, type, ret, arg);
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
 }
 
 static bool mask_operand(uint32_t c, int *mb, int *me)
@@ -868,7 +1046,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
     }
 
     /* For unaligned, or very large offsets, use the indexed form.  */
-    if (offset & align || offset != (int32_t)offset) {
+    if (offset & align || offset != (int32_t)offset || opi == 0) {
         if (rs == base) {
             rs = TCG_REG_R0;
         }
@@ -899,32 +1077,96 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
     }
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_vsldoi(TCGContext *s, TCGReg ret,
+                           TCGReg va, TCGReg vb, int shb)
 {
-    int opi, opx;
-
-    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
-    if (type == TCG_TYPE_I32) {
-        opi = LWZ, opx = LWZX;
-    } else {
-        opi = LD, opx = LDX;
-    }
-    tcg_out_mem_long(s, opi, opx, ret, arg1, arg2);
+    tcg_out32(s, VSLDOI | VRT(ret) | VRA(va) | VRB(vb) | (shb << 6));
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+                       TCGReg base, intptr_t offset)
 {
-    int opi, opx;
+    int shift;
 
-    tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
-    if (type == TCG_TYPE_I32) {
-        opi = STW, opx = STWX;
-    } else {
-        opi = STD, opx = STDX;
+    switch (type) {
+    case TCG_TYPE_I32:
+        if (ret < 32) {
+            tcg_out_mem_long(s, LWZ, LWZX, ret, base, offset);
+            break;
+        }
+        assert((offset & 3) == 0);
+        tcg_out_mem_long(s, 0, LVEWX, ret & 31, base, offset);
+        shift = (offset - 4) & 0xc;
+        if (shift) {
+            tcg_out_vsldoi(s, ret, ret, ret, shift);
+        }
+        break;
+    case TCG_TYPE_I64:
+        if (ret < 32) {
+            tcg_out_mem_long(s, LD, LDX, ret, base, offset);
+            break;
+        }
+        /* fallthru */
+    case TCG_TYPE_V64:
+        tcg_debug_assert(ret >= 32);
+        assert((offset & 7) == 0);
+        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset & -16);
+        if (offset & 8) {
+            tcg_out_vsldoi(s, ret, ret, ret, 8);
+        }
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 32);
+        assert((offset & 15) == 0);
+        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                              TCGReg base, intptr_t offset)
+{
+    int shift;
+
+    switch (type) {
+    case TCG_TYPE_I32:
+        if (arg < 32) {
+            tcg_out_mem_long(s, STW, STWX, arg, base, offset);
+            break;
+        }
+        assert((offset & 3) == 0);
+        shift = (offset - 4) & 0xc;
+        if (shift) {
+            tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, shift);
+            arg = TCG_VEC_TMP1;
+        }
+        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset);
+        break;
+    case TCG_TYPE_I64:
+        if (arg < 32) {
+            tcg_out_mem_long(s, STD, STDX, arg, base, offset);
+            break;
+        }
+        /* fallthru */
+    case TCG_TYPE_V64:
+        tcg_debug_assert(arg >= 32);
+        assert((offset & 7) == 0);
+        if (offset & 8) {
+            tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, 8);
+            arg = TCG_VEC_TMP1;
+        }
+        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset);
+        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset + 4);
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(arg >= 32);
+        tcg_out_mem_long(s, 0, STVX, arg & 31, base, offset);
+        break;
+    default:
+        g_assert_not_reached();
     }
-    tcg_out_mem_long(s, opi, opx, arg, arg1, arg2);
 }
 
 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -2616,6 +2858,292 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
     }
 }
 
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{
+    switch (opc) {
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+    case INDEX_op_andc_vec:
+    case INDEX_op_not_vec:
+        return 1;
+    case INDEX_op_add_vec:
+    case INDEX_op_sub_vec:
+    case INDEX_op_smax_vec:
+    case INDEX_op_smin_vec:
+    case INDEX_op_umax_vec:
+    case INDEX_op_umin_vec:
+    case INDEX_op_ssadd_vec:
+    case INDEX_op_sssub_vec:
+    case INDEX_op_usadd_vec:
+    case INDEX_op_ussub_vec:
+        return vece <= MO_32;
+    case INDEX_op_cmp_vec:
+        return vece <= MO_32 ? -1 : 0;
+    default:
+        return 0;
+    }
+}
+
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+                            TCGReg dst, TCGReg src)
+{
+    tcg_debug_assert(dst >= 32);
+    tcg_debug_assert(src >= 32);
+
+    /*
+     * Recall we use (or emulate) VSX integer loads, so the integer is
+     * right justified within the left (zero-index) double-word.
+     */
+    switch (vece) {
+    case MO_8:
+        tcg_out32(s, VSPLTB | VRT(dst) | VRB(src) | (7 << 16));
+        break;
+    case MO_16:
+        tcg_out32(s, VSPLTH | VRT(dst) | VRB(src) | (3 << 16));
+        break;
+    case MO_32:
+        tcg_out32(s, VSPLTW | VRT(dst) | VRB(src) | (1 << 16));
+        break;
+    case MO_64:
+        tcg_out_vsldoi(s, TCG_VEC_TMP1, src, src, 8);
+        tcg_out_vsldoi(s, dst, TCG_VEC_TMP1, src, 8);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+                             TCGReg out, TCGReg base, intptr_t offset)
+{
+    int elt;
+
+    tcg_debug_assert(out >= 32);
+    out &= 31;
+    switch (vece) {
+    case MO_8:
+        tcg_out_mem_long(s, 0, LVEBX, out, base, offset);
+        elt = extract32(offset, 0, 4);
+#ifndef HOST_WORDS_BIGENDIAN
+        elt ^= 15;
+#endif
+        tcg_out32(s, VSPLTB | VRT(out) | VRB(out) | (elt << 16));
+        break;
+    case MO_16:
+        assert((offset & 1) == 0);
+        tcg_out_mem_long(s, 0, LVEHX, out, base, offset);
+        elt = extract32(offset, 1, 3);
+#ifndef HOST_WORDS_BIGENDIAN
+        elt ^= 7;
+#endif
+        tcg_out32(s, VSPLTH | VRT(out) | VRB(out) | (elt << 16));
+        break;
+    case MO_32:
+        assert((offset & 3) == 0);
+        tcg_out_mem_long(s, 0, LVEWX, out, base, offset);
+        elt = extract32(offset, 2, 2);
+#ifndef HOST_WORDS_BIGENDIAN
+        elt ^= 3;
+#endif
+        tcg_out32(s, VSPLTW | VRT(out) | VRB(out) | (elt << 16));
+        break;
+    case MO_64:
+        assert((offset & 7) == 0);
+        tcg_out_mem_long(s, 0, LVX, out, base, offset & -16);
+        tcg_out_vsldoi(s, TCG_VEC_TMP1, out, out, 8);
+        elt = extract32(offset, 3, 1);
+#ifndef HOST_WORDS_BIGENDIAN
+        elt = !elt;
+#endif
+        if (elt) {
+            tcg_out_vsldoi(s, out, out, TCG_VEC_TMP1, 8);
+        } else {
+            tcg_out_vsldoi(s, out, TCG_VEC_TMP1, out, 8);
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+                           unsigned vecl, unsigned vece,
+                           const TCGArg *args, const int *const_args)
+{
+    static const uint32_t
+        add_op[4] = { VADDUBM, VADDUHM, VADDUWM, 0 },
+        sub_op[4] = { VSUBUBM, VSUBUHM, VSUBUWM, 0 },
+        eq_op[4]  = { VCMPEQUB, VCMPEQUH, VCMPEQUW, 0 },
+        gts_op[4] = { VCMPGTSB, VCMPGTSH, VCMPGTSW, 0 },
+        gtu_op[4] = { VCMPGTUB, VCMPGTUH, VCMPGTUW, 0 },
+        ssadd_op[4] = { VADDSBS, VADDSHS, VADDSWS, 0 },
+        usadd_op[4] = { VADDUBS, VADDUHS, VADDUWS, 0 },
+        sssub_op[4] = { VSUBSBS, VSUBSHS, VSUBSWS, 0 },
+        ussub_op[4] = { VSUBUBS, VSUBUHS, VSUBUWS, 0 },
+        umin_op[4] = { VMINUB, VMINUH, VMINUW, 0 },
+        smin_op[4] = { VMINSB, VMINSH, VMINSW, 0 },
+        umax_op[4] = { VMAXUB, VMAXUH, VMAXUW, 0 },
+        smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 };
+
+    TCGType type = vecl + TCG_TYPE_V64;
+    TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
+    uint32_t insn;
+
+    switch (opc) {
+    case INDEX_op_ld_vec:
+        tcg_out_ld(s, type, a0, a1, a2);
+        return;
+    case INDEX_op_st_vec:
+        tcg_out_st(s, type, a0, a1, a2);
+        return;
+    case INDEX_op_dupm_vec:
+        tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
+        return;
+
+    case INDEX_op_add_vec:
+        insn = add_op[vece];
+        break;
+    case INDEX_op_sub_vec:
+        insn = sub_op[vece];
+        break;
+    case INDEX_op_ssadd_vec:
+        insn = ssadd_op[vece];
+        break;
+    case INDEX_op_sssub_vec:
+        insn = sssub_op[vece];
+        break;
+    case INDEX_op_usadd_vec:
+        insn = usadd_op[vece];
+        break;
+    case INDEX_op_ussub_vec:
+        insn = ussub_op[vece];
+        break;
+    case INDEX_op_smin_vec:
+        insn = smin_op[vece];
+        break;
+    case INDEX_op_umin_vec:
+        insn = umin_op[vece];
+        break;
+    case INDEX_op_smax_vec:
+        insn = smax_op[vece];
+        break;
+    case INDEX_op_umax_vec:
+        insn = umax_op[vece];
+        break;
+    case INDEX_op_and_vec:
+        insn = VAND;
+        break;
+    case INDEX_op_or_vec:
+        insn = VOR;
+        break;
+    case INDEX_op_xor_vec:
+        insn = VXOR;
+        break;
+    case INDEX_op_andc_vec:
+        insn = VANDC;
+        break;
+    case INDEX_op_not_vec:
+        insn = VNOR;
+        a2 = a1;
+        break;
+
+    case INDEX_op_cmp_vec:
+        switch (args[3]) {
+        case TCG_COND_EQ:
+            insn = eq_op[vece];
+            break;
+        case TCG_COND_GT:
+            insn = gts_op[vece];
+            break;
+        case TCG_COND_GTU:
+            insn = gtu_op[vece];
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+
+    case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
+    case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
+    default:
+        g_assert_not_reached();
+    }
+
+    tcg_debug_assert(insn != 0);
+    tcg_out32(s, insn | VRT(a0) | VRA(a1) | VRB(a2));
+}
+
+static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
+                           TCGv_vec v1, TCGv_vec v2, TCGCond cond)
+{
+    bool need_swap = false, need_inv = false;
+
+    tcg_debug_assert(vece <= MO_32);
+
+    switch (cond) {
+    case TCG_COND_EQ:
+    case TCG_COND_GT:
+    case TCG_COND_GTU:
+        break;
+    case TCG_COND_NE:
+    case TCG_COND_LE:
+    case TCG_COND_LEU:
+        need_inv = true;
+        break;
+    case TCG_COND_LT:
+    case TCG_COND_LTU:
+        need_swap = true;
+        break;
+    case TCG_COND_GE:
+    case TCG_COND_GEU:
+        need_swap = need_inv = true;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (need_inv) {
+        cond = tcg_invert_cond(cond);
+    }
+    if (need_swap) {
+        TCGv_vec t1;
+        t1 = v1, v1 = v2, v2 = t1;
+        cond = tcg_swap_cond(cond);
+    }
+
+    vec_gen_4(INDEX_op_cmp_vec, type, vece, tcgv_vec_arg(v0),
+              tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
+
+    if (need_inv) {
+        tcg_gen_not_vec(vece, v0, v0);
+    }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+                       TCGArg a0, ...)
+{
+    va_list va;
+    TCGv_vec v0, v1, v2;
+
+    va_start(va, a0);
+    v0 = temp_tcgv_vec(arg_temp(a0));
+    v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+    v2 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+
+    switch (opc) {
+    case INDEX_op_cmp_vec:
+        expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    va_end(va);
+}
+
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
     static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
@@ -2653,6 +3181,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         = { .args_ct_str = { "r", "r", "r", "r", "rI", "rZM" } };
     static const TCGTargetOpDef sub2
         = { .args_ct_str = { "r", "r", "rI", "rZM", "r", "r" } };
+    static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
+    static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
+    static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
 
     switch (op) {
     case INDEX_op_goto_ptr:
@@ -2788,6 +3319,32 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         return (TCG_TARGET_REG_BITS == 64 ? &S_S
                 : TARGET_LONG_BITS == 32 ? &S_S_S : &S_S_S_S);
 
+    case INDEX_op_add_vec:
+    case INDEX_op_sub_vec:
+    case INDEX_op_mul_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+    case INDEX_op_andc_vec:
+    case INDEX_op_orc_vec:
+    case INDEX_op_cmp_vec:
+    case INDEX_op_ssadd_vec:
+    case INDEX_op_sssub_vec:
+    case INDEX_op_usadd_vec:
+    case INDEX_op_ussub_vec:
+    case INDEX_op_smax_vec:
+    case INDEX_op_smin_vec:
+    case INDEX_op_umax_vec:
+    case INDEX_op_umin_vec:
+        return &v_v_v;
+    case INDEX_op_not_vec:
+    case INDEX_op_dup_vec:
+        return &v_v;
+    case INDEX_op_ld_vec:
+    case INDEX_op_st_vec:
+    case INDEX_op_dupm_vec:
+        return &v_r;
+
     default:
         return NULL;
     }
@@ -2798,6 +3355,9 @@ static void tcg_target_init(TCGContext *s)
     unsigned long hwcap = qemu_getauxval(AT_HWCAP);
     unsigned long hwcap2 = qemu_getauxval(AT_HWCAP2);
 
+    if (hwcap & PPC_FEATURE_HAS_ALTIVEC) {
+        have_isa_altivec = true;
+    }
     if (hwcap & PPC_FEATURE_ARCH_2_06) {
         have_isa_2_06 = true;
     }
@@ -2809,6 +3369,10 @@ static void tcg_target_init(TCGContext *s)
 
     tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffff;
     tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffff;
+    if (have_isa_altivec) {
+        tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
+        tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
+    }
 
     tcg_target_call_clobber_regs = 0;
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R0);
@@ -2824,6 +3388,27 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R12);
 
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V0);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V1);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V2);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V3);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V4);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V5);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V6);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V7);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V16);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V17);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V18);
+    tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_V19);
+
     s->reserved_regs = 0;
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0); /* tcg temp */
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R1); /* stack pointer */
@@ -2834,6 +3419,8 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_R13); /* thread pointer */
 #endif
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1); /* mem temp */
+    tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP1);
+    tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP2);
     if (USE_REG_TB) {
         tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);  /* tb->tc_ptr */
     }
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 2/7] tcg/ppc: Support vector shift by immediate
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 1/7] tcg/ppc: Initial backend support for Altivec Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply Richard Henderson
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

For Altivec, this is done by loading the immediate into a vector
register and then using the vector shift by vector instructions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |  2 +-
 tcg/ppc/tcg-target.inc.c | 58 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 368c250c6a..766706fd30 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -152,7 +152,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_abs_vec          0
 #define TCG_TARGET_HAS_shi_vec          0
 #define TCG_TARGET_HAS_shs_vec          0
-#define TCG_TARGET_HAS_shv_vec          0
+#define TCG_TARGET_HAS_shv_vec          1
 #define TCG_TARGET_HAS_cmp_vec          1
 #define TCG_TARGET_HAS_mul_vec          0
 #define TCG_TARGET_HAS_sat_vec          1
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 479e653da6..62a8c428e0 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -517,6 +517,16 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VCMPGTUH   VX4(582)
 #define VCMPGTUW   VX4(646)
 
+#define VSLB       VX4(260)
+#define VSLH       VX4(324)
+#define VSLW       VX4(388)
+#define VSRB       VX4(516)
+#define VSRH       VX4(580)
+#define VSRW       VX4(644)
+#define VSRAB      VX4(772)
+#define VSRAH      VX4(836)
+#define VSRAW      VX4(900)
+
 #define VAND       VX4(1028)
 #define VANDC      VX4(1092)
 #define VNOR       VX4(1284)
@@ -2877,8 +2887,14 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_sssub_vec:
     case INDEX_op_usadd_vec:
     case INDEX_op_ussub_vec:
+    case INDEX_op_shlv_vec:
+    case INDEX_op_shrv_vec:
+    case INDEX_op_sarv_vec:
         return vece <= MO_32;
     case INDEX_op_cmp_vec:
+    case INDEX_op_shli_vec:
+    case INDEX_op_shri_vec:
+    case INDEX_op_sari_vec:
         return vece <= MO_32 ? -1 : 0;
     default:
         return 0;
@@ -2986,7 +3002,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         umin_op[4] = { VMINUB, VMINUH, VMINUW, 0 },
         smin_op[4] = { VMINSB, VMINSH, VMINSW, 0 },
         umax_op[4] = { VMAXUB, VMAXUH, VMAXUW, 0 },
-        smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 };
+        smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 },
+        shlv_op[4] = { VSLB, VSLH, VSLW, 0 },
+        shrv_op[4] = { VSRB, VSRH, VSRW, 0 },
+        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 };
 
     TCGType type = vecl + TCG_TYPE_V64;
     TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
@@ -3033,6 +3052,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_umax_vec:
         insn = umax_op[vece];
         break;
+    case INDEX_op_shlv_vec:
+        insn = shlv_op[vece];
+        break;
+    case INDEX_op_shrv_vec:
+        insn = shrv_op[vece];
+        break;
+    case INDEX_op_sarv_vec:
+        insn = sarv_op[vece];
+        break;
     case INDEX_op_and_vec:
         insn = VAND;
         break;
@@ -3077,6 +3105,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     tcg_out32(s, insn | VRT(a0) | VRA(a1) | VRB(a2));
 }
 
+static void expand_vec_shi(TCGType type, unsigned vece, TCGv_vec v0,
+                           TCGv_vec v1, TCGArg imm, TCGOpcode opci)
+{
+    TCGv_vec t1 = tcg_temp_new_vec(type);
+
+    /* Mask the count to the element width; splat as bytes for xxspltib.  */
+    tcg_gen_dupi_vec(MO_8, t1, imm & ((8 << vece) - 1));
+    vec_gen_3(opci, type, vece, tcgv_vec_arg(v0),
+              tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+    tcg_temp_free_vec(t1);
+}
+
 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
 {
@@ -3128,14 +3168,25 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
 {
     va_list va;
     TCGv_vec v0, v1, v2;
+    TCGArg a2;
 
     va_start(va, a0);
     v0 = temp_tcgv_vec(arg_temp(a0));
     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
-    v2 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+    a2 = va_arg(va, TCGArg);
 
     switch (opc) {
+    case INDEX_op_shli_vec:
+        expand_vec_shi(type, vece, v0, v1, a2, INDEX_op_shlv_vec);
+        break;
+    case INDEX_op_shri_vec:
+        expand_vec_shi(type, vece, v0, v1, a2, INDEX_op_shrv_vec);
+        break;
+    case INDEX_op_sari_vec:
+        expand_vec_shi(type, vece, v0, v1, a2, INDEX_op_sarv_vec);
+        break;
     case INDEX_op_cmp_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
         break;
     default:
@@ -3336,6 +3387,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_smin_vec:
     case INDEX_op_umax_vec:
     case INDEX_op_umin_vec:
+    case INDEX_op_shlv_vec:
+    case INDEX_op_shrv_vec:
+    case INDEX_op_sarv_vec:
         return &v_v_v;
     case INDEX_op_not_vec:
     case INDEX_op_dup_vec:
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 1/7] tcg/ppc: Initial backend support for Altivec Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 2/7] tcg/ppc: Support vector shift by immediate Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  5:05   ` Aleksandar Markovic
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 4/7] tcg/ppc: Support vector dup2 Richard Henderson
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

For Altivec, this is always an expansion.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |   2 +-
 tcg/ppc/tcg-target.opc.h |   8 +++
 tcg/ppc/tcg-target.inc.c | 112 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 766706fd30..a130192cbd 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -154,7 +154,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_shs_vec          0
 #define TCG_TARGET_HAS_shv_vec          1
 #define TCG_TARGET_HAS_cmp_vec          1
-#define TCG_TARGET_HAS_mul_vec          0
+#define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
 #define TCG_TARGET_HAS_bitsel_vec       0
diff --git a/tcg/ppc/tcg-target.opc.h b/tcg/ppc/tcg-target.opc.h
index 4816a6c3d4..5c6a5ad52c 100644
--- a/tcg/ppc/tcg-target.opc.h
+++ b/tcg/ppc/tcg-target.opc.h
@@ -1,3 +1,11 @@
 /* Target-specific opcodes for host vector expansion.  These will be
    emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
    consider these to be UNSPEC with names.  */
+
+DEF(ppc_mrgh_vec, 1, 2, 0, IMPLVEC)
+DEF(ppc_mrgl_vec, 1, 2, 0, IMPLVEC)
+DEF(ppc_msum_vec, 1, 3, 0, IMPLVEC)
+DEF(ppc_muleu_vec, 1, 2, 0, IMPLVEC)
+DEF(ppc_mulou_vec, 1, 2, 0, IMPLVEC)
+DEF(ppc_pkum_vec, 1, 2, 0, IMPLVEC)
+DEF(ppc_rotl_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 62a8c428e0..9d58db9eb1 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -526,6 +526,25 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VSRAB      VX4(772)
 #define VSRAH      VX4(836)
 #define VSRAW      VX4(900)
+#define VRLB       VX4(4)
+#define VRLH       VX4(68)
+#define VRLW       VX4(132)
+
+#define VMULEUB    VX4(520)
+#define VMULEUH    VX4(584)
+#define VMULOUB    VX4(8)
+#define VMULOUH    VX4(72)
+#define VMSUMUHM   VX4(38)
+
+#define VMRGHB     VX4(12)
+#define VMRGHH     VX4(76)
+#define VMRGHW     VX4(140)
+#define VMRGLB     VX4(268)
+#define VMRGLH     VX4(332)
+#define VMRGLW     VX4(396)
+
+#define VPKUHUM    VX4(14)
+#define VPKUWUM    VX4(78)
 
 #define VAND       VX4(1028)
 #define VANDC      VX4(1092)
@@ -2892,6 +2911,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_sarv_vec:
         return vece <= MO_32;
     case INDEX_op_cmp_vec:
+    case INDEX_op_mul_vec:
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
@@ -3005,7 +3025,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 },
         shlv_op[4] = { VSLB, VSLH, VSLW, 0 },
         shrv_op[4] = { VSRB, VSRH, VSRW, 0 },
-        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 };
+        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 },
+        mrgh_op[4] = { VMRGHB, VMRGHH, VMRGHW, 0 },
+        mrgl_op[4] = { VMRGLB, VMRGLH, VMRGLW, 0 },
+        muleu_op[4] = { VMULEUB, VMULEUH, 0, 0 },
+        mulou_op[4] = { VMULOUB, VMULOUH, 0, 0 },
+        pkum_op[4] = { VPKUHUM, VPKUWUM, 0, 0 },
+        rotl_op[4] = { VRLB, VRLH, VRLW, 0 };
 
     TCGType type = vecl + TCG_TYPE_V64;
     TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
@@ -3094,6 +3120,29 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_ppc_mrgh_vec:
+        insn = mrgh_op[vece];
+        break;
+    case INDEX_op_ppc_mrgl_vec:
+        insn = mrgl_op[vece];
+        break;
+    case INDEX_op_ppc_muleu_vec:
+        insn = muleu_op[vece];
+        break;
+    case INDEX_op_ppc_mulou_vec:
+        insn = mulou_op[vece];
+        break;
+    case INDEX_op_ppc_pkum_vec:
+        insn = pkum_op[vece];
+        break;
+    case INDEX_op_ppc_rotl_vec:
+        insn = rotl_op[vece];
+        break;
+    case INDEX_op_ppc_msum_vec:
+        tcg_debug_assert(vece == MO_16);
+        tcg_out32(s, VMSUMUHM | VRT(a0) | VRA(a1) | VRB(a2) | VRC(args[3]));
+        return;
+
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
@@ -3163,6 +3212,53 @@ static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
     }
 }
 
+static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
+                           TCGv_vec v1, TCGv_vec v2)
+{
+    TCGv_vec t1 = tcg_temp_new_vec(type);
+    TCGv_vec t2 = tcg_temp_new_vec(type);
+    TCGv_vec t3, t4;    /* allocated and freed only in the MO_32 case */
+
+    switch (vece) {
+    case MO_8:
+    case MO_16:         /* muleu + mulou, merge at vece + 1, pkum at vece */
+        vec_gen_3(INDEX_op_ppc_muleu_vec, type, vece, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        vec_gen_3(INDEX_op_ppc_mulou_vec, type, vece, tcgv_vec_arg(t2),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        vec_gen_3(INDEX_op_ppc_mrgh_vec, type, vece + 1, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+        vec_gen_3(INDEX_op_ppc_mrgl_vec, type, vece + 1, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+        vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0),
+                  tcgv_vec_arg(v0), tcgv_vec_arg(t1));
+        break;
+
+    case MO_32:         /* v0 = mulou16(v1, v2) + (msum(v1, rotl(v2)) << t4) */
+        t3 = tcg_temp_new_vec(type);
+        t4 = tcg_temp_new_vec(type);
+        tcg_gen_dupi_vec(MO_8, t4, -16);    /* count for rotl and shlv */
+        vec_gen_3(INDEX_op_ppc_rotl_vec, type, MO_32, tcgv_vec_arg(t1),
+                  tcgv_vec_arg(v2), tcgv_vec_arg(t4));
+        vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
+        tcg_gen_dupi_vec(MO_8, t3, 0);      /* zero addend for msum */
+        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
+                  tcgv_vec_arg(t3), tcgv_vec_arg(t4));
+        tcg_gen_add_vec(MO_32, v0, t2, t3);
+        tcg_temp_free_vec(t3);
+        tcg_temp_free_vec(t4);
+        break;
+
+    default:            /* other element sizes are never expanded here */
+        g_assert_not_reached();
+    }
+    tcg_temp_free_vec(t1);
+    tcg_temp_free_vec(t2);
+}
+
 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
                        TCGArg a0, ...)
 {
@@ -3189,6 +3285,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
         v2 = temp_tcgv_vec(arg_temp(a2));
         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
         break;
+    case INDEX_op_mul_vec:
+        v2 = temp_tcgv_vec(arg_temp(a2));
+        expand_vec_mul(type, vece, v0, v1, v2);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -3235,6 +3335,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
     static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
     static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
+    static const TCGTargetOpDef v_v_v_v
+        = { .args_ct_str = { "v", "v", "v", "v" } };
 
     switch (op) {
     case INDEX_op_goto_ptr:
@@ -3390,6 +3492,12 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shlv_vec:
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
+    case INDEX_op_ppc_mrgh_vec:
+    case INDEX_op_ppc_mrgl_vec:
+    case INDEX_op_ppc_muleu_vec:
+    case INDEX_op_ppc_mulou_vec:
+    case INDEX_op_ppc_pkum_vec:
+    case INDEX_op_ppc_rotl_vec:
         return &v_v_v;
     case INDEX_op_not_vec:
     case INDEX_op_dup_vec:
@@ -3398,6 +3506,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
         return &v_r;
+    case INDEX_op_ppc_msum_vec:
+        return &v_v_v_v;
 
     default:
         return NULL;
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 4/7] tcg/ppc: Support vector dup2
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
                   ` (2 preceding siblings ...)
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 5/7] tcg/ppc: Update vector support to v2.06 Richard Henderson
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

This is only used for 32-bit hosts.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.inc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 9d58db9eb1..3219df2e90 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -3120,6 +3120,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_dup2_vec:
+        assert(TCG_TARGET_REG_BITS == 32);
+        /* With inputs a1 = xLxx, a2 = xHxx  */
+        tcg_out32(s, VMRGHW | VRT(a0) | VRA(a2) | VRB(a1));  /* a0  = xxHL */
+        tcg_out_vsldoi(s, TCG_VEC_TMP1, a0, a0, 8);          /* tmp = HLxx */
+        tcg_out_vsldoi(s, a0, a0, TCG_VEC_TMP1, 8);          /* a0  = HLHL */
+        return;
+
     case INDEX_op_ppc_mrgh_vec:
         insn = mrgh_op[vece];
         break;
@@ -3498,6 +3506,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_ppc_mulou_vec:
     case INDEX_op_ppc_pkum_vec:
     case INDEX_op_ppc_rotl_vec:
+    case INDEX_op_dup2_vec:
         return &v_v_v;
     case INDEX_op_not_vec:
     case INDEX_op_dup_vec:
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 5/7] tcg/ppc: Update vector support to v2.06
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
                   ` (3 preceding siblings ...)
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 4/7] tcg/ppc: Support vector dup2 Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 6/7] tcg/ppc: Update vector support to v2.07 Richard Henderson
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

This includes double-word loads and stores, double-word load and splat,
double-word permute, and bit select, all of which require multiple
operations in the base Altivec instruction set.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |  5 ++--
 tcg/ppc/tcg-target.inc.c | 51 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index a130192cbd..40544f996d 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -60,6 +60,7 @@ typedef enum {
 
 extern bool have_isa_altivec;
 extern bool have_isa_2_06;
+extern bool have_isa_2_06_vsx;
 extern bool have_isa_3_00;
 
 /* optional instructions automatically implemented */
@@ -141,7 +142,7 @@ extern bool have_isa_3_00;
  * instruction and substituting two 32-bit stores makes the generated
  * code quite large.
  */
-#define TCG_TARGET_HAS_v64              0
+#define TCG_TARGET_HAS_v64              have_isa_2_06_vsx
 #define TCG_TARGET_HAS_v128             have_isa_altivec
 #define TCG_TARGET_HAS_v256             0
 
@@ -157,7 +158,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
-#define TCG_TARGET_HAS_bitsel_vec       0
+#define TCG_TARGET_HAS_bitsel_vec       have_isa_2_06_vsx
 #define TCG_TARGET_HAS_cmpsel_vec       0
 
 void flush_icache_range(uintptr_t start, uintptr_t stop);
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 3219df2e90..6cb8c8f0eb 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -66,6 +66,7 @@ static tcg_insn_unit *tb_ret_addr;
 
 bool have_isa_altivec;
 bool have_isa_2_06;
+bool have_isa_2_06_vsx;
 bool have_isa_3_00;
 
 #define HAVE_ISA_2_06  have_isa_2_06
@@ -470,9 +471,12 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define LVEBX      XO31(7)
 #define LVEHX      XO31(39)
 #define LVEWX      XO31(71)
+#define LXSDX      XO31(588)      /* v2.06 */
+#define LXVDSX     XO31(332)      /* v2.06 */
 
 #define STVX       XO31(231)
 #define STVEWX     XO31(199)
+#define STXSDX     XO31(716)      /* v2.06 */
 
 #define VADDSBS    VX4(768)
 #define VADDUBS    VX4(512)
@@ -561,6 +565,9 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 
 #define VSLDOI     VX4(44)
 
+#define XXPERMDI   (OPCD(60) | (10 << 3))   /* v2.06 */
+#define XXSEL      (OPCD(60) | (3 << 4))    /* v2.06 */
+
 #define RT(r) ((r)<<21)
 #define RS(r) ((r)<<21)
 #define RA(r) ((r)<<16)
@@ -887,11 +894,21 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
         add = 0;
     }
 
-    load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
-    if (TCG_TARGET_REG_BITS == 64) {
-        new_pool_l2(s, rel, s->code_ptr, add, val, val);
+    if (have_isa_2_06_vsx) {
+        load_insn = type == TCG_TYPE_V64 ? LXSDX : LXVDSX;
+        load_insn |= VRT(ret) | RB(TCG_REG_TMP1) | 1;
+        if (TCG_TARGET_REG_BITS == 64) {
+            new_pool_label(s, val, rel, s->code_ptr, add);
+        } else {
+            new_pool_l2(s, rel, s->code_ptr, add, val, val);
+        }
     } else {
-        new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+        load_insn = LVX | VRT(ret) | RB(TCG_REG_TMP1);
+        if (TCG_TARGET_REG_BITS == 64) {
+            new_pool_l2(s, rel, s->code_ptr, add, val, val);
+        } else {
+            new_pool_l4(s, rel, s->code_ptr, add, val, val, val, val);
+        }
     }
 
     if (USE_REG_TB) {
@@ -1138,6 +1155,10 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
         /* fallthru */
     case TCG_TYPE_V64:
         tcg_debug_assert(ret >= 32);
+        if (have_isa_2_06_vsx) {
+            tcg_out_mem_long(s, 0, LXSDX | 1, ret & 31, base, offset);
+            break;
+        }
         assert((offset & 7) == 0);
         tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset & -16);
         if (offset & 8) {
@@ -1181,6 +1202,10 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
         /* fallthru */
     case TCG_TYPE_V64:
         tcg_debug_assert(arg >= 32);
+        if (have_isa_2_06_vsx) {
+            tcg_out_mem_long(s, 0, STXSDX | 1, arg & 31, base, offset);
+            break;
+        }
         assert((offset & 7) == 0);
         if (offset & 8) {
             tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, 8);
@@ -2916,6 +2941,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
         return vece <= MO_32 ? -1 : 0;
+    case INDEX_op_bitsel_vec:
+        return have_isa_2_06_vsx;
     default:
         return 0;
     }
@@ -2942,6 +2969,10 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
         tcg_out32(s, VSPLTW | VRT(dst) | VRB(src) | (1 << 16));
         break;
     case MO_64:
+        if (have_isa_2_06_vsx) {
+            tcg_out32(s, XXPERMDI | 7 | VRT(dst) | VRA(src) | VRB(src));
+            break;
+        }
         tcg_out_vsldoi(s, TCG_VEC_TMP1, src, src, 8);
         tcg_out_vsldoi(s, dst, TCG_VEC_TMP1, src, 8);
         break;
@@ -2986,6 +3017,10 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
         tcg_out32(s, VSPLTW | VRT(out) | VRB(out) | (elt << 16));
         break;
     case MO_64:
+        if (have_isa_2_06_vsx) {
+            tcg_out_mem_long(s, 0, LXVDSX | 1, out, base, offset);
+            break;
+        }
         assert((offset & 7) == 0);
         tcg_out_mem_long(s, 0, LVX, out, base, offset & -16);
         tcg_out_vsldoi(s, TCG_VEC_TMP1, out, out, 8);
@@ -3120,6 +3155,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_bitsel_vec:
+        tcg_out32(s, XXSEL | 0xf | VRT(a0) | VRC(a1) | VRB(a2) | VRA(args[3]));
+        return;
+
     case INDEX_op_dup2_vec:
         assert(TCG_TARGET_REG_BITS == 32);
         /* With inputs a1 = xLxx, a2 = xHxx  */
@@ -3515,6 +3554,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
         return &v_r;
+    case INDEX_op_bitsel_vec:
     case INDEX_op_ppc_msum_vec:
         return &v_v_v_v;
 
@@ -3533,6 +3573,9 @@ static void tcg_target_init(TCGContext *s)
     }
     if (hwcap & PPC_FEATURE_ARCH_2_06) {
         have_isa_2_06 = true;
+        if (hwcap & PPC_FEATURE_HAS_VSX) {
+            have_isa_2_06_vsx = true;
+        }
     }
 #ifdef PPC_FEATURE2_ARCH_3_00
     if (hwcap2 & PPC_FEATURE2_ARCH_3_00) {
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 6/7] tcg/ppc: Update vector support to v2.07
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
                   ` (4 preceding siblings ...)
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 5/7] tcg/ppc: Update vector support to v2.06 Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 7/7] tcg/ppc: Update vector support to v3.00 Richard Henderson
  2019-06-18  5:00 ` [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

This includes single-word loads and stores, lots of double-word
arithmetic, and a few extra logical operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |   3 +-
 tcg/ppc/tcg-target.inc.c | 111 +++++++++++++++++++++++++++++++--------
 2 files changed, 91 insertions(+), 23 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 40544f996d..b8355d0a56 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -61,6 +61,7 @@ typedef enum {
 extern bool have_isa_altivec;
 extern bool have_isa_2_06;
 extern bool have_isa_2_06_vsx;
+extern bool have_isa_2_07_vsx;
 extern bool have_isa_3_00;
 
 /* optional instructions automatically implemented */
@@ -147,7 +148,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_v256             0
 
 #define TCG_TARGET_HAS_andc_vec         1
-#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_orc_vec          have_isa_2_07_vsx
 #define TCG_TARGET_HAS_not_vec          1
 #define TCG_TARGET_HAS_neg_vec          0
 #define TCG_TARGET_HAS_abs_vec          0
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 6cb8c8f0eb..dedf0de04d 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -67,6 +67,7 @@ static tcg_insn_unit *tb_ret_addr;
 bool have_isa_altivec;
 bool have_isa_2_06;
 bool have_isa_2_06_vsx;
+bool have_isa_2_07_vsx;
 bool have_isa_3_00;
 
 #define HAVE_ISA_2_06  have_isa_2_06
@@ -473,10 +474,12 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define LVEWX      XO31(71)
 #define LXSDX      XO31(588)      /* v2.06 */
 #define LXVDSX     XO31(332)      /* v2.06 */
+#define LXSIWZX    XO31(12)       /* v2.07 */
 
 #define STVX       XO31(231)
 #define STVEWX     XO31(199)
 #define STXSDX     XO31(716)      /* v2.06 */
+#define STXSIWX    XO31(140)      /* v2.07 */
 
 #define VADDSBS    VX4(768)
 #define VADDUBS    VX4(512)
@@ -487,6 +490,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VADDSWS    VX4(896)
 #define VADDUWS    VX4(640)
 #define VADDUWM    VX4(128)
+#define VADDUDM    VX4(192)       /* v2.07 */
 
 #define VSUBSBS    VX4(1792)
 #define VSUBUBS    VX4(1536)
@@ -497,47 +501,62 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VSUBSWS    VX4(1920)
 #define VSUBUWS    VX4(1664)
 #define VSUBUWM    VX4(1152)
+#define VSUBUDM    VX4(1216)      /* v2.07 */
 
 #define VMAXSB     VX4(258)
 #define VMAXSH     VX4(322)
 #define VMAXSW     VX4(386)
+#define VMAXSD     VX4(450)       /* v2.07 */
 #define VMAXUB     VX4(2)
 #define VMAXUH     VX4(66)
 #define VMAXUW     VX4(130)
+#define VMAXUD     VX4(194)       /* v2.07 */
 #define VMINSB     VX4(770)
 #define VMINSH     VX4(834)
 #define VMINSW     VX4(898)
+#define VMINSD     VX4(962)       /* v2.07 */
 #define VMINUB     VX4(514)
 #define VMINUH     VX4(578)
 #define VMINUW     VX4(642)
+#define VMINUD     VX4(706)       /* v2.07 */
 
 #define VCMPEQUB   VX4(6)
 #define VCMPEQUH   VX4(70)
 #define VCMPEQUW   VX4(134)
+#define VCMPEQUD   VX4(199)       /* v2.07 */
 #define VCMPGTSB   VX4(774)
 #define VCMPGTSH   VX4(838)
 #define VCMPGTSW   VX4(902)
+#define VCMPGTSD   VX4(967)       /* v2.07 */
 #define VCMPGTUB   VX4(518)
 #define VCMPGTUH   VX4(582)
 #define VCMPGTUW   VX4(646)
+#define VCMPGTUD   VX4(711)       /* v2.07 */
 
 #define VSLB       VX4(260)
 #define VSLH       VX4(324)
 #define VSLW       VX4(388)
+#define VSLD       VX4(1476)      /* v2.07 */
 #define VSRB       VX4(516)
 #define VSRH       VX4(580)
 #define VSRW       VX4(644)
+#define VSRD       VX4(1732)      /* v2.07 */
 #define VSRAB      VX4(772)
 #define VSRAH      VX4(836)
 #define VSRAW      VX4(900)
+#define VSRAD      VX4(964)       /* v2.07 */
 #define VRLB       VX4(4)
 #define VRLH       VX4(68)
 #define VRLW       VX4(132)
+#define VRLD       VX4(196)       /* v2.07 */
 
 #define VMULEUB    VX4(520)
 #define VMULEUH    VX4(584)
+#define VMULEUW    VX4(648)       /* v2.07 */
 #define VMULOUB    VX4(8)
 #define VMULOUH    VX4(72)
+#define VMULOUW    VX4(136)       /* v2.07 */
+#define VMULUWM    VX4(137)       /* v2.07 */
 #define VMSUMUHM   VX4(38)
 
 #define VMRGHB     VX4(12)
@@ -555,6 +574,9 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VNOR       VX4(1284)
 #define VOR        VX4(1156)
 #define VXOR       VX4(1220)
+#define VEQV       VX4(1668)      /* v2.07 */
+#define VNAND      VX4(1412)      /* v2.07 */
+#define VORC       VX4(1348)      /* v2.07 */
 
 #define VSPLTB     VX4(524)
 #define VSPLTH     VX4(588)
@@ -568,6 +590,11 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define XXPERMDI   (OPCD(60) | (10 << 3))   /* v2.06 */
 #define XXSEL      (OPCD(60) | (3 << 4))    /* v2.06 */
 
+#define MFVSRD     XO31(51)       /* v2.07 */
+#define MFVSRWZ    XO31(115)      /* v2.07 */
+#define MTVSRD     XO31(179)      /* v2.07 */
+#define MTVSRWZ    XO31(243)      /* v2.07; 179 is mtvsrd, 211 is mtvsrwa */
+
 #define RT(r) ((r)<<21)
 #define RS(r) ((r)<<21)
 #define RA(r) ((r)<<16)
@@ -700,7 +727,15 @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
         if (ret < 32 && arg < 32) {
             tcg_out32(s, OR | SAB(arg, ret, arg));
             break;
-        } else if (ret < 32 || arg < 32) {
+        } else if (ret < 32 && have_isa_2_07_vsx) {
+            tcg_out32(s, (type == TCG_TYPE_I32 ? MFVSRWZ : MFVSRD)
+                      | VRT(arg) | RA(ret) | 1);
+            break;
+        } else if (arg < 32 && have_isa_2_07_vsx) {
+            tcg_out32(s, (type == TCG_TYPE_I32 ? MTVSRWZ : MTVSRD)
+                      | VRT(ret) | RA(arg) | 1);
+            break;
+        } else {
             /* Altivec does not support vector/integer moves.  */
             return false;
         }
@@ -1140,6 +1175,10 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
             tcg_out_mem_long(s, LWZ, LWZX, ret, base, offset);
             break;
         }
+        if (have_isa_2_07_vsx) {
+            tcg_out_mem_long(s, 0, LXSIWZX | 1, ret & 31, base, offset);
+            break;
+        }
         assert((offset & 3) == 0);
         tcg_out_mem_long(s, 0, LVEWX, ret & 31, base, offset);
         shift = (offset - 4) & 0xc;
@@ -1186,6 +1225,10 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
             tcg_out_mem_long(s, STW, STWX, arg, base, offset);
             break;
         }
+        if (have_isa_2_07_vsx) {
+            tcg_out_mem_long(s, 0, STXSIWX | 1, arg & 31, base, offset);
+            break;
+        }
         assert((offset & 3) == 0);
         shift = (offset - 4) & 0xc;
         if (shift) {
@@ -2921,26 +2964,37 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_andc_vec:
     case INDEX_op_not_vec:
         return 1;
+    case INDEX_op_orc_vec:
+        return have_isa_2_07_vsx;
     case INDEX_op_add_vec:
     case INDEX_op_sub_vec:
     case INDEX_op_smax_vec:
     case INDEX_op_smin_vec:
     case INDEX_op_umax_vec:
     case INDEX_op_umin_vec:
+    case INDEX_op_shlv_vec:
+    case INDEX_op_shrv_vec:
+    case INDEX_op_sarv_vec:
+        return vece <= MO_32 || have_isa_2_07_vsx;
     case INDEX_op_ssadd_vec:
     case INDEX_op_sssub_vec:
     case INDEX_op_usadd_vec:
     case INDEX_op_ussub_vec:
-    case INDEX_op_shlv_vec:
-    case INDEX_op_shrv_vec:
-    case INDEX_op_sarv_vec:
         return vece <= MO_32;
     case INDEX_op_cmp_vec:
-    case INDEX_op_mul_vec:
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
-        return vece <= MO_32 ? -1 : 0;
+        return vece <= MO_32 || have_isa_2_07_vsx ? -1 : 0;
+    case INDEX_op_mul_vec:
+        switch (vece) {
+        case MO_8:
+        case MO_16:
+            return -1;
+        case MO_32:
+            return have_isa_2_07_vsx ? 1 : -1;
+        }
+        return 0;
     case INDEX_op_bitsel_vec:
         return have_isa_2_06_vsx;
     default:
@@ -3045,28 +3099,28 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                            const TCGArg *args, const int *const_args)
 {
     static const uint32_t
-        add_op[4] = { VADDUBM, VADDUHM, VADDUWM, 0 },
-        sub_op[4] = { VSUBUBM, VSUBUHM, VSUBUWM, 0 },
-        eq_op[4]  = { VCMPEQUB, VCMPEQUH, VCMPEQUW, 0 },
-        gts_op[4] = { VCMPGTSB, VCMPGTSH, VCMPGTSW, 0 },
-        gtu_op[4] = { VCMPGTUB, VCMPGTUH, VCMPGTUW, 0 },
+        add_op[4] = { VADDUBM, VADDUHM, VADDUWM, VADDUDM },
+        sub_op[4] = { VSUBUBM, VSUBUHM, VSUBUWM, VSUBUDM },
+        eq_op[4]  = { VCMPEQUB, VCMPEQUH, VCMPEQUW, VCMPEQUD },
+        gts_op[4] = { VCMPGTSB, VCMPGTSH, VCMPGTSW, VCMPGTSD },
+        gtu_op[4] = { VCMPGTUB, VCMPGTUH, VCMPGTUW, VCMPGTUD },
         ssadd_op[4] = { VADDSBS, VADDSHS, VADDSWS, 0 },
         usadd_op[4] = { VADDUBS, VADDUHS, VADDUWS, 0 },
         sssub_op[4] = { VSUBSBS, VSUBSHS, VSUBSWS, 0 },
         ussub_op[4] = { VSUBUBS, VSUBUHS, VSUBUWS, 0 },
-        umin_op[4] = { VMINUB, VMINUH, VMINUW, 0 },
-        smin_op[4] = { VMINSB, VMINSH, VMINSW, 0 },
-        umax_op[4] = { VMAXUB, VMAXUH, VMAXUW, 0 },
-        smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 },
-        shlv_op[4] = { VSLB, VSLH, VSLW, 0 },
-        shrv_op[4] = { VSRB, VSRH, VSRW, 0 },
-        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 },
+        umin_op[4] = { VMINUB, VMINUH, VMINUW, VMINUD },
+        smin_op[4] = { VMINSB, VMINSH, VMINSW, VMINSD },
+        umax_op[4] = { VMAXUB, VMAXUH, VMAXUW, VMAXUD },
+        smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, VMAXSD },
+        shlv_op[4] = { VSLB, VSLH, VSLW, VSLD },
+        shrv_op[4] = { VSRB, VSRH, VSRW, VSRD },
+        sarv_op[4] = { VSRAB, VSRAH, VSRAW, VSRAD },
         mrgh_op[4] = { VMRGHB, VMRGHH, VMRGHW, 0 },
         mrgl_op[4] = { VMRGLB, VMRGLH, VMRGLW, 0 },
-        muleu_op[4] = { VMULEUB, VMULEUH, 0, 0 },
-        mulou_op[4] = { VMULOUB, VMULOUH, 0, 0 },
+        muleu_op[4] = { VMULEUB, VMULEUH, VMULEUW, 0 },
+        mulou_op[4] = { VMULOUB, VMULOUH, VMULOUW, 0 },
         pkum_op[4] = { VPKUHUM, VPKUWUM, 0, 0 },
-        rotl_op[4] = { VRLB, VRLH, VRLW, 0 };
+        rotl_op[4] = { VRLB, VRLH, VRLW, VRLD };
 
     TCGType type = vecl + TCG_TYPE_V64;
     TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
@@ -3089,6 +3143,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_sub_vec:
         insn = sub_op[vece];
         break;
+    case INDEX_op_mul_vec:
+        tcg_debug_assert(vece == MO_32 && have_isa_2_07_vsx);
+        insn = VMULUWM;
+        break;
     case INDEX_op_ssadd_vec:
         insn = ssadd_op[vece];
         break;
@@ -3138,6 +3196,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         insn = VNOR;
         a2 = a1;
         break;
+    case INDEX_op_orc_vec:
+        insn = VORC;
+        break;
 
     case INDEX_op_cmp_vec:
         switch (args[3]) {
@@ -3218,7 +3279,7 @@ static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
 {
     bool need_swap = false, need_inv = false;
 
-    tcg_debug_assert(vece <= MO_32);
+    tcg_debug_assert(vece <= MO_32 || have_isa_2_07_vsx);
 
     switch (cond) {
     case TCG_COND_EQ:
@@ -3282,6 +3343,7 @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
 	break;
 
     case MO_32:
+        tcg_debug_assert(!have_isa_2_07_vsx);
         t3 = tcg_temp_new_vec(type);
         t4 = tcg_temp_new_vec(type);
         tcg_gen_dupi_vec(MO_8, t4, -16);
@@ -3577,6 +3639,11 @@ static void tcg_target_init(TCGContext *s)
             have_isa_2_06_vsx = true;
         }
     }
+    if (hwcap2 & PPC_FEATURE2_ARCH_2_07) {
+        if (hwcap & PPC_FEATURE_HAS_VSX) {
+            have_isa_2_07_vsx = true;
+        }
+    }
 #ifdef PPC_FEATURE2_ARCH_3_00
     if (hwcap2 & PPC_FEATURE2_ARCH_3_00) {
         have_isa_3_00 = true;
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [Qemu-devel] [PATCH v4 7/7] tcg/ppc: Update vector support to v3.00
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
                   ` (5 preceding siblings ...)
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 6/7] tcg/ppc: Update vector support to v2.07 Richard Henderson
@ 2019-05-19  4:15 ` Richard Henderson
  2019-06-18  5:00 ` [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
  7 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19  4:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland

This includes vector load/store with immediate offset, some extra
move and splat insns, compare ne, and negate.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |   3 +-
 tcg/ppc/tcg-target.inc.c | 103 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index b8355d0a56..533f0ef510 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -63,6 +63,7 @@ extern bool have_isa_2_06;
 extern bool have_isa_2_06_vsx;
 extern bool have_isa_2_07_vsx;
 extern bool have_isa_3_00;
+extern bool have_isa_3_00_vsx;
 
 /* optional instructions automatically implemented */
 #define TCG_TARGET_HAS_ext8u_i32        0 /* andi */
@@ -150,7 +151,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          have_isa_2_07_vsx
 #define TCG_TARGET_HAS_not_vec          1
-#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_neg_vec          have_isa_3_00_vsx
 #define TCG_TARGET_HAS_abs_vec          0
 #define TCG_TARGET_HAS_shi_vec          0
 #define TCG_TARGET_HAS_shs_vec          0
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index dedf0de04d..4ee77df178 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -69,6 +69,7 @@ bool have_isa_2_06;
 bool have_isa_2_06_vsx;
 bool have_isa_2_07_vsx;
 bool have_isa_3_00;
+bool have_isa_3_00_vsx;
 
 #define HAVE_ISA_2_06  have_isa_2_06
 #define HAVE_ISEL      have_isa_2_06
@@ -475,11 +476,16 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define LXSDX      XO31(588)      /* v2.06 */
 #define LXVDSX     XO31(332)      /* v2.06 */
 #define LXSIWZX    XO31(12)       /* v2.07 */
+#define LXV        (OPCD(61) | 1) /* v3.00 */
+#define LXSD       (OPCD(57) | 2) /* v3.00 */
+#define LXVWSX     XO31(364)      /* v3.00 */
 
 #define STVX       XO31(231)
 #define STVEWX     XO31(199)
 #define STXSDX     XO31(716)      /* v2.06 */
 #define STXSIWX    XO31(140)      /* v2.07 */
+#define STXV       (OPCD(61) | 5) /* v3.00 */
+#define STXSD      (OPCD(61) | 2) /* v3.00 */
 
 #define VADDSBS    VX4(768)
 #define VADDUBS    VX4(512)
@@ -503,6 +509,9 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VSUBUWM    VX4(1152)
 #define VSUBUDM    VX4(1216)      /* v2.07 */
 
+#define VNEGW      (VX4(1538) | (6 << 16))  /* v3.00 */
+#define VNEGD      (VX4(1538) | (7 << 16))  /* v3.00 */
+
 #define VMAXSB     VX4(258)
 #define VMAXSH     VX4(322)
 #define VMAXSW     VX4(386)
@@ -532,6 +541,9 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define VCMPGTUH   VX4(582)
 #define VCMPGTUW   VX4(646)
 #define VCMPGTUD   VX4(711)       /* v2.07 */
+#define VCMPNEB    VX4(7)         /* v3.00 */
+#define VCMPNEH    VX4(71)        /* v3.00 */
+#define VCMPNEW    VX4(135)       /* v3.00 */
 
 #define VSLB       VX4(260)
 #define VSLH       VX4(324)
@@ -589,11 +601,14 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 
 #define XXPERMDI   (OPCD(60) | (10 << 3))   /* v2.06 */
 #define XXSEL      (OPCD(60) | (3 << 4))    /* v2.06 */
+#define XXSPLTIB   (OPCD(60) | (360 << 1))  /* v3.00 */
 
 #define MFVSRD     XO31(51)       /* v2.07 */
 #define MFVSRWZ    XO31(115)      /* v2.07 */
 #define MTVSRD     XO31(179)      /* v2.07 */
 #define MTVSRWZ    XO31(179)      /* v2.07 */
+#define MTVSRDD    XO31(435)      /* v3.00 */
+#define MTVSRWS    XO31(403)      /* v3.00 */
 
 #define RT(r) ((r)<<21)
 #define RS(r) ((r)<<21)
@@ -917,6 +932,10 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, TCGReg ret,
             return;
         }
     }
+    if (have_isa_3_00_vsx && val == (tcg_target_long)dup_const(MO_8, val)) {
+        tcg_out32(s, XXSPLTIB | VRT(ret) | ((val & 0xff) << 11) | 1);
+        return;
+    }
 
     /*
      * Otherwise we must load the value from the constant pool.
@@ -1105,7 +1124,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
                              TCGReg base, tcg_target_long offset)
 {
     tcg_target_long orig = offset, l0, l1, extra = 0, align = 0;
-    bool is_store = false;
+    bool is_int_store = false;
     TCGReg rs = TCG_REG_TMP1;
 
     switch (opi) {
@@ -1118,11 +1137,20 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
             break;
         }
         break;
+    case LXSD:
+    case STXSD:
+        align = 3;
+        break;
+    case LXV: case LXV | 8:
+    case STXV: case STXV | 8:
+        /* The |8 cases force altivec registers.  */
+        align = 15;
+        break;
     case STD:
         align = 3;
         /* FALLTHRU */
     case STB: case STH: case STW:
-        is_store = true;
+        is_int_store = true;
         break;
     }
 
@@ -1131,7 +1159,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
         if (rs == base) {
             rs = TCG_REG_R0;
         }
-        tcg_debug_assert(!is_store || rs != rt);
+        tcg_debug_assert(!is_int_store || rs != rt);
         tcg_out_movi(s, TCG_TYPE_PTR, rs, orig);
         tcg_out32(s, opx | TAB(rt, base, rs));
         return;
@@ -1195,7 +1223,8 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
     case TCG_TYPE_V64:
         tcg_debug_assert(ret >= 32);
         if (have_isa_2_06_vsx) {
-            tcg_out_mem_long(s, 0, LXSDX | 1, ret & 31, base, offset);
+            tcg_out_mem_long(s, have_isa_3_00_vsx ? LXSD : 0, LXSDX | 1,
+                             ret & 31, base, offset);
             break;
         }
         assert((offset & 7) == 0);
@@ -1207,7 +1236,8 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
     case TCG_TYPE_V128:
         tcg_debug_assert(ret >= 32);
         assert((offset & 15) == 0);
-        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset);
+        tcg_out_mem_long(s, have_isa_3_00_vsx ? LXV | 8 : 0, LVX,
+                         ret & 31, base, offset);
         break;
     default:
         g_assert_not_reached();
@@ -1246,7 +1276,8 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
     case TCG_TYPE_V64:
         tcg_debug_assert(arg >= 32);
         if (have_isa_2_06_vsx) {
-            tcg_out_mem_long(s, 0, STXSDX | 1, arg & 31, base, offset);
+            tcg_out_mem_long(s, have_isa_3_00_vsx ? STXSD : 0,
+                             STXSDX | 1, arg & 31, base, offset);
             break;
         }
         assert((offset & 7) == 0);
@@ -1259,7 +1290,8 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
         break;
     case TCG_TYPE_V128:
         tcg_debug_assert(arg >= 32);
-        tcg_out_mem_long(s, 0, STVX, arg & 31, base, offset);
+        tcg_out_mem_long(s, have_isa_3_00_vsx ? STXV | 8 : 0, STVX,
+                         arg & 31, base, offset);
         break;
     default:
         g_assert_not_reached();
@@ -2986,6 +3018,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
         return vece <= MO_32 || have_isa_2_07_vsx ? -1 : 0;
+    case INDEX_op_neg_vec:
+        return vece >= MO_32 && have_isa_3_00_vsx;
     case INDEX_op_mul_vec:
         switch (vece) {
         case MO_8:
@@ -3006,7 +3040,22 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg dst, TCGReg src)
 {
     tcg_debug_assert(dst >= 32);
-    tcg_debug_assert(src >= 32);
+
+    /* Splat from integer reg allowed via constraints for v3.00.  */
+    if (src < 32) {
+        tcg_debug_assert(have_isa_3_00_vsx);
+        switch (vece) {
+        case MO_64:
+            tcg_out32(s, MTVSRDD | 1 | VRT(dst) | RA(src) | RB(src));
+            return true;
+        case MO_32:
+            tcg_out32(s, MTVSRWS | 1 | VRT(dst) | RA(src));
+            return true;
+        default:
+            /* Fail, so that we fall back on either dupm or mov+dup.  */
+            return false;
+        }
+    }
 
     /*
      * Recall we use (or emulate) VSX integer loads, so the integer is
@@ -3045,7 +3094,11 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
     out &= 31;
     switch (vece) {
     case MO_8:
-        tcg_out_mem_long(s, 0, LVEBX, out, base, offset);
+        if (have_isa_3_00_vsx) {
+            tcg_out_mem_long(s, LXV | 8, LVX, out, base, offset & -16);
+        } else {
+            tcg_out_mem_long(s, 0, LVEBX, out, base, offset);
+        }
         elt = extract32(offset, 0, 4);
 #ifndef HOST_WORDS_BIGENDIAN
         elt ^= 15;
@@ -3054,7 +3107,11 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
         break;
     case MO_16:
         assert((offset & 1) == 0);
-        tcg_out_mem_long(s, 0, LVEHX, out, base, offset);
+        if (have_isa_3_00_vsx) {
+            tcg_out_mem_long(s, LXV | 8, LVX, out, base, offset & -16);
+        } else {
+            tcg_out_mem_long(s, 0, LVEHX, out, base, offset);
+        }
         elt = extract32(offset, 1, 3);
 #ifndef HOST_WORDS_BIGENDIAN
         elt ^= 7;
@@ -3062,6 +3119,10 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
         tcg_out32(s, VSPLTH | VRT(out) | VRB(out) | (elt << 16));
         break;
     case MO_32:
+        if (have_isa_3_00_vsx) {
+            tcg_out_mem_long(s, 0, LXVWSX | 1, out, base, offset);
+            break;
+        }
         assert((offset & 3) == 0);
         tcg_out_mem_long(s, 0, LVEWX, out, base, offset);
         elt = extract32(offset, 2, 2);
@@ -3101,7 +3162,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     static const uint32_t
         add_op[4] = { VADDUBM, VADDUHM, VADDUWM, VADDUDM },
         sub_op[4] = { VSUBUBM, VSUBUHM, VSUBUWM, VSUBUDM },
+        neg_op[4] = { 0, 0, VNEGW, VNEGD },
         eq_op[4]  = { VCMPEQUB, VCMPEQUH, VCMPEQUW, VCMPEQUD },
+        ne_op[4]  = { VCMPNEB, VCMPNEH, VCMPNEW, 0 },
         gts_op[4] = { VCMPGTSB, VCMPGTSH, VCMPGTSW, VCMPGTSD },
         gtu_op[4] = { VCMPGTUB, VCMPGTUH, VCMPGTUW, VCMPGTUD },
         ssadd_op[4] = { VADDSBS, VADDSHS, VADDSWS, 0 },
@@ -3143,6 +3206,11 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_sub_vec:
         insn = sub_op[vece];
         break;
+    case INDEX_op_neg_vec:
+        insn = neg_op[vece];
+        a2 = a1;
+        a1 = 0;
+        break;
     case INDEX_op_mul_vec:
         tcg_debug_assert(vece == MO_32 && have_isa_2_07_vsx);
         insn = VMULUWM;
@@ -3205,6 +3273,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         case TCG_COND_EQ:
             insn = eq_op[vece];
             break;
+        case TCG_COND_NE:
+            insn = ne_op[vece];
+            break;
         case TCG_COND_GT:
             insn = gts_op[vece];
             break;
@@ -3287,6 +3358,10 @@ static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
     case TCG_COND_GTU:
         break;
     case TCG_COND_NE:
+        if (have_isa_3_00_vsx && vece <= MO_32) {
+            break;
+        }
+        /* fall through */
     case TCG_COND_LE:
     case TCG_COND_LEU:
         need_inv = true;
@@ -3442,6 +3517,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     static const TCGTargetOpDef sub2
         = { .args_ct_str = { "r", "r", "rI", "rZM", "r", "r" } };
     static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
+    static const TCGTargetOpDef v_vr = { .args_ct_str = { "v", "vr" } };
     static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
     static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
     static const TCGTargetOpDef v_v_v_v
@@ -3610,8 +3686,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_dup2_vec:
         return &v_v_v;
     case INDEX_op_not_vec:
-    case INDEX_op_dup_vec:
+    case INDEX_op_neg_vec:
         return &v_v;
+    case INDEX_op_dup_vec:
+        return have_isa_3_00_vsx ? &v_vr : &v_v;
     case INDEX_op_ld_vec:
     case INDEX_op_st_vec:
     case INDEX_op_dupm_vec:
@@ -3647,6 +3725,9 @@ static void tcg_target_init(TCGContext *s)
 #ifdef PPC_FEATURE2_ARCH_3_00
     if (hwcap2 & PPC_FEATURE2_ARCH_3_00) {
         have_isa_3_00 = true;
+        if (hwcap & PPC_FEATURE_HAS_VSX) {
+            have_isa_3_00_vsx = true;
+        }
     }
 #endif
 
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply Richard Henderson
@ 2019-05-19  5:05   ` Aleksandar Markovic
  2019-05-19 14:45     ` Richard Henderson
  0 siblings, 1 reply; 34+ messages in thread
From: Aleksandar Markovic @ 2019-05-19  5:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: mark.cave-ayland, qemu-devel

On May 19, 2019 6:35 AM, "Richard Henderson" <richard.henderson@linaro.org>
wrote:
>
> For Altivec, this is always an expansion.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---

Large portions of this patch have nothing to do with what the title or
commit message says. Reorganize.

Thanks, Aleksandar

>  tcg/ppc/tcg-target.h     |   2 +-
>  tcg/ppc/tcg-target.opc.h |   8 +++
>  tcg/ppc/tcg-target.inc.c | 112 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 120 insertions(+), 2 deletions(-)
>
> diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
> index 766706fd30..a130192cbd 100644
> --- a/tcg/ppc/tcg-target.h
> +++ b/tcg/ppc/tcg-target.h
> @@ -154,7 +154,7 @@ extern bool have_isa_3_00;
>  #define TCG_TARGET_HAS_shs_vec          0
>  #define TCG_TARGET_HAS_shv_vec          1
>  #define TCG_TARGET_HAS_cmp_vec          1
> -#define TCG_TARGET_HAS_mul_vec          0
> +#define TCG_TARGET_HAS_mul_vec          1
>  #define TCG_TARGET_HAS_sat_vec          1
>  #define TCG_TARGET_HAS_minmax_vec       1
>  #define TCG_TARGET_HAS_bitsel_vec       0
> diff --git a/tcg/ppc/tcg-target.opc.h b/tcg/ppc/tcg-target.opc.h
> index 4816a6c3d4..5c6a5ad52c 100644
> --- a/tcg/ppc/tcg-target.opc.h
> +++ b/tcg/ppc/tcg-target.opc.h
> @@ -1,3 +1,11 @@
>  /* Target-specific opcodes for host vector expansion.  These will be
>     emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
>     consider these to be UNSPEC with names.  */
> +
> +DEF(ppc_mrgh_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_mrgl_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_msum_vec, 1, 3, 0, IMPLVEC)
> +DEF(ppc_muleu_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_mulou_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_pkum_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_rotl_vec, 1, 2, 0, IMPLVEC)
> diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
> index 62a8c428e0..9d58db9eb1 100644
> --- a/tcg/ppc/tcg-target.inc.c
> +++ b/tcg/ppc/tcg-target.inc.c
> @@ -526,6 +526,25 @@ static int tcg_target_const_match(tcg_target_long
val, TCGType type,
>  #define VSRAB      VX4(772)
>  #define VSRAH      VX4(836)
>  #define VSRAW      VX4(900)
> +#define VRLB       VX4(4)
> +#define VRLH       VX4(68)
> +#define VRLW       VX4(132)
> +
> +#define VMULEUB    VX4(520)
> +#define VMULEUH    VX4(584)
> +#define VMULOUB    VX4(8)
> +#define VMULOUH    VX4(72)
> +#define VMSUMUHM   VX4(38)
> +
> +#define VMRGHB     VX4(12)
> +#define VMRGHH     VX4(76)
> +#define VMRGHW     VX4(140)
> +#define VMRGLB     VX4(268)
> +#define VMRGLH     VX4(332)
> +#define VMRGLW     VX4(396)
> +
> +#define VPKUHUM    VX4(14)
> +#define VPKUWUM    VX4(78)
>
>  #define VAND       VX4(1028)
>  #define VANDC      VX4(1092)
> @@ -2892,6 +2911,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType
type, unsigned vece)
>      case INDEX_op_sarv_vec:
>          return vece <= MO_32;
>      case INDEX_op_cmp_vec:
> +    case INDEX_op_mul_vec:
>      case INDEX_op_shli_vec:
>      case INDEX_op_shri_vec:
>      case INDEX_op_sari_vec:
> @@ -3005,7 +3025,13 @@ static void tcg_out_vec_op(TCGContext *s,
TCGOpcode opc,
>          smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 },
>          shlv_op[4] = { VSLB, VSLH, VSLW, 0 },
>          shrv_op[4] = { VSRB, VSRH, VSRW, 0 },
> -        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 };
> +        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 },
> +        mrgh_op[4] = { VMRGHB, VMRGHH, VMRGHW, 0 },
> +        mrgl_op[4] = { VMRGLB, VMRGLH, VMRGLW, 0 },
> +        muleu_op[4] = { VMULEUB, VMULEUH, 0, 0 },
> +        mulou_op[4] = { VMULOUB, VMULOUH, 0, 0 },
> +        pkum_op[4] = { VPKUHUM, VPKUWUM, 0, 0 },
> +        rotl_op[4] = { VRLB, VRLH, VRLW, 0 };
>
>      TCGType type = vecl + TCG_TYPE_V64;
>      TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
> @@ -3094,6 +3120,29 @@ static void tcg_out_vec_op(TCGContext *s,
TCGOpcode opc,
>          }
>          break;
>
> +    case INDEX_op_ppc_mrgh_vec:
> +        insn = mrgh_op[vece];
> +        break;
> +    case INDEX_op_ppc_mrgl_vec:
> +        insn = mrgl_op[vece];
> +        break;
> +    case INDEX_op_ppc_muleu_vec:
> +        insn = muleu_op[vece];
> +        break;
> +    case INDEX_op_ppc_mulou_vec:
> +        insn = mulou_op[vece];
> +        break;
> +    case INDEX_op_ppc_pkum_vec:
> +        insn = pkum_op[vece];
> +        break;
> +    case INDEX_op_ppc_rotl_vec:
> +        insn = rotl_op[vece];
> +        break;
> +    case INDEX_op_ppc_msum_vec:
> +        tcg_debug_assert(vece == MO_16);
> +        tcg_out32(s, VMSUMUHM | VRT(a0) | VRA(a1) | VRB(a2) |
VRC(args[3]));
> +        return;
> +
>      case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
>      case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
> @@ -3163,6 +3212,53 @@ static void expand_vec_cmp(TCGType type, unsigned
vece, TCGv_vec v0,
>      }
>  }
>
> +static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
> +                           TCGv_vec v1, TCGv_vec v2)
> +{
> +    TCGv_vec t1 = tcg_temp_new_vec(type);
> +    TCGv_vec t2 = tcg_temp_new_vec(type);
> +    TCGv_vec t3, t4;
> +
> +    switch (vece) {
> +    case MO_8:
> +    case MO_16:
> +        vec_gen_3(INDEX_op_ppc_muleu_vec, type, vece, tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        vec_gen_3(INDEX_op_ppc_mulou_vec, type, vece, tcgv_vec_arg(t2),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        vec_gen_3(INDEX_op_ppc_mrgh_vec, type, vece + 1,
tcgv_vec_arg(v0),
> +                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
> +        vec_gen_3(INDEX_op_ppc_mrgl_vec, type, vece + 1,
tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
> +        vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0),
> +                  tcgv_vec_arg(v0), tcgv_vec_arg(t1));
> +       break;
> +
> +    case MO_32:
> +        t3 = tcg_temp_new_vec(type);
> +        t4 = tcg_temp_new_vec(type);
> +        tcg_gen_dupi_vec(MO_8, t4, -16);
> +        vec_gen_3(INDEX_op_ppc_rotl_vec, type, MO_32, tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(v2), tcgv_vec_arg(t4));
> +        vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        tcg_gen_dupi_vec(MO_8, t3, 0);
> +        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
> +        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
> +                  tcgv_vec_arg(t3), tcgv_vec_arg(t4));
> +        tcg_gen_add_vec(MO_32, v0, t2, t3);
> +        tcg_temp_free_vec(t3);
> +        tcg_temp_free_vec(t4);
> +        break;
> +
> +    default:
> +        g_assert_not_reached();
> +    }
> +    tcg_temp_free_vec(t1);
> +    tcg_temp_free_vec(t2);
> +}
> +
>  void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
>                         TCGArg a0, ...)
>  {
> @@ -3189,6 +3285,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType
type, unsigned vece,
>          v2 = temp_tcgv_vec(arg_temp(a2));
>          expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
>          break;
> +    case INDEX_op_mul_vec:
> +        v2 = temp_tcgv_vec(arg_temp(a2));
> +        expand_vec_mul(type, vece, v0, v1, v2);
> +        break;
>      default:
>          g_assert_not_reached();
>      }
> @@ -3235,6 +3335,8 @@ static const TCGTargetOpDef
*tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
>      static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
>      static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v"
} };
> +    static const TCGTargetOpDef v_v_v_v
> +        = { .args_ct_str = { "v", "v", "v", "v" } };
>
>      switch (op) {
>      case INDEX_op_goto_ptr:
> @@ -3390,6 +3492,12 @@ static const TCGTargetOpDef
*tcg_target_op_def(TCGOpcode op)
>      case INDEX_op_shlv_vec:
>      case INDEX_op_shrv_vec:
>      case INDEX_op_sarv_vec:
> +    case INDEX_op_ppc_mrgh_vec:
> +    case INDEX_op_ppc_mrgl_vec:
> +    case INDEX_op_ppc_muleu_vec:
> +    case INDEX_op_ppc_mulou_vec:
> +    case INDEX_op_ppc_pkum_vec:
> +    case INDEX_op_ppc_rotl_vec:
>          return &v_v_v;
>      case INDEX_op_not_vec:
>      case INDEX_op_dup_vec:
> @@ -3398,6 +3506,8 @@ static const TCGTargetOpDef
*tcg_target_op_def(TCGOpcode op)
>      case INDEX_op_st_vec:
>      case INDEX_op_dupm_vec:
>          return &v_r;
> +    case INDEX_op_ppc_msum_vec:
> +        return &v_v_v_v;
>
>      default:
>          return NULL;
> --
> 2.17.1
>
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply
  2019-05-19  5:05   ` Aleksandar Markovic
@ 2019-05-19 14:45     ` Richard Henderson
  0 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-05-19 14:45 UTC (permalink / raw)
  To: Aleksandar Markovic; +Cc: mark.cave-ayland, qemu-devel

On 5/18/19 10:05 PM, Aleksandar Markovic wrote:
> 
> On May 19, 2019 6:35 AM, "Richard Henderson" <richard.henderson@linaro.org
> <mailto:richard.henderson@linaro.org>> wrote:
>>
>> For Altivec, this is always an expansion.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org
> <mailto:richard.henderson@linaro.org>>
>> ---
> 
> Large portions of this patch have nothing to do with what the title or commit
> message says. Reorganize.


I beg to differ.  Absolutely nothing in this patch is unrelated to supporting
vector multiply for Altivec.


> 
> 
> For Altivec, this is always an expansion.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.h     |   2 +-
>  tcg/ppc/tcg-target.opc.h |   8 +++
>  tcg/ppc/tcg-target.inc.c | 112 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 120 insertions(+), 2 deletions(-)
> 
> diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
> index 766706fd30..a130192cbd 100644
> --- a/tcg/ppc/tcg-target.h
> +++ b/tcg/ppc/tcg-target.h
> @@ -154,7 +154,7 @@ extern bool have_isa_3_00;
>  #define TCG_TARGET_HAS_shs_vec          0
>  #define TCG_TARGET_HAS_shv_vec          1
>  #define TCG_TARGET_HAS_cmp_vec          1
> -#define TCG_TARGET_HAS_mul_vec          0
> +#define TCG_TARGET_HAS_mul_vec          1

Enable middle-end expansion of multiply.


>  #define TCG_TARGET_HAS_sat_vec          1
>  #define TCG_TARGET_HAS_minmax_vec       1
>  #define TCG_TARGET_HAS_bitsel_vec       0
> diff --git a/tcg/ppc/tcg-target.opc.h b/tcg/ppc/tcg-target.opc.h
> index 4816a6c3d4..5c6a5ad52c 100644
> --- a/tcg/ppc/tcg-target.opc.h
> +++ b/tcg/ppc/tcg-target.opc.h
> @@ -1,3 +1,11 @@
>  /* Target-specific opcodes for host vector expansion.  These will be
>     emitted by tcg_expand_vec_op.  For those familiar with GCC internals,
>     consider these to be UNSPEC with names.  */
> +
> +DEF(ppc_mrgh_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_mrgl_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_msum_vec, 1, 3, 0, IMPLVEC)
> +DEF(ppc_muleu_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_mulou_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_pkum_vec, 1, 2, 0, IMPLVEC)
> +DEF(ppc_rotl_vec, 1, 2, 0, IMPLVEC)

Define Altivec-specific tcg opcodes needed for expansion of multiply.
These directly correspond to...

> diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
> index 62a8c428e0..9d58db9eb1 100644
> --- a/tcg/ppc/tcg-target.inc.c
> +++ b/tcg/ppc/tcg-target.inc.c
> @@ -526,6 +526,25 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
>  #define VSRAB      VX4(772)
>  #define VSRAH      VX4(836)
>  #define VSRAW      VX4(900)
> +#define VRLB       VX4(4)
> +#define VRLH       VX4(68)
> +#define VRLW       VX4(132)
> +
> +#define VMULEUB    VX4(520)
> +#define VMULEUH    VX4(584)
> +#define VMULOUB    VX4(8)
> +#define VMULOUH    VX4(72)
> +#define VMSUMUHM   VX4(38)
> +
> +#define VMRGHB     VX4(12)
> +#define VMRGHH     VX4(76)
> +#define VMRGHW     VX4(140)
> +#define VMRGLB     VX4(268)
> +#define VMRGLH     VX4(332)
> +#define VMRGLW     VX4(396)
> +
> +#define VPKUHUM    VX4(14)
> +#define VPKUWUM    VX4(78)

The Altivec instruction opcodes for multiply, plus a few others needed for the
expansion.

>  
>  #define VAND       VX4(1028)
>  #define VANDC      VX4(1092)
> @@ -2892,6 +2911,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
>      case INDEX_op_sarv_vec:
>          return vece <= MO_32;
>      case INDEX_op_cmp_vec:
> +    case INDEX_op_mul_vec:

Enable expansion of multiply.

>      case INDEX_op_shli_vec:
>      case INDEX_op_shri_vec:
>      case INDEX_op_sari_vec:
> @@ -3005,7 +3025,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>          smax_op[4] = { VMAXSB, VMAXSH, VMAXSW, 0 },
>          shlv_op[4] = { VSLB, VSLH, VSLW, 0 },
>          shrv_op[4] = { VSRB, VSRH, VSRW, 0 },
> -        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 };
> +        sarv_op[4] = { VSRAB, VSRAH, VSRAW, 0 },
> +        mrgh_op[4] = { VMRGHB, VMRGHH, VMRGHW, 0 },
> +        mrgl_op[4] = { VMRGLB, VMRGLH, VMRGLW, 0 },
> +        muleu_op[4] = { VMULEUB, VMULEUH, 0, 0 },
> +        mulou_op[4] = { VMULOUB, VMULOUH, 0, 0 },
> +        pkum_op[4] = { VPKUHUM, VPKUWUM, 0, 0 },
> +        rotl_op[4] = { VRLB, VRLH, VRLW, 0 };

Map tcg opcodes to Altivec instructions used by multiply expansion.

>  
>      TCGType type = vecl + TCG_TYPE_V64;
>      TCGArg a0 = args[0], a1 = args[1], a2 = args[2];
> @@ -3094,6 +3120,29 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>          }
>          break;
>  
> +    case INDEX_op_ppc_mrgh_vec:
> +        insn = mrgh_op[vece];
> +        break;
> +    case INDEX_op_ppc_mrgl_vec:
> +        insn = mrgl_op[vece];
> +        break;
> +    case INDEX_op_ppc_muleu_vec:
> +        insn = muleu_op[vece];
> +        break;
> +    case INDEX_op_ppc_mulou_vec:
> +        insn = mulou_op[vece];
> +        break;
> +    case INDEX_op_ppc_pkum_vec:
> +        insn = pkum_op[vece];
> +        break;
> +    case INDEX_op_ppc_rotl_vec:
> +        insn = rotl_op[vece];
> +        break;
> +    case INDEX_op_ppc_msum_vec:
> +        tcg_debug_assert(vece == MO_16);
> +        tcg_out32(s, VMSUMUHM | VRT(a0) | VRA(a1) | VRB(a2) | VRC(args[3]));
> +        return;

Emit said multiply instructions.

> +
>      case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
>      case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi.  */
>      case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
> @@ -3163,6 +3212,53 @@ static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
>      }
>  }
>  
> +static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0,
> +                           TCGv_vec v1, TCGv_vec v2)
> +{
> +    TCGv_vec t1 = tcg_temp_new_vec(type);
> +    TCGv_vec t2 = tcg_temp_new_vec(type);
> +    TCGv_vec t3, t4;
> +
> +    switch (vece) {
> +    case MO_8:
> +    case MO_16:
> +        vec_gen_3(INDEX_op_ppc_muleu_vec, type, vece, tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        vec_gen_3(INDEX_op_ppc_mulou_vec, type, vece, tcgv_vec_arg(t2),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        vec_gen_3(INDEX_op_ppc_mrgh_vec, type, vece + 1, tcgv_vec_arg(v0),
> +                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
> +        vec_gen_3(INDEX_op_ppc_mrgl_vec, type, vece + 1, tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(t1), tcgv_vec_arg(t2));
> +        vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0),
> +                  tcgv_vec_arg(v0), tcgv_vec_arg(t1));
> +	break;
> +
> +    case MO_32:
> +        t3 = tcg_temp_new_vec(type);
> +        t4 = tcg_temp_new_vec(type);
> +        tcg_gen_dupi_vec(MO_8, t4, -16);
> +        vec_gen_3(INDEX_op_ppc_rotl_vec, type, MO_32, tcgv_vec_arg(t1),
> +                  tcgv_vec_arg(v2), tcgv_vec_arg(t4));
> +        vec_gen_3(INDEX_op_ppc_mulou_vec, type, MO_16, tcgv_vec_arg(t2),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(v2));
> +        tcg_gen_dupi_vec(MO_8, t3, 0);
> +        vec_gen_4(INDEX_op_ppc_msum_vec, type, MO_16, tcgv_vec_arg(t3),
> +                  tcgv_vec_arg(v1), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
> +        vec_gen_3(INDEX_op_shlv_vec, type, MO_32, tcgv_vec_arg(t3),
> +                  tcgv_vec_arg(t3), tcgv_vec_arg(t4));
> +        tcg_gen_add_vec(MO_32, v0, t2, t3);
> +        tcg_temp_free_vec(t3);
> +        tcg_temp_free_vec(t4);
> +        break;
> +
> +    default:
> +        g_assert_not_reached();
> +    }
> +    tcg_temp_free_vec(t1);
> +    tcg_temp_free_vec(t2);
> +}

Expand multiply operation, as advertised.

> +
>  void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
>                         TCGArg a0, ...)
>  {
> @@ -3189,6 +3285,10 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
>          v2 = temp_tcgv_vec(arg_temp(a2));
>          expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
>          break;
> +    case INDEX_op_mul_vec:
> +        v2 = temp_tcgv_vec(arg_temp(a2));
> +        expand_vec_mul(type, vece, v0, v1, v2);
> +        break;

Call the above expander.

>      default:
>          g_assert_not_reached();
>      }
> @@ -3235,6 +3335,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef v_r = { .args_ct_str = { "v", "r" } };
>      static const TCGTargetOpDef v_v = { .args_ct_str = { "v", "v" } };
>      static const TCGTargetOpDef v_v_v = { .args_ct_str = { "v", "v", "v" } };
> +    static const TCGTargetOpDef v_v_v_v
> +        = { .args_ct_str = { "v", "v", "v", "v" } };
>  
>      switch (op) {
>      case INDEX_op_goto_ptr:
> @@ -3390,6 +3492,12 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      case INDEX_op_shlv_vec:
>      case INDEX_op_shrv_vec:
>      case INDEX_op_sarv_vec:
> +    case INDEX_op_ppc_mrgh_vec:
> +    case INDEX_op_ppc_mrgl_vec:
> +    case INDEX_op_ppc_muleu_vec:
> +    case INDEX_op_ppc_mulou_vec:
> +    case INDEX_op_ppc_pkum_vec:
> +    case INDEX_op_ppc_rotl_vec:
>          return &v_v_v;

Define constraints for the ppc opcodes defined earlier.

>      case INDEX_op_not_vec:
>      case INDEX_op_dup_vec:
> @@ -3398,6 +3506,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      case INDEX_op_st_vec:
>      case INDEX_op_dupm_vec:
>          return &v_r;
> +    case INDEX_op_ppc_msum_vec:
> +        return &v_v_v_v;

Likewise.

>  
>      default:
>          return NULL;
> -- 
> 2.17.1
> 


What is left?  Nothing.

Thanks,


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
                   ` (6 preceding siblings ...)
  2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 7/7] tcg/ppc: Update vector support to v3.00 Richard Henderson
@ 2019-06-18  5:00 ` Richard Henderson
  2019-06-19  5:07   ` Mark Cave-Ayland
  2019-06-19  8:11   ` David Gibson
  7 siblings, 2 replies; 34+ messages in thread
From: Richard Henderson @ 2019-06-18  5:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: mark.cave-ayland, David Gibson

Ping.  Otherwise I'll include it in my next tcg pull.


r~

On 5/18/19 9:15 PM, Richard Henderson wrote:
> Based-on: <20190518190157.21255-1-richard.henderson@linaro.org>
> Aka "tcg: misc gvec improvements".
> 
> Version 3 was last posted in March,
> https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg05859.html
> 
> Changes since v3:
>   * Add support for bitsel, with the vsx xxsel insn.
>   * Rely on the new relocation overflow handling, so
>     we don't require 3 insns for a vector load.
> 
> Changes since v2:
>   * Several generic tcg patches to improve dup vs dupi vs dupm.
>     In particular, if a global temp (like guest r10) is not in
>     a host register, we should duplicate from memory instead of
>     loading to an integer register, spilling to stack, loading
>     to a vector register, and then duplicating.
>   * I have more confidence that 32-bit ppc host should work
>     this time around.  No testing on that front yet, but I've
>     unified some code sequences with 64-bit ppc host.
>   * Base altivec now supports V128 only.  Moved V64 support to
>     Power7 (v2.06), which has 64-bit load/store.
>   * Dropped support for 64-bit vector multiply using Power8.
>     The expansion was too large compared to using integer regs.
> 
> 
> r~
> 
> 
> Richard Henderson (7):
>   tcg/ppc: Initial backend support for Altivec
>   tcg/ppc: Support vector shift by immediate
>   tcg/ppc: Support vector multiply
>   tcg/ppc: Support vector dup2
>   tcg/ppc: Update vector support to v2.06
>   tcg/ppc: Update vector support to v2.07
>   tcg/ppc: Update vector support to v3.00
> 
>  tcg/ppc/tcg-target.h     |   39 +-
>  tcg/ppc/tcg-target.opc.h |   11 +
>  tcg/ppc/tcg-target.inc.c | 1077 +++++++++++++++++++++++++++++++++++---
>  3 files changed, 1063 insertions(+), 64 deletions(-)
>  create mode 100644 tcg/ppc/tcg-target.opc.h
> 



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-18  5:00 ` [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
@ 2019-06-19  5:07   ` Mark Cave-Ayland
  2019-06-20 11:51     ` Howard Spoelstra
  2019-06-22 14:20     ` Mark Cave-Ayland
  2019-06-19  8:11   ` David Gibson
  1 sibling, 2 replies; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-19  5:07 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra, David Gibson

On 18/06/2019 06:00, Richard Henderson wrote:

> Ping.  Otherwise I'll include it in my next tcg pull.
> 
> r~

I can give this another spin on my test images on a G4 over the next few days. I've
also added Howard on CC as he reported some issues with the previous iteration at
https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg06561.html.

> On 5/18/19 9:15 PM, Richard Henderson wrote:
>> Based-on: <20190518190157.21255-1-richard.henderson@linaro.org>
>> Aka "tcg: misc gvec improvements".
>>
>> Version 3 was last posted in March,
>> https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg05859.html
>>
>> Changes since v3:
>>   * Add support for bitsel, with the vsx xxsel insn.
>>   * Rely on the new relocation overflow handling, so
>>     we don't require 3 insns for a vector load.
>>
>> Changes since v2:
>>   * Several generic tcg patches to improve dup vs dupi vs dupm.
>>     In particular, if a global temp (like guest r10) is not in
>>     a host register, we should duplicate from memory instead of
>>     loading to an integer register, spilling to stack, loading
>>     to a vector register, and then duplicating.
>>   * I have more confidence that 32-bit ppc host should work
>>     this time around.  No testing on that front yet, but I've
>>     unified some code sequences with 64-bit ppc host.
>>   * Base altivec now supports V128 only.  Moved V64 support to
>>     Power7 (v2.06), which has 64-bit load/store.
>>   * Dropped support for 64-bit vector multiply using Power8.
>>     The expansion was too large compared to using integer regs.
>>
>>
>> r~
>>
>>
>> Richard Henderson (7):
>>   tcg/ppc: Initial backend support for Altivec
>>   tcg/ppc: Support vector shift by immediate
>>   tcg/ppc: Support vector multiply
>>   tcg/ppc: Support vector dup2
>>   tcg/ppc: Update vector support to v2.06
>>   tcg/ppc: Update vector support to v2.07
>>   tcg/ppc: Update vector support to v3.00
>>
>>  tcg/ppc/tcg-target.h     |   39 +-
>>  tcg/ppc/tcg-target.opc.h |   11 +
>>  tcg/ppc/tcg-target.inc.c | 1077 +++++++++++++++++++++++++++++++++++---
>>  3 files changed, 1063 insertions(+), 64 deletions(-)
>>  create mode 100644 tcg/ppc/tcg-target.opc.h


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-18  5:00 ` [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
  2019-06-19  5:07   ` Mark Cave-Ayland
@ 2019-06-19  8:11   ` David Gibson
  1 sibling, 0 replies; 34+ messages in thread
From: David Gibson @ 2019-06-19  8:11 UTC (permalink / raw)
  To: Richard Henderson; +Cc: mark.cave-ayland, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2384 bytes --]

On Mon, Jun 17, 2019 at 10:00:10PM -0700, Richard Henderson wrote:
> Ping.  Otherwise I'll include it in my next tcg pull.

Uh.. I'm not sure who this ping is directed at.  I'm afraid this
series has dropped off my radar.

> 
> 
> r~
> 
> On 5/18/19 9:15 PM, Richard Henderson wrote:
> > Based-on: <20190518190157.21255-1-richard.henderson@linaro.org>
> > Aka "tcg: misc gvec improvements".
> > 
> > Version 3 was last posted in March,
> > https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg05859.html
> > 
> > Changes since v3:
> >   * Add support for bitsel, with the vsx xxsel insn.
> >   * Rely on the new relocation overflow handling, so
> >     we don't require 3 insns for a vector load.
> > 
> > Changes since v2:
> >   * Several generic tcg patches to improve dup vs dupi vs dupm.
> >     In particular, if a global temp (like guest r10) is not in
> >     a host register, we should duplicate from memory instead of
> >     loading to an integer register, spilling to stack, loading
> >     to a vector register, and then duplicating.
> >   * I have more confidence that 32-bit ppc host should work
> >     this time around.  No testing on that front yet, but I've
> >     unified some code sequences with 64-bit ppc host.
> >   * Base altivec now supports V128 only.  Moved V64 support to
> >     Power7 (v2.06), which has 64-bit load/store.
> >   * Dropped support for 64-bit vector multiply using Power8.
> >     The expansion was too large compared to using integer regs.
> > 
> > 
> > r~
> > 
> > 
> > Richard Henderson (7):
> >   tcg/ppc: Initial backend support for Altivec
> >   tcg/ppc: Support vector shift by immediate
> >   tcg/ppc: Support vector multiply
> >   tcg/ppc: Support vector dup2
> >   tcg/ppc: Update vector support to v2.06
> >   tcg/ppc: Update vector support to v2.07
> >   tcg/ppc: Update vector support to v3.00
> > 
> >  tcg/ppc/tcg-target.h     |   39 +-
> >  tcg/ppc/tcg-target.opc.h |   11 +
> >  tcg/ppc/tcg-target.inc.c | 1077 +++++++++++++++++++++++++++++++++++---
> >  3 files changed, 1063 insertions(+), 64 deletions(-)
> >  create mode 100644 tcg/ppc/tcg-target.opc.h
> > 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-19  5:07   ` Mark Cave-Ayland
@ 2019-06-20 11:51     ` Howard Spoelstra
  2019-06-22 14:20     ` Mark Cave-Ayland
  1 sibling, 0 replies; 34+ messages in thread
From: Howard Spoelstra @ 2019-06-20 11:51 UTC (permalink / raw)
  To: Mark Cave-Ayland; +Cc: Richard Henderson, qemu-devel qemu-devel, David Gibson

Hi,

As reported before, qemu-system-ppc still crashes with a segmentation fault
on Lubuntu 16 on a G5. Built from current tcg-ppc-vsx branch.

Linux hsp-G5-Lubuntu 4.4.0-143-powerpc64-smp #169-Ubuntu SMP Thu Feb 7
08:25:49 UTC 2019 ppc64 ppc64 ppc64 GNU/Linux

hsp@hsp-G5-Lubuntu:~$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/powerpc-linux-gnu/5/lto-wrapper
Target: powerpc-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu
5.4.0-6ubuntu1~16.04.11'
--with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs
--enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-5 --enable-shared --enable-linker-build-id
--libexecdir=/usr/lib --without-included-gettext --enable-threads=posix
--libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu
--enable-libstdcxx-debug --enable-libstdcxx-time=yes
--with-default-libstdcxx-abi=new --enable-gnu-unique-object
--disable-libitm --disable-libquadmath --enable-plugin --with-system-zlib
--disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo
--with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-powerpc/jre
--enable-java-home
--with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-powerpc
--with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-powerpc
--with-arch-directory=ppc --with-ecj-jar=/usr/share/java/eclipse-ecj.jar
--enable-objc-gc --enable-secureplt --disable-softfloat
--with-cpu=default32 --disable-softfloat
--enable-targets=powerpc-linux,powerpc64-linux --enable-multiarch
--disable-werror --with-long-double-128 --enable-multilib
--enable-checking=release --build=powerpc-linux-gnu
--host=powerpc-linux-gnu --target=powerpc-linux-gnu
Thread model: posix
gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11)
<https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg06561.html>

Best,
Howard.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-19  5:07   ` Mark Cave-Ayland
  2019-06-20 11:51     ` Howard Spoelstra
@ 2019-06-22 14:20     ` Mark Cave-Ayland
  2019-06-22 15:01       ` Mark Cave-Ayland
  1 sibling, 1 reply; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-22 14:20 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: David Gibson, Howard Spoelstra

On 19/06/2019 06:07, Mark Cave-Ayland wrote:

> On 18/06/2019 06:00, Richard Henderson wrote:
> 
>> Ping.  Otherwise I'll include it in my next tcg pull.
>>
>> r~
> 
> I can give this another spin on my test images on a G4 over the next few days. I've
> also added Howard on CC as he reported some issues with the previous iteration at
> https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg06561.html.

I've just given your tcg-ppc-vsx branch a spin here, and like Howard I'm getting
segfaults trying to launch my MacOS images :(  The segfault is weird: it doesn't get
caught by an attached gdb and the qemu-system-ppc process seems to hang around like a
zombie which makes me think that it's probably an illegal instruction of some kind,
but the PPC kernel can't handle it as well as x86 does.

With a bit more work I bisected it down to the first commit in the patchset
(d8dcbb57e9: "tcg/ppc: Initial backend support for Altivec") and then as an
experiment I hacked tcg_can_emit_vec_op() to always return 0 to see if that made a
difference, but the segfault still appears.

The commit message mentions that the load and store helpers are also improved, so I
wonder if they are what is causing the error rather than the vector parts? Also in
the kernel log I see the following messages appearing:

[3639669.374942] qemu-system-ppc[28591]: segfault (11) at 64b8 nip f87280 lr f8723c
code 1 in qemu-system-ppc[94e000+aa0000]
[3639669.380015] qemu-system-ppc[28591]: code: 93c10038 91810020 90010044 7fc802a6
3fde0059 2e030000 3bde6c18 7c9d2378
[3639669.385056] qemu-system-ppc[28591]: code: 813e80a0 7cbb2b78 7cda3378 7cf93b78
<81428ff8> 9141001c 39400000 81290000

Does that help at all? If not let me know if there are any other tests that you'd
like me to try to help debug this.


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-22 14:20     ` Mark Cave-Ayland
@ 2019-06-22 15:01       ` Mark Cave-Ayland
  2019-06-23 17:10         ` Aleksandar Markovic
  0 siblings, 1 reply; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-22 15:01 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra, David Gibson

On 22/06/2019 15:20, Mark Cave-Ayland wrote:

> I've just given your tcg-ppc-vsx branch a spin here, and like Howard I'm getting
> segfaults trying to launch my MacOS images :(  The segfault is weird: it doesn't get
> caught by an attached gdb and the qemu-system-ppc process seems to hang around like a
> zombie which makes me think that it's probably an illegal instruction of some kind,
> but the PPC kernel can't handle it as well as x86 does.
> 
> With a bit more work I bisected it down to the first commit in the patchset
> (d8dcbb57e9: "tcg/ppc: Initial backend support for Altivec") and then as an
> experiment I hacked tcg_can_emit_vec_op() to always return 0 to see if that made a
> difference, but the segfault still appears.
> 
> The commit message mentions that the load and store helpers are also improved, so I
> wonder if they are what is causing the error rather than the vector parts? Also in
> the kernel log I see the following messages appearing:
> 
> [3639669.374942] qemu-system-ppc[28591]: segfault (11) at 64b8 nip f87280 lr f8723c
> code 1 in qemu-system-ppc[94e000+aa0000]
> [3639669.380015] qemu-system-ppc[28591]: code: 93c10038 91810020 90010044 7fc802a6
> 3fde0059 2e030000 3bde6c18 7c9d2378
> [3639669.385056] qemu-system-ppc[28591]: code: 813e80a0 7cbb2b78 7cda3378 7cf93b78
> <81428ff8> 9141001c 39400000 81290000
> 
> Does that help at all? If not let me know if there are any other tests that you'd
> like me to try to help debug this.

One more hint: if I try a build of d8dcbb57e9 along with my tcg_can_emit_vec_op()
hack and pass --enable-debug-tcg to configure then I get an assert on startup:

qemu-system-ppc: /home/mca/src/qemu/tcg/tcg.c:2207: process_op_defs: Assertion `tdefs
!= ((void *)0)' failed.
Aborted


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-22 15:01       ` Mark Cave-Ayland
@ 2019-06-23 17:10         ` Aleksandar Markovic
  2019-06-25  6:56           ` Richard Henderson
  0 siblings, 1 reply; 34+ messages in thread
From: Aleksandar Markovic @ 2019-06-23 17:10 UTC (permalink / raw)
  To: Mark Cave-Ayland
  Cc: David Gibson, Richard Henderson, QEMU Developers, Howard Spoelstra

On Sat, Jun 22, 2019 at 5:02 PM Mark Cave-Ayland
<mark.cave-ayland@ilande.co.uk> wrote:
>
> On 22/06/2019 15:20, Mark Cave-Ayland wrote:
>
> > I've just given your tcg-ppc-vsx branch a spin here, and like Howard I'm getting
> > segfaults trying to launch my MacOS images :(  The segfault is weird: it doesn't get
> > caught by an attached gdb and the qemu-system-ppc process seems to hang around like a
> > zombie which makes me think that it's probably an illegal instruction of some kind,
> > but the PPC kernel can't handle it as well as x86 does.
> >
> > With a bit more work I bisected it down to the first commit in the patchset
> > (d8dcbb57e9: "tcg/ppc: Initial backend support for Altivec") and then as an
> > experiment I hacked tcg_can_emit_vec_op() to always return 0 to see if that made a
> > difference, but the segfault still appears.
> >
> > The commit message mentions that the load and store helpers are also improved, so I
> > wonder if they are what is causing the error rather than the vector parts? Also in
> > the kernel log I see the following messages appearing:
> >
> > [3639669.374942] qemu-system-ppc[28591]: segfault (11) at 64b8 nip f87280 lr f8723c
> > code 1 in qemu-system-ppc[94e000+aa0000]
> > [3639669.380015] qemu-system-ppc[28591]: code: 93c10038 91810020 90010044 7fc802a6
> > 3fde0059 2e030000 3bde6c18 7c9d2378
> > [3639669.385056] qemu-system-ppc[28591]: code: 813e80a0 7cbb2b78 7cda3378 7cf93b78
> > <81428ff8> 9141001c 39400000 81290000
> >
> > Does that help at all? If not let me know if there are any other tests that you'd
> > like me to try to help debug this.
>
> One more hint: if I try a build of d8dcbb57e9 along with my tcg_can_emit_vec_op()
> hack and pass --enable-debug-tcg to configure then I get an assert on startup:
>
> qemu-system-ppc: /home/mca/src/qemu/tcg/tcg.c:2207: process_op_defs: Assertion `tdefs
> != ((void *)0)' failed.
> Aborted
>

Mark, Richard, Howard, David,

I just sent v5 of the series, that is (in the sense of net-result of
code changes) equivalent to v4, but the patch 1/7 from v4 is now split
into ten smaller patches. This was done mainly to enable Mark to
perhaps try v5 and bisect, in order to at least somewhat narrow down
the culprit. Most likely it will be patch 5 from v5, that is still
sizeable, but even if this is the case, we can eliminate other smaller
things from consideration.

Sincerely,
Aleksandar

>
> ATB,
>
> Mark.
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-23 17:10         ` Aleksandar Markovic
@ 2019-06-25  6:56           ` Richard Henderson
  2019-06-25 15:37             ` Mark Cave-Ayland
  0 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-06-25  6:56 UTC (permalink / raw)
  To: Aleksandar Markovic, Mark Cave-Ayland
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

On 6/23/19 7:10 PM, Aleksandar Markovic wrote:
> On Sat, Jun 22, 2019 at 5:02 PM Mark Cave-Ayland
> <mark.cave-ayland@ilande.co.uk> wrote:
>>
>> On 22/06/2019 15:20, Mark Cave-Ayland wrote:
>>
>>> I've just given your tcg-ppc-vsx branch a spin here, and like Howard I'm getting
>>> segfaults trying to launch my MacOS images :(  The segfault is weird: it doesn't get
>>> caught by an attached gdb and the qemu-system-ppc process seems to hang around like a
>>> zombie which makes me think that it's probably an illegal instruction of some kind,
>>> but the PPC kernel can't handle it as well as x86 does.
>>>
>>> With a bit more work I bisected it down to the first commit in the patchset
>>> (d8dcbb57e9: "tcg/ppc: Initial backend support for Altivec") and then as an
>>> experiment I hacked tcg_can_emit_vec_op() to always return 0 to see if that made a
>>> difference, but the segfault still appears.
>>>
>>> The commit message mentions that the load and store helpers are also improved, so I
>>> wonder if they are what is causing the error rather than the vector parts? Also in
>>> the kernel log I see the following messages appearing:
>>>
>>> [3639669.374942] qemu-system-ppc[28591]: segfault (11) at 64b8 nip f87280 lr f8723c
>>> code 1 in qemu-system-ppc[94e000+aa0000]
>>> [3639669.380015] qemu-system-ppc[28591]: code: 93c10038 91810020 90010044 7fc802a6
>>> 3fde0059 2e030000 3bde6c18 7c9d2378
>>> [3639669.385056] qemu-system-ppc[28591]: code: 813e80a0 7cbb2b78 7cda3378 7cf93b78
>>> <81428ff8> 9141001c 39400000 81290000
>>>
>>> Does that help at all? If not let me know if there are any other tests that you'd
>>> like me to try to help debug this.
>>
>> One more hint: if I try a build of d8dcbb57e9 along with my tcg_can_emit_vec_op()
>> hack and pass --enable-debug-tcg to configure then I get an assert on startup:
>>
>> qemu-system-ppc: /home/mca/src/qemu/tcg/tcg.c:2207: process_op_defs: Assertion `tdefs
>> != ((void *)0)' failed.
>> Aborted
>>
> 
> Mark, Richard, Howard, David,
> 
> I just sent v5 of the series, that is (in the sense of net-result of
> code changes) equivalent to v4, but the patch 1/7 from v4 is now split
> into ten smaller patches. This was done mainly to enable Mark to
> perhaps try v5 and bisect, in order to at least somewhat narrow down
> the culprit. Most likely it will be patch 5 from v5, that is still
> sizeable, but even if this is the case, we can eliminate other smaller
> things from consideration.

Thanks for the help on that.

I don't believe your split is actually bisectable -- there's a minimum amount
that is required to enable vector opcodes at all.  Patch 5 is the first that
enables tcg_out_{mov,ld,st}, so while patches beforehand may compile, they
certainly will not run.

I can retain your split, but for real bisectability we need to move the enable
of TCG_TARGET_HAS_v128 from patch 2 to patch 5.

Given that all this works for me on a Power9 host, I expect that there's a
simple fix for Mark's G5 host.  Given the above assertion, a missing opcode
definition, perhaps for -m32 vs -m64?


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25  6:56           ` Richard Henderson
@ 2019-06-25 15:37             ` Mark Cave-Ayland
  2019-06-25 15:56               ` Richard Henderson
  2019-06-25 18:01               ` Aleksandar Markovic
  0 siblings, 2 replies; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-25 15:37 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 25/06/2019 07:56, Richard Henderson wrote:

>>> One more hint: if I try a build of d8dcbb57e9 along with my tcg_can_emit_vec_op()
>>> hack and pass --enable-debug-tcg to configure then I get an assert on startup:
>>>
>>> qemu-system-ppc: /home/mca/src/qemu/tcg/tcg.c:2207: process_op_defs: Assertion `tdefs
>>> != ((void *)0)' failed.
>>> Aborted
>>>
>>
>> Mark, Richard, Howard, David,
>>
>> I just sent v5 of the series, that is (in the sense of net-result of
>> code changes) equivalent to v4, but the patch 1/7 from v4 is now split
>> into ten smaller patches. This was done mainly to enable Mark to
>> perhaps try v5 and bisect, in order to at least somewhat narrow down
>> the culprit. Most likely it will be patch 5 from v5, that is still
>> sizeable, but even if this is the case, we can eliminate other smaller
>> things from consideration.
> 
> Thanks for the help on that.
> 
> I don't believe your split is actually bisectable -- there's a minimum amount
> that is required to enable vector opcodes at all.  Patch 5 is the first that
> enables tcg_out_{mov,ld,st}, so while patches beforehand may compile, they
> certainly will not run.
> 
> I can retain your split, but for real bisectability we need to move the enable
> of TCG_TARGET_HAS_v128 from patch 2 to patch 5.
> 
> Given that all this works for me on a Power9 host, I expect that there's a
> simple fix for Mark's G5 host.  Given the above assertion, a missing opcode
> definition, perhaps for -m32 vs -m64?

Right, I'm starting to dig into this a bit more now. First of all, I've figured out
what is triggering the above assertion:

"qemu-system-ppc: /home/mca/src/qemu/tcg/tcg.c:2207: process_op_defs: Assertion
`tdefs != ((void *)0)' failed."

The problem is that in tcg/tcg-op.h we define "DEF(dup2_vec, 1, 2, 0, IMPLVEC |
IMPL(TCG_TARGET_REG_BITS == 32))" and in the last patchset dup2_vec isn't introduced
until towards the end. Unfortunately it's not as simple as bringing the patch forward
within the series to maintain bisectability because the current implementation
depends on VMRG which only appears in the patch just before it...

Next to try and figure out what exactly is causing the fault. Just a quick question
out of curiosity: is your Power9 system BE or LE?


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25 15:37             ` Mark Cave-Ayland
@ 2019-06-25 15:56               ` Richard Henderson
  2019-06-25 17:55                 ` Mark Cave-Ayland
  2019-06-26  8:33                 ` David Gibson
  2019-06-25 18:01               ` Aleksandar Markovic
  1 sibling, 2 replies; 34+ messages in thread
From: Richard Henderson @ 2019-06-25 15:56 UTC (permalink / raw)
  To: Mark Cave-Ayland, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 6/25/19 5:37 PM, Mark Cave-Ayland wrote:
> The problem is that in tcg/tcg-op.h we define "DEF(dup2_vec, 1, 2, 0, IMPLVEC |
> IMPL(TCG_TARGET_REG_BITS == 32))" and in the last patchset dup2_vec isn't introduced
> until towards the end. Unfortunately it's not a simple as bringing the patch forward
> within the series to maintain bisectability because the current implementation
> depends on VMRG which only appears in the patch just before it...

Ah, that would explain it.  I admit I haven't looked at v5 that closely.

> Next to try and figure out what exactly is causing the fault. Just a quick question
> out of curiosity: is your Power9 system BE or LE?

The Power9 is LE.

I do have access to a Power7 BE system, and that worked last time I checked.
I'll try that again tomorrow to be sure.


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25 15:56               ` Richard Henderson
@ 2019-06-25 17:55                 ` Mark Cave-Ayland
  2019-06-26  7:45                   ` Richard Henderson
  2019-06-26  8:33                 ` David Gibson
  1 sibling, 1 reply; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-25 17:55 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

On 25/06/2019 16:56, Richard Henderson wrote:

> On 6/25/19 5:37 PM, Mark Cave-Ayland wrote:
>> The problem is that in tcg/tcg-op.h we define "DEF(dup2_vec, 1, 2, 0, IMPLVEC |
>> IMPL(TCG_TARGET_REG_BITS == 32))" and in the last patchset dup2_vec isn't introduced
>> until towards the end. Unfortunately it's not a simple as bringing the patch forward
>> within the series to maintain bisectability because the current implementation
>> depends on VMRG which only appears in the patch just before it...
> 
> Ah, that would explain it.  I admit I haven't looked at v5 that closely.

It's actually the same in both patchsets: I'm still playing with v4 at the moment
since I have a few hacks in place to help me figure out what's going on.

>> Next to try and figure out what exactly is causing the fault. Just a quick question
>> out of curiosity: is your Power9 system BE or LE?
> 
> The Power9 is LE.
> 
> I do have access to a Power7 BE system, and that worked last time I checked.
> I'll try that again tomorrow to be sure.

And here's where we are blowing up according to -d in_asm,op_out_asm:

IN:
0x00f22ca0:  101ffc84  vor      v0, v31, v31

OP:
 ld_i32 tmp0,env,$0xfffffff8
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,lt,$L0

 ---- 00f22ca0
 ld_vec v128,e8,tmp2,env,$0xd6b0
 st_vec v128,e8,tmp2,env,$0xd4c0
 movi_i32 nip,$0xf22ca4
 movi_i32 nip,$0xf22ca4
 movi_i32 tmp0,$0x10002
 call raise_exception,$0x2,$0,env,tmp0
 exit_tb $0x0
 set_label $L0
 exit_tb $0xa4e7f0c3

OUT: [size=96]
0xa4e7f120:  81dbfff8  lwz      r14, -8(r27)
0xa4e7f124:  2f8e0000  cmpwi    cr7, r14, 0
0xa4e7f128:  419c004c  blt      cr7, 0xa4e7f174
0xa4e7f12c:  3c400000  lis      r2, 0
0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2
0xa4e7f138:  3c400000  lis      r2, 0
0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2
0xa4e7f144:  3dc000f2  lis      r14, 0xf2
0xa4e7f148:  61ce2ca4  ori      r14, r14, 0x2ca4
0xa4e7f14c:  91db016c  stw      r14, 0x16c(r27)
0xa4e7f150:  7f63db78  mr       r3, r27
0xa4e7f154:  3c800001  lis      r4, 1
0xa4e7f158:  60840002  ori      r4, r4, 2
0xa4e7f15c:  3c000087  lis      r0, 0x87
0xa4e7f160:  6000b618  ori      r0, r0, 0xb618
0xa4e7f164:  7c0903a6  mtctr    r0
0xa4e7f168:  4e800421  bctrl
0xa4e7f16c:  38600000  li       r3, 0
0xa4e7f170:  4bfffef0  b        0xa4e7f060
0xa4e7f174:  3c60a4e7  lis      r3, -0x5b19
0xa4e7f178:  6063f0c3  ori      r3, r3, 0xf0c3
0xa4e7f17c:  4bfffee4  b        0xa4e7f060

Any ideas what might be going on here?


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25 15:37             ` Mark Cave-Ayland
  2019-06-25 15:56               ` Richard Henderson
@ 2019-06-25 18:01               ` Aleksandar Markovic
  1 sibling, 0 replies; 34+ messages in thread
From: Aleksandar Markovic @ 2019-06-25 18:01 UTC (permalink / raw)
  To: Mark Cave-Ayland
  Cc: David Gibson, Richard Henderson, QEMU Developers, Howard Spoelstra

On Jun 25, 2019 5:42 PM, "Mark Cave-Ayland" <mark.cave-ayland@ilande.co.uk>
wrote:

>
> The problem is that in tcg/tcg-op.h we define "DEF(dup2_vec, 1, 2, 0,
IMPLVEC |
> IMPL(TCG_TARGET_REG_BITS == 32))" and in the last patchset dup2_vec isn't
introduced
> until towards the end. Unfortunately it's not a simple as bringing the
patch forward
> within the series to maintain bisectability because the current
implementation
> depends on VMRG which only appears in the patch just before it...
>

My strong impression is that the VMRG, VSPLT, VSLDOI, ... opcodes and basic
logic could have been defined very early in the series. (They all just
support other TCG vector operations. Their functionality just helps achieve
other, exposed, backend functionalities.) That would reduce patch
dependencies and allow “patch mobility” within the rest of the series.

However, I am not positive at all that would solve the problem at hand.

Aleksandar

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25 17:55                 ` Mark Cave-Ayland
@ 2019-06-26  7:45                   ` Richard Henderson
  2019-06-26 17:00                     ` Mark Cave-Ayland
  0 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-06-26  7:45 UTC (permalink / raw)
  To: Mark Cave-Ayland, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 6/25/19 7:55 PM, Mark Cave-Ayland wrote:
> And here's where we are blowing up according to -d in_asm,op_out_asm:
> 
> IN:
> 0x00f22ca0:  101ffc84  vor      v0, v31, v31
> 
> OP:
>  ld_i32 tmp0,env,$0xfffffff8
>  movi_i32 tmp1,$0x0
>  brcond_i32 tmp0,tmp1,lt,$L0
> 
>  ---- 00f22ca0
>  ld_vec v128,e8,tmp2,env,$0xd6b0
>  st_vec v128,e8,tmp2,env,$0xd4c0

Yep, that looks right.

As an aside, this does suggest to me that target/ppc might be well served in
moving the ppc_vsr_t members of CPUPPCState earlier, so that this offset is
smaller.

>  movi_i32 nip,$0xf22ca4
>  movi_i32 nip,$0xf22ca4
>  movi_i32 tmp0,$0x10002
>  call raise_exception,$0x2,$0,env,tmp0

And this, presumably is the single-step debug exception.

> 0xa4e7f12c:  3c400000  lis      r2, 0
> 0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
> 0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2

> 0xa4e7f138:  3c400000  lis      r2, 0
> 0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
> 0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2

These also look correct.  Form an offset into r2, load or store from env+r2.

This also shows what I mention above re offset.  For a ppc host, the offset is
large enough to require two instructions to form them.

> Any ideas what might be going on here?

What is the observed problem that makes you think that this is the incorrect
instruction?


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-25 15:56               ` Richard Henderson
  2019-06-25 17:55                 ` Mark Cave-Ayland
@ 2019-06-26  8:33                 ` David Gibson
  2019-06-26 15:25                   ` Richard Henderson
  1 sibling, 1 reply; 34+ messages in thread
From: David Gibson @ 2019-06-26  8:33 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Mark Cave-Ayland, QEMU Developers, Aleksandar Markovic, Howard Spoelstra

[-- Attachment #1: Type: text/plain, Size: 1197 bytes --]

g


On Tue, Jun 25, 2019 at 05:56:42PM +0200, Richard Henderson wrote:
> On 6/25/19 5:37 PM, Mark Cave-Ayland wrote:
> > The problem is that in tcg/tcg-op.h we define "DEF(dup2_vec, 1, 2, 0, IMPLVEC |
> > IMPL(TCG_TARGET_REG_BITS == 32))" and in the last patchset dup2_vec isn't introduced
> > until towards the end. Unfortunately it's not a simple as bringing the patch forward
> > within the series to maintain bisectability because the current implementation
> > depends on VMRG which only appears in the patch just before it...
> 
> Ah, that would explain it.  I admit I haven't looked at v5 that closely.
> 
> > Next to try and figure out what exactly is causing the fault. Just a quick question
> > out of curiosity: is your Power9 system BE or LE?
> 
> The Power9 is LE.

It's the kernel that determines endianness, not the system.

> 
> I do have access to a Power7 BE system, and that worked last time I checked.
> I'll try that again tomorrow to be sure.
> 
> 
> r~
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26  8:33                 ` David Gibson
@ 2019-06-26 15:25                   ` Richard Henderson
  0 siblings, 0 replies; 34+ messages in thread
From: Richard Henderson @ 2019-06-26 15:25 UTC (permalink / raw)
  To: David Gibson
  Cc: Mark Cave-Ayland, QEMU Developers, Aleksandar Markovic, Howard Spoelstra

On 6/26/19 10:33 AM, David Gibson wrote:
>>> out of curiosity: is your Power9 system BE or LE?
>>
>> The Power9 is LE.
> 
> It's the kernel determines endianness, not the system.

Yes.  Lazy verbiage on my part -- I did mean "The Power9 that I have access to
is configured as LE".


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26  7:45                   ` Richard Henderson
@ 2019-06-26 17:00                     ` Mark Cave-Ayland
  2019-06-26 18:18                       ` BALATON Zoltan
  2019-06-26 18:42                       ` Richard Henderson
  0 siblings, 2 replies; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-26 17:00 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

On 26/06/2019 08:45, Richard Henderson wrote:

> On 6/25/19 7:55 PM, Mark Cave-Ayland wrote:
>> And here's where we are blowing up according to -d in_asm,op_out_asm:
>>
>> IN:
>> 0x00f22ca0:  101ffc84  vor      v0, v31, v31
>>
>> OP:
>>  ld_i32 tmp0,env,$0xfffffff8
>>  movi_i32 tmp1,$0x0
>>  brcond_i32 tmp0,tmp1,lt,$L0
>>
>>  ---- 00f22ca0
>>  ld_vec v128,e8,tmp2,env,$0xd6b0
>>  st_vec v128,e8,tmp2,env,$0xd4c0
> 
> Yep, that looks right.
> 
> As an aside, this does suggest to me that target/ppc might be well served in
> moving the ppc_vsr_t members of CPUPPCState earlier, so that this offset is
> smaller.
> 
>>  movi_i32 nip,$0xf22ca4
>>  movi_i32 nip,$0xf22ca4
>>  movi_i32 tmp0,$0x10002
>>  call raise_exception,$0x2,$0,env,tmp0
> 
> And this, presumably is the single-step debug exception.
> 
>> 0xa4e7f12c:  3c400000  lis      r2, 0
>> 0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
>> 0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2
> 
>> 0xa4e7f138:  3c400000  lis      r2, 0
>> 0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
>> 0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2
> 
> These also look correct.  Form an offset into r2, load or store from env+r2.
> 
> This also shows what I mention above re offset.  For a ppc host, the offset is
> large enough to require two instructions to form them.
> 
>> Any ideas what might be going on here?
> 
> What is the observed problem that makes you think that this is the incorrect
> instruction?

What I've been doing is set a breakpoint a few instructions before and then issuing
"stepi" commands via the gdbstub. As I step over the "vor v0, v31, v31" instruction
then either the qemu-system-ppc process segfaults outside of gdb, or inside gdb it
goes to bg. Bringing it back to fg just causes gdb to get confused and in the end the
only thing I can do is kill the gdb process.

On the plus side I've managed to work out where we are faulting by hacking the load
and store functions to inject trap opcodes in the ld_vec and st_vec and it appears
that we are segfaulting here:

OUT: [size=96]
0xa4e7f120:  81dbfff8  lwz      r14, -8(r27)
0xa4e7f124:  2f8e0000  cmpwi    cr7, r14, 0
0xa4e7f128:  419c004c  blt      cr7, 0xa4e7f174
0xa4e7f12c:  3c400000  lis      r2, 0
                       ^^^^^^^^^^^^^^
0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2
0xa4e7f138:  3c400000  lis      r2, 0
0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2
0xa4e7f144:  3dc000f2  lis      r14, 0xf2
0xa4e7f148:  61ce2ca4  ori      r14, r14, 0x2ca4
0xa4e7f14c:  91db016c  stw      r14, 0x16c(r27)
0xa4e7f150:  7f63db78  mr       r3, r27
0xa4e7f154:  3c800001  lis      r4, 1
0xa4e7f158:  60840002  ori      r4, r4, 2
0xa4e7f15c:  3c000087  lis      r0, 0x87
0xa4e7f160:  6000b618  ori      r0, r0, 0xb618
0xa4e7f164:  7c0903a6  mtctr    r0
0xa4e7f168:  4e800421  bctrl
0xa4e7f16c:  38600000  li       r3, 0
0xa4e7f170:  4bfffef0  b        0xa4e7f060
0xa4e7f174:  3c60a4e7  lis      r3, -0x5b19
0xa4e7f178:  6063f0c3  ori      r3, r3, 0xf0c3
0xa4e7f17c:  4bfffee4  b        0xa4e7f060

Interestingly if I set a trap and then switch the opcode to "lis r4,0" (0x3c800000)
then we carry on as normal until the next "lis r2,0" instruction. Looking through the
whole output of -d out_asm this is the first mention of r2 which makes me wonder if
it is special somehow? At least a quick search indicates that for 32-bit PPC r2 is
supposed to be dedicated as a TOC pointer.

Is there a quick way to disable r2 from the list of available registers to see if
that gets things going?


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26 17:00                     ` Mark Cave-Ayland
@ 2019-06-26 18:18                       ` BALATON Zoltan
  2019-06-26 18:42                       ` Richard Henderson
  1 sibling, 0 replies; 34+ messages in thread
From: BALATON Zoltan @ 2019-06-26 18:18 UTC (permalink / raw)
  To: Mark Cave-Ayland
  Cc: Howard Spoelstra, Richard Henderson, QEMU Developers,
	Aleksandar Markovic, David Gibson

On Wed, 26 Jun 2019, Mark Cave-Ayland wrote:
> On 26/06/2019 08:45, Richard Henderson wrote:
>> On 6/25/19 7:55 PM, Mark Cave-Ayland wrote:
>>> And here's where we are blowing up according to -d in_asm,op_out_asm:
>>>
>>> IN:
>>> 0x00f22ca0:  101ffc84  vor      v0, v31, v31
>>>
>>> OP:
>>>  ld_i32 tmp0,env,$0xfffffff8
>>>  movi_i32 tmp1,$0x0
>>>  brcond_i32 tmp0,tmp1,lt,$L0
>>>
>>>  ---- 00f22ca0
>>>  ld_vec v128,e8,tmp2,env,$0xd6b0
>>>  st_vec v128,e8,tmp2,env,$0xd4c0
>>
>> Yep, that looks right.
>>
>> As an aside, this does suggest to me that target/ppc might be well served in
>> moving the ppc_vsr_t members of CPUPPCState earlier, so that this offset is
>> smaller.
>>
>>>  movi_i32 nip,$0xf22ca4
>>>  movi_i32 nip,$0xf22ca4
>>>  movi_i32 tmp0,$0x10002
>>>  call raise_exception,$0x2,$0,env,tmp0
>>
>> And this, presumably is the single-step debug exception.
>>
>>> 0xa4e7f12c:  3c400000  lis      r2, 0
>>> 0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
>>> 0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2
>>
>>> 0xa4e7f138:  3c400000  lis      r2, 0
>>> 0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
>>> 0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2
>>
>> These also look correct.  Form an offset into r2, load or store from env+r2.
>>
>> This also shows what I mention above re offset.  For a ppc host, the offset is
>> large enough to require two instructions to form them.
>>
>>> Any ideas what might be going on here?
>>
>> What is the observed problem that makes you think that this is the incorrect
>> instruction?
>
> What I've been doing is set a breakpoint a few instructions before and then issuing
> "stepi" commands via the gdbstub. As I step over the "vor v0, v31, v31" instruction
> then either the qemu-system-ppc process segfaults outside of gdb, or inside gdb it
> goes to bg. Bringing it back to fg just causes gdb to get confused and in the end the
> only thing I can do is kill the gdb process.
>
> On the plus side I've managed to work out where we are faulting by hacking the load
> and store functions to inject trap opcodes in the ld_vec and st_vec and it appears
> that we are segfaulting here:
>
> OUT: [size=96]
> 0xa4e7f120:  81dbfff8  lwz      r14, -8(r27)
> 0xa4e7f124:  2f8e0000  cmpwi    cr7, r14, 0
> 0xa4e7f128:  419c004c  blt      cr7, 0xa4e7f174
> 0xa4e7f12c:  3c400000  lis      r2, 0
>                       ^^^^^^^^^^^^^^
> 0xa4e7f130:  6042d6b0  ori      r2, r2, 0xd6b0
> 0xa4e7f134:  7c5b10ce  lvx      v2, r27, r2
> 0xa4e7f138:  3c400000  lis      r2, 0
> 0xa4e7f13c:  6042d4c0  ori      r2, r2, 0xd4c0
> 0xa4e7f140:  7c5b11ce  stvx     v2, r27, r2
> 0xa4e7f144:  3dc000f2  lis      r14, 0xf2
> 0xa4e7f148:  61ce2ca4  ori      r14, r14, 0x2ca4
> 0xa4e7f14c:  91db016c  stw      r14, 0x16c(r27)
> 0xa4e7f150:  7f63db78  mr       r3, r27
> 0xa4e7f154:  3c800001  lis      r4, 1
> 0xa4e7f158:  60840002  ori      r4, r4, 2
> 0xa4e7f15c:  3c000087  lis      r0, 0x87
> 0xa4e7f160:  6000b618  ori      r0, r0, 0xb618
> 0xa4e7f164:  7c0903a6  mtctr    r0
> 0xa4e7f168:  4e800421  bctrl
> 0xa4e7f16c:  38600000  li       r3, 0
> 0xa4e7f170:  4bfffef0  b        0xa4e7f060
> 0xa4e7f174:  3c60a4e7  lis      r3, -0x5b19
> 0xa4e7f178:  6063f0c3  ori      r3, r3, 0xf0c3
> 0xa4e7f17c:  4bfffee4  b        0xa4e7f060
>
> Interestingly if I set a trap and then switch the opcode to "lis r4,0" (0x3c800000)
> then we carry on as normal until the next "lis r2,0" instruction. Looking through the
> whole output of -d out_asm this is the first mention of r2 which makes me wonder if
> it is special somehow? At least a quick search indicates that for 32-bit PPC r2 is
> supposed to be dedicated as a TOC pointer.

According to a PowerPC ABI doc:
http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf
r2 is system reserved and should not be used by application code, and
another one (probably the same one you were referring to) mentions the TOC:
https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG
I'm not sure if that's relevant for the above, but it looks like clobbering
r2 might cause problems.

Regards,
BALATON Zoltan


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26 17:00                     ` Mark Cave-Ayland
  2019-06-26 18:18                       ` BALATON Zoltan
@ 2019-06-26 18:42                       ` Richard Henderson
  2019-06-26 19:38                         ` Mark Cave-Ayland
  1 sibling, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-06-26 18:42 UTC (permalink / raw)
  To: Mark Cave-Ayland, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 6/26/19 7:00 PM, Mark Cave-Ayland wrote:
> Interestingly if I set a trap and then switch the opcode to "lis r4,0" (0x3c800000)
> then we carry on as normal until the next "lis r2,0" instruction. Looking through the
> whole output of -d out_asm this is the first mention of r2 which makes me wonder if
> it is special somehow? At least a quick search indicates that for 32-bit PPC r2 is
> supposed to be dedicated as a TOC pointer.
> 
> Is there a quick way to disable r2 from the list of available registers to see if
> that gets things going?

Interesting.  I'm not sure why that's happening.

As a quick hack,


  /* For some memory operations, we need a scratch that isn't R0.  For the AIX
     calling convention, we can re-use the TOC register since we'll be reloading
     it at every call.  Otherwise R12 will do nicely as neither a call-saved
     register nor a parameter register.  */
- #ifdef _CALL_AIX
+ #if 0
  # define TCG_REG_TMP1   TCG_REG_R2
  #else
  # define TCG_REG_TMP1   TCG_REG_R12
  #endif


But I thought that _CALL_AIX was only defined for ppc64 elf version 1.  I
thought that ppc32 used _CALL_SYSV instead.  Certainly that's what is used
elsewhere...


r~


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26 18:42                       ` Richard Henderson
@ 2019-06-26 19:38                         ` Mark Cave-Ayland
  2019-06-26 20:21                           ` BALATON Zoltan
  2019-06-27 17:24                           ` Mark Cave-Ayland
  0 siblings, 2 replies; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-26 19:38 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

On 26/06/2019 19:42, Richard Henderson wrote:

> On 6/26/19 7:00 PM, Mark Cave-Ayland wrote:
>> Interestingly if I set a trap and then switch the opcode to "lis r4,0" (0x3c800000)
>> then we carry on as normal until the next "lis r2,0" instruction. Looking through the
>> whole output of -d out_asm this is the first mention of r2 which makes me wonder if
>> it is special somehow? At least a quick search indicates that for 32-bit PPC r2 is
>> supposed to be dedicated as a TOC pointer.
>>
>> Is there a quick way to disable r2 from the list of available registers to see if
>> that gets things going?
> 
> Interesting.  I'm not sure why that's happening.
> 
> As a quick hack,
> 
> 
>   /* For some memory operations, we need a scratch that isn't R0.  For the AIX
>      calling convention, we can re-use the TOC register since we'll be reloading
>      it at every call.  Otherwise R12 will do nicely as neither a call-saved
>      register nor a parameter register.  */
> - #ifdef _CALL_AIX
> + #if 0
>   # define TCG_REG_TMP1   TCG_REG_R2
>   #else
>   # define TCG_REG_TMP1   TCG_REG_R12
>   #endif
> 
> 
> But I thought that _CALL_AIX was only defined for ppc64 elf version 1.  I
> thought that ppc32 used _CALL_SYSV instead.  Certainly that's what is used
> elsewhere...

No, that didn't work either. I've confirmed using #ifdef _CALL_AIX #error ERROR
#endif that _CALL_AIX is *NOT* defined and _CALL_SYSV *is* defined.

I've also tried removing TCG_REG_R2 from tcg_target_reg_alloc_order[] and
tcg_regset_set_reg() for TCG_REG_R2 from tcg_target_init() and I'm still generating
bad code that writes to r2(!).

Since I can't find any other mentions of TCG_REG_TMP1 and TCG_REG_R2 that aren't
inside an #ifdef _CALL_AIX ... #endif section I'm starting to get stuck. Is there any
chance that the R_PPC_ADDR32 change could be causing this at all?


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26 19:38                         ` Mark Cave-Ayland
@ 2019-06-26 20:21                           ` BALATON Zoltan
  2019-06-27 17:24                           ` Mark Cave-Ayland
  1 sibling, 0 replies; 34+ messages in thread
From: BALATON Zoltan @ 2019-06-26 20:21 UTC (permalink / raw)
  To: Mark Cave-Ayland
  Cc: Howard Spoelstra, Richard Henderson, QEMU Developers,
	Aleksandar Markovic, David Gibson

On Wed, 26 Jun 2019, Mark Cave-Ayland wrote:
> On 26/06/2019 19:42, Richard Henderson wrote:
>
>> On 6/26/19 7:00 PM, Mark Cave-Ayland wrote:
>>> Interestingly if I set a trap and then switch the opcode to "lis r4,0" (0x3c800000)
>>> then we carry on as normal until the next "lis r2,0" instruction. Looking through the
>>> whole output of -d out_asm this is the first mention of r2 which makes me wonder if
>>> it is special somehow? At least a quick search indicates that for 32-bit PPC r2 is
>>> supposed to be dedicated as a TOC pointer.
>>>
>>> Is there a quick way to disable r2 from the list of available registers to see if
>>> that gets things going?
>>
>> Interesting.  I'm not sure why that's happening.
>>
>> As a quick hack,
>>
>>
>>   /* For some memory operations, we need a scratch that isn't R0.  For the AIX
>>      calling convention, we can re-use the TOC register since we'll be reloading
>>      it at every call.  Otherwise R12 will do nicely as neither a call-saved
>>      register nor a parameter register.  */
>> - #ifdef _CALL_AIX
>> + #if 0
>>   # define TCG_REG_TMP1   TCG_REG_R2
>>   #else
>>   # define TCG_REG_TMP1   TCG_REG_R12
>>   #endif
>>
>>
>> But I thought that _CALL_AIX was only defined for ppc64 elf version 1.  I
>> thought that ppc32 used _CALL_SYSV instead.  Certainly that's what is used
>> elsewhere...
>
> No, that didn't work either. I've confirmed using #ifdef _CALL_AIX #error ERROR
> #endif that _CALL_AIX is *NOT* defined and _CALL_SYSV *is* defined.
>
> I've also tried removing TCG_REG_R2 from tcg_target_reg_alloc_order[] and
> tcg_regset_set_reg() for TCG_REG_R2 from tcg_target_init() and I'm still generating
> bad code that writes to r2(!).
>
> Since I can't find any other mentions of TCG_REG_TMP1 and TCG_REG_R2 that isn't
> inside an #ifdef _CALL_AIX ... #endif section I'm starting to get stuck. Is there any
> chance that the R_PPC_ADDR32 change could be causing this at all?

There's one more mention of TCG_REG_R2 in tcg-target.inc.c, in the
tcg_target_init() function, where it's set as call clobbered but then later
in the same function set again as reserved if _CALL_SYSV or 64 bits. Not sure if the
later tcg_regset_set_reg overrides the first one, or whether that should be removed
or put in an #else of the later conditional call. (I still don't know what
I'm talking about, just throwing out random ideas.)

Regards,
BALATON Zoltan


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-26 19:38                         ` Mark Cave-Ayland
  2019-06-26 20:21                           ` BALATON Zoltan
@ 2019-06-27 17:24                           ` Mark Cave-Ayland
  2019-06-27 17:51                             ` Richard Henderson
  1 sibling, 1 reply; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-27 17:24 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 26/06/2019 20:38, Mark Cave-Ayland wrote:

>> But I thought that _CALL_AIX was only defined for ppc64 elf version 1.  I
>> thought that ppc32 used _CALL_SYSV instead.  Certainly that's what is used
>> elsewhere...
> 
> No, that didn't work either. I've confirmed using #ifdef _CALL_AIX #error ERROR
> #endif that _CALL_AIX is *NOT* defined and _CALL_SYSV *is* defined.
> 
> I've also tried removing TCG_REG_R2 from tcg_target_reg_alloc_order[] and
> tcg_regset_set_reg() for TCG_REG_R2 from tcg_target_init() and I'm still generating
> bad code that writes to r2(!).
> 
> Since I can't find any other mentions of TCG_REG_TMP1 and TCG_REG_R2 that isn't
> inside an #ifdef _CALL_AIX ... #endif section I'm starting to get stuck. Is there any
> chance that the R_PPC_ADDR32 change could be causing this at all?

So after a lot more digging: the issue can be seen in tcg_out_ld() and tcg_out_st()
for the vector registers. Taking tcg_out_ld() as an example:

    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 32);
        assert((offset & 15) == 0);
        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset);
        break;

For the TCG_TYPE_V128 case we have ret = TCG_REG_V2 but (ret & 31) masks off the top
bit which converts this to TCG_REG_R2 and that's why tcg_out_mem_long() starts using
r2 to calculate offsets.

Assuming that rt is the temporary register used to calculate the address then the
patch below tentatively appears to get things working again by passing in
TCG_REG_TMP1 instead, but ultimately I still see a crash much later when trying to
boot MacOS 9:

diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 61732c1f45..dd823447cc 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -1139,7 +1139,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
     case TCG_TYPE_V64:
         tcg_debug_assert(ret >= 32);
         assert((offset & 7) == 0);
-        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset & -16);
+        tcg_out_mem_long(s, 0, LVX, TCG_REG_TMP1, base, offset & -16);
         if (offset & 8) {
             tcg_out_vsldoi(s, ret, ret, ret, 8);
         }
@@ -1147,7 +1147,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
     case TCG_TYPE_V128:
         tcg_debug_assert(ret >= 32);
         assert((offset & 15) == 0);
-        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset);
+        tcg_out_mem_long(s, 0, LVX, TCG_REG_TMP1, base, offset);
         break;
     default:
         g_assert_not_reached();
@@ -1186,12 +1186,13 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
             tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, 8);
             arg = TCG_VEC_TMP1;
         }
-        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset);
-        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset + 4);
+        tcg_out_mem_long(s, 0, STVEWX, TCG_REG_TMP1, base, offset);
+        tcg_out_mem_long(s, 0, STVEWX, TCG_REG_TMP1, base, offset + 4);
         break;
     case TCG_TYPE_V128:
         tcg_debug_assert(arg >= 32);
-        tcg_out_mem_long(s, 0, STVX, arg & 31, base, offset);
+        assert((offset & 15) == 0);
+        tcg_out_mem_long(s, 0, STVX, TCG_REG_TMP1, base, offset);
         break;
     default:
         g_assert_not_reached();

Richard: even though it's still not perfect, does this look like it's fixing the
right problem? Presumably the reason this didn't break on your Power 9 box is because
the 64-bit ABI doesn't mark r2 as reserved?


ATB,

Mark.


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-27 17:24                           ` Mark Cave-Ayland
@ 2019-06-27 17:51                             ` Richard Henderson
  2019-06-27 17:54                               ` Richard Henderson
  0 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-06-27 17:51 UTC (permalink / raw)
  To: Mark Cave-Ayland, Aleksandar Markovic
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

[-- Attachment #1: Type: text/plain, Size: 1033 bytes --]

On 6/27/19 7:24 PM, Mark Cave-Ayland wrote:
> For the TCG_TYPE_V128 case we have ret = TCG_REG_V2 but (ret & 31) masks
> off the top bit which converts this to TCG_REG_R2 and that's why
> tcg_out_mem_long() starts using r2 to calculate offsets.

Oh geez.  Ok, I see it now.

>      case TCG_TYPE_V128:
>          tcg_debug_assert(ret >= 32);
>          assert((offset & 15) == 0);
> -        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset);
> +        tcg_out_mem_long(s, 0, LVX, TCG_REG_TMP1, base, offset);

No, here ret is the register into which we are loading.
Same for the rest.  The error is in tcg_out_mem_long in
trying to reuse the output register as a scratch.

> Presumably the reason this didn't break on your Power 9 box is because
> the 64-bit ABI doesn't mark r2 as reserved?

Correct.  That, plus the fact that V0 and V1 are reserved as temporaries, meant
I never attempted to use r1 (i.e. sp) as a temporary.

Please try the following patch on top and if it works I'll split it back into
the patch set properly.


r~

[-- Attachment #2: z --]
[-- Type: text/plain, Size: 1736 bytes --]

diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index 2ae537461f..b61c7ea0f1 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -6124,27 +6124,34 @@ static void m68k_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
 
-    if (dc->base.is_jmp == DISAS_NORETURN) {
-        return;
-    }
-    if (dc->base.singlestep_enabled) {
-        gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG));
-        return;
-    }
-
     switch (dc->base.is_jmp) {
+    case DISAS_NORETURN:
+        break;
     case DISAS_TOO_MANY:
         update_cc_op(dc);
-        gen_jmp_tb(dc, 0, dc->pc);
+        if (dc->base.singlestep_enabled) {
+            tcg_gen_movi_i32(QREG_PC, dc->pc);
+            gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG));
+        } else {
+            gen_jmp_tb(dc, 0, dc->pc);
+        }
         break;
     case DISAS_JUMP:
         /* We updated CC_OP and PC in gen_jmp/gen_jmp_im.  */
-        tcg_gen_lookup_and_goto_ptr();
+        if (dc->base.singlestep_enabled) {
+            gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG));
+        } else {
+            tcg_gen_lookup_and_goto_ptr();
+        }
         break;
     case DISAS_EXIT:
         /* We updated CC_OP and PC in gen_exit_tb, but also modified
            other state that may require returning to the main loop.  */
-        tcg_gen_exit_tb(NULL, 0);
+        if (dc->base.singlestep_enabled) {
+            gen_helper_raise_exception(cpu_env, tcg_const_i32(EXCP_DEBUG));
+        } else {
+            tcg_gen_exit_tb(NULL, 0);
+        }
         break;
     default:
         g_assert_not_reached();

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-27 17:51                             ` Richard Henderson
@ 2019-06-27 17:54                               ` Richard Henderson
  2019-06-27 18:21                                 ` Mark Cave-Ayland
  0 siblings, 1 reply; 34+ messages in thread
From: Richard Henderson @ 2019-06-27 17:54 UTC (permalink / raw)
  To: Mark Cave-Ayland, Aleksandar Markovic
  Cc: David Gibson, QEMU Developers, Howard Spoelstra

[-- Attachment #1: Type: text/plain, Size: 232 bytes --]

On 6/27/19 7:51 PM, Richard Henderson wrote:
> Please try the following patch on top and if it works I'll split it back into
> the patch set properly.

Dangit.  I generated the patch on the wrong machine.
Let's try that again.


r~

[-- Attachment #2: z --]
[-- Type: text/plain, Size: 4823 bytes --]

diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 6cc56cf..e929df3 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -1132,7 +1132,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
         align = 3;
         /* FALLTHRU */
     default:
-        if (rt != TCG_REG_R0) {
+        if (rt > TCG_REG_R0 && rt < 32) {
             rs = rt;
             break;
         }
@@ -1161,7 +1161,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
         }
         tcg_debug_assert(!is_int_store || rs != rt);
         tcg_out_movi(s, TCG_TYPE_PTR, rs, orig);
-        tcg_out32(s, opx | TAB(rt, base, rs));
+        tcg_out32(s, opx | TAB(rt & 31, base, rs));
         return;
     }
 
@@ -1182,7 +1182,7 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
         base = rs;
     }
     if (opi != ADDI || base != rt || l0 != 0) {
-        tcg_out32(s, opi | TAI(rt, base, l0));
+        tcg_out32(s, opi | TAI(rt & 31, base, l0));
     }
 }
 
@@ -1204,11 +1204,11 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
             break;
         }
         if (have_isa_2_07_vsx) {
-            tcg_out_mem_long(s, 0, LXSIWZX | 1, ret & 31, base, offset);
+            tcg_out_mem_long(s, 0, LXSIWZX | 1, ret, base, offset);
             break;
         }
         assert((offset & 3) == 0);
-        tcg_out_mem_long(s, 0, LVEWX, ret & 31, base, offset);
+        tcg_out_mem_long(s, 0, LVEWX, ret, base, offset);
         shift = (offset - 4) & 0xc;
         if (shift) {
             tcg_out_vsldoi(s, ret, ret, ret, shift);
@@ -1224,11 +1224,11 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
         tcg_debug_assert(ret >= 32);
         if (have_isa_2_06_vsx) {
             tcg_out_mem_long(s, have_isa_3_00_vsx ? LXSD : 0, LXSDX | 1,
-                             ret & 31, base, offset);
+                             ret, base, offset);
             break;
         }
         assert((offset & 7) == 0);
-        tcg_out_mem_long(s, 0, LVX, ret & 31, base, offset & -16);
+        tcg_out_mem_long(s, 0, LVX, ret, base, offset & -16);
         if (offset & 8) {
             tcg_out_vsldoi(s, ret, ret, ret, 8);
         }
@@ -1237,7 +1237,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
         tcg_debug_assert(ret >= 32);
         assert((offset & 15) == 0);
         tcg_out_mem_long(s, have_isa_3_00_vsx ? LXV | 8 : 0, LVX,
-                         ret & 31, base, offset);
+                         ret, base, offset);
         break;
     default:
         g_assert_not_reached();
@@ -1256,7 +1256,7 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
             break;
         }
         if (have_isa_2_07_vsx) {
-            tcg_out_mem_long(s, 0, STXSIWX | 1, arg & 31, base, offset);
+            tcg_out_mem_long(s, 0, STXSIWX | 1, arg, base, offset);
             break;
         }
         assert((offset & 3) == 0);
@@ -1265,7 +1265,7 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
             tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, shift);
             arg = TCG_VEC_TMP1;
         }
-        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset);
+        tcg_out_mem_long(s, 0, STVEWX, arg, base, offset);
         break;
     case TCG_TYPE_I64:
         if (arg < 32) {
@@ -1277,7 +1277,7 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
         tcg_debug_assert(arg >= 32);
         if (have_isa_2_06_vsx) {
             tcg_out_mem_long(s, have_isa_3_00_vsx ? STXSD : 0,
-                             STXSDX | 1, arg & 31, base, offset);
+                             STXSDX | 1, arg, base, offset);
             break;
         }
         assert((offset & 7) == 0);
@@ -1285,13 +1285,13 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
             tcg_out_vsldoi(s, TCG_VEC_TMP1, arg, arg, 8);
             arg = TCG_VEC_TMP1;
         }
-        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset);
-        tcg_out_mem_long(s, 0, STVEWX, arg & 31, base, offset + 4);
+        tcg_out_mem_long(s, 0, STVEWX, arg, base, offset);
+        tcg_out_mem_long(s, 0, STVEWX, arg, base, offset + 4);
         break;
     case TCG_TYPE_V128:
         tcg_debug_assert(arg >= 32);
         tcg_out_mem_long(s, have_isa_3_00_vsx ? STXV | 8 : 0, STVX,
-                         arg & 31, base, offset);
+                         arg, base, offset);
         break;
     default:
         g_assert_not_reached();
@@ -3075,7 +3075,6 @@ static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
     int elt;
 
     tcg_debug_assert(out >= 32);
-    out &= 31;
     switch (vece) {
     case MO_8:
         if (have_isa_3_00_vsx) {

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes
  2019-06-27 17:54                               ` Richard Henderson
@ 2019-06-27 18:21                                 ` Mark Cave-Ayland
  0 siblings, 0 replies; 34+ messages in thread
From: Mark Cave-Ayland @ 2019-06-27 18:21 UTC (permalink / raw)
  To: Richard Henderson, Aleksandar Markovic
  Cc: Howard Spoelstra, QEMU Developers, David Gibson

On 27/06/2019 18:54, Richard Henderson wrote:

> On 6/27/19 7:51 PM, Richard Henderson wrote:
>> Please try the following patch on top and if it works I'll split it back into
>> the patch set properly.
> 
> Dangit.  I generated the patch on the wrong machine.
> Let's try that again.

Yes it works! Or at least so far it has survived a boot into the MacOS 9 desktop
which is fairly good at exercising all sorts of strange edge cases...

If you're going to resend the patchset, don't forget to squash "tcg/ppc: Support
vector dup2" into "tcg/ppc: Initial backend support for Altivec" to preserve
bisectability on 32-bit PPC hosts when configuring with "--enable-debug-tcg".


ATB,

Mark.


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2019-06-27 18:23 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-19  4:15 [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 1/7] tcg/ppc: Initial backend support for Altivec Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 2/7] tcg/ppc: Support vector shift by immediate Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 3/7] tcg/ppc: Support vector multiply Richard Henderson
2019-05-19  5:05   ` Aleksandar Markovic
2019-05-19 14:45     ` Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 4/7] tcg/ppc: Support vector dup2 Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 5/7] tcg/ppc: Update vector support to v2.06 Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 6/7] tcg/ppc: Update vector support to v2.07 Richard Henderson
2019-05-19  4:15 ` [Qemu-devel] [PATCH v4 7/7] tcg/ppc: Update vector support to v3.00 Richard Henderson
2019-06-18  5:00 ` [Qemu-devel] [PATCH v4 0/7] tcg/ppc: Add vector opcodes Richard Henderson
2019-06-19  5:07   ` Mark Cave-Ayland
2019-06-20 11:51     ` Howard Spoelstra
2019-06-22 14:20     ` Mark Cave-Ayland
2019-06-22 15:01       ` Mark Cave-Ayland
2019-06-23 17:10         ` Aleksandar Markovic
2019-06-25  6:56           ` Richard Henderson
2019-06-25 15:37             ` Mark Cave-Ayland
2019-06-25 15:56               ` Richard Henderson
2019-06-25 17:55                 ` Mark Cave-Ayland
2019-06-26  7:45                   ` Richard Henderson
2019-06-26 17:00                     ` Mark Cave-Ayland
2019-06-26 18:18                       ` BALATON Zoltan
2019-06-26 18:42                       ` Richard Henderson
2019-06-26 19:38                         ` Mark Cave-Ayland
2019-06-26 20:21                           ` BALATON Zoltan
2019-06-27 17:24                           ` Mark Cave-Ayland
2019-06-27 17:51                             ` Richard Henderson
2019-06-27 17:54                               ` Richard Henderson
2019-06-27 18:21                                 ` Mark Cave-Ayland
2019-06-26  8:33                 ` David Gibson
2019-06-26 15:25                   ` Richard Henderson
2019-06-25 18:01               ` Aleksandar Markovic
2019-06-19  8:11   ` David Gibson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).