From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PATCH for-8.0 07/29] accel/tcg: Honor atomicity of loads
Date: Fri, 18 Nov 2022 01:47:32 -0800
Message-ID: <20221118094754.242910-8-richard.henderson@linaro.org>
In-Reply-To: <20221118094754.242910-1-richard.henderson@linaro.org>

Create ldst_atomicity.c.inc.

Atomic handling is not required for user-only code loads, because we have
ensured that the page is read-only before beginning to translate code.
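
For illustration only (this snippet is not part of the patch; the
MO_ATOM_*/MO_ATMAX_* names come from "include/exec/memop: Add bits
describing atomicity" earlier in this series), a front end that wants an
8-byte load to be single-copy atomic only when it does not cross a
16-byte boundary might emit something like:

    /* Illustrative only; val, addr and mmu_idx are placeholders. */
    MemOp mop = MO_LEUQ | MO_ATOM_WITHIN16;
    tcg_gen_qemu_ld_i64(val, addr, mmu_idx, mop);

The helpers added below inspect those bits on the load path and choose
between one wide atomic load, several smaller atomic loads, or a plain
byte-wise load, falling back to cpu_loop_exit_atomic() when the host
cannot provide the required atomicity.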

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cputlb.c             | 174 ++++++++---
 accel/tcg/user-exec.c          |  26 +-
 accel/tcg/ldst_atomicity.c.inc | 546 +++++++++++++++++++++++++++++++++
 3 files changed, 695 insertions(+), 51 deletions(-)
 create mode 100644 accel/tcg/ldst_atomicity.c.inc
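
For reference, a worked example (not part of the patch) of the "one atomic
load of the whole aligned word, then extract" approach used by
do_ld_whole_be4() below.  For a 2-byte piece starting at host offset
o = addr & 3 = 1 within an aligned word whose bytes are {B0,B1,B2,B3}:

    x = cpu_to_be32(load_atomic4(haddr - o));  /* x = 0xB0B1B2B3 */
    x <<= o * 8;                               /* x = 0xB1B2B300 */
    x >>= (4 - size) * 8;                      /* x = 0x0000B1B2 (size = 2) */

A single atomic 4-byte load covers the piece; the shifts discard the bytes
before and after it, leaving the piece big-endian-justified in the low bits,
ready to be merged into ret_be.  do_ld_whole_be8() does the same with a
single atomic 8-byte load.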

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 5562fb82d6..cdc109b473 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1651,6 +1651,9 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, target_ulong addr,
     return qemu_ram_addr_from_host_nofail(p);
 }
 
+/* Load/store with atomicity primitives. */
+#include "ldst_atomicity.c.inc"
+
 #ifdef CONFIG_PLUGIN
 /*
  * Perform a TLB lookup and populate the qemu_plugin_hwaddr structure.
@@ -2003,35 +2006,7 @@ static void validate_memop(MemOpIdx oi, MemOp expected)
  * specifically for reading instructions from system memory. It is
  * called by the translation loop and in some helpers where the code
  * is disassembled. It shouldn't be called directly by guest code.
- */
-
-typedef uint64_t FullLoadHelper(CPUArchState *env, target_ulong addr,
-                                MemOpIdx oi, uintptr_t retaddr);
-
-static inline uint64_t QEMU_ALWAYS_INLINE
-load_memop(const void *haddr, MemOp op)
-{
-    switch (op) {
-    case MO_UB:
-        return ldub_p(haddr);
-    case MO_BEUW:
-        return lduw_be_p(haddr);
-    case MO_LEUW:
-        return lduw_le_p(haddr);
-    case MO_BEUL:
-        return (uint32_t)ldl_be_p(haddr);
-    case MO_LEUL:
-        return (uint32_t)ldl_le_p(haddr);
-    case MO_BEUQ:
-        return ldq_be_p(haddr);
-    case MO_LEUQ:
-        return ldq_le_p(haddr);
-    default:
-        qemu_build_not_reached();
-    }
-}
-
-/*
+ *
  * For the benefit of TCG generated code, we want to avoid the
  * complication of ABI-specific return type promotion and always
  * return a value extended to the register size of the host. This is
@@ -2087,17 +2062,134 @@ static uint64_t do_ld_bytes_beN(MMULookupPageData *p, uint64_t ret_be)
     return ret_be;
 }
 
+/**
+ * do_ld_parts_beN
+ * @p: translation parameters
+ * @ret_be: accumulated data
+ *
+ * As do_ld_bytes_beN, but atomically on each aligned part.
+ */
+static uint64_t do_ld_parts_beN(MMULookupPageData *p, uint64_t ret_be)
+{
+    void *haddr = p->haddr;
+    int size = p->size;
+
+    do {
+        uint64_t x;
+        int n;
+
+        /*
+         * Find minimum of alignment and size.
+         * This is slightly stronger than required by MO_ATOM_SUBALIGN, which
+         * would have only checked the low bits of addr|size once at the start,
+         * but is just as easy.
+         */
+        switch (((uintptr_t)haddr | size) & 7) {
+        case 4:
+            x = cpu_to_be32(load_atomic4(haddr));
+            ret_be = (ret_be << 32) | x;
+            n = 4;
+            break;
+        case 2:
+        case 6:
+            x = cpu_to_be16(load_atomic2(haddr));
+            ret_be = (ret_be << 16) | x;
+            n = 2;
+            break;
+        default:
+            x = *(uint8_t *)haddr;
+            ret_be = (ret_be << 8) | x;
+            n = 1;
+            break;
+        case 0:
+            g_assert_not_reached();
+        }
+        haddr += n;
+        size -= n;
+    } while (size != 0);
+    return ret_be;
+}
+
+/**
+ * do_ld_whole_be4
+ * @p: translation parameters
+ * @ret_be: accumulated data
+ *
+ * As do_ld_bytes_beN, but with one atomic load.
+ * Four aligned bytes are guaranteed to cover the load.
+ */
+static uint64_t do_ld_whole_be4(MMULookupPageData *p, uint64_t ret_be)
+{
+    int o = p->addr & 3;
+    uint32_t x = load_atomic4(p->haddr - o);
+
+    x = cpu_to_be32(x);
+    x <<= o * 8;
+    x >>= (4 - p->size) * 8;
+    return (ret_be << (p->size * 8)) | x;
+}
+
+/**
+ * do_ld_whole_be8
+ * @p: translation parameters
+ * @ret_be: accumulated data
+ *
+ * As do_ld_bytes_beN, but with one atomic load.
+ * Eight aligned bytes are guaranteed to cover the load.
+ */
+static uint64_t do_ld_whole_be8(CPUArchState *env, uintptr_t ra,
+                                MMULookupPageData *p, uint64_t ret_be)
+{
+    int o = p->addr & 7;
+    uint64_t x = load_atomic8_or_exit(env, ra, p->haddr - o);
+
+    x = cpu_to_be64(x);
+    x <<= o * 8;
+    x >>= (8 - p->size) * 8;
+    return (ret_be << (p->size * 8)) | x;
+}
+
 /*
  * Wrapper for the above.
  */
 static uint64_t do_ld_beN(CPUArchState *env, MMULookupPageData *p,
-                          uint64_t ret_be, int mmu_idx,
-                          MMUAccessType type, uintptr_t ra)
+                          uint64_t ret_be, int mmu_idx, MMUAccessType type,
+                          MemOp mop, uintptr_t ra)
 {
+    MemOp atmax;
+
     if (unlikely(p->flags & TLB_MMIO)) {
         return do_ld_mmio_beN(env, p, ret_be, mmu_idx, type, ra);
-    } else {
+    }
+
+    switch (mop & MO_ATOM_MASK) {
+    case MO_ATOM_WITHIN16:
+        /*
+         * It is a given that we cross a page and therefore there is no
+         * atomicity for the load as a whole, but there may be a subobject
+         * as defined by ATMAX which does not cross a 16-byte boundary.
+         */
+        atmax = mop & MO_ATMAX_MASK;
+        if (atmax == MO_ATMAX_SIZE) {
+            atmax = mop & MO_SIZE;
+        } else {
+            atmax >>= MO_ATMAX_SHIFT;
+        }
+        if (unlikely(p->size >= (1 << atmax))) {
+            if (!HAVE_al8_fast && p->size < 4) {
+                return do_ld_whole_be4(p, ret_be);
+            } else {
+                return do_ld_whole_be8(env, ra, p, ret_be);
+            }
+        }
+        /* fall through */
+    case MO_ATOM_IFALIGN:
+    case MO_ATOM_NONE:
         return do_ld_bytes_beN(p, ret_be);
+    case MO_ATOM_SUBALIGN:
+        return do_ld_parts_beN(p, ret_be);
+    default:
+        g_assert_not_reached();
     }
 }
 
@@ -2147,7 +2239,7 @@ static uint16_t do_ld2_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi,
                            access_type, l.memop);
         } else {
             /* Perform the load host endian, then swap if necessary. */
-            ret = load_memop(l.page[0].haddr, MO_UW);
+            ret = load_atom_2(env, ra, l.page[0].haddr, l.memop);
             if (l.memop & MO_BSWAP) {
                 ret = bswap16(ret);
             }
@@ -2200,15 +2292,17 @@ static uint32_t do_ld4_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi,
                            access_type, l.memop);
         } else {
             /* Perform the load host endian. */
-            ret = load_memop(l.page[0].haddr, MO_UL);
+            ret = load_atom_4(env, ra, l.page[0].haddr, l.memop);
             if (l.memop & MO_BSWAP) {
                 ret = bswap32(ret);
             }
         }
     } else {
         assert_no_tlb_bswap;
-        ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, ra);
-        ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, ra);
+        ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx,
+                        access_type, l.memop, ra);
+        ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx,
+                        access_type, l.memop, ra);
         if ((l.memop & MO_BSWAP) == MO_LE) {
             ret = bswap32(ret);
         }
@@ -2247,15 +2341,17 @@ static uint64_t do_ld8_mmu(CPUArchState *env, target_ulong addr, MemOpIdx oi,
                            access_type, l.memop);
         } else {
             /* Perform the load host endian. */
-            ret = load_memop(l.page[0].haddr, MO_UQ);
+            ret = load_atom_8(env, ra, l.page[0].haddr, l.memop);
             if (l.memop & MO_BSWAP) {
                 ret = bswap64(ret);
             }
         }
     } else {
         assert_no_tlb_bswap;
-        ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx, access_type, ra);
-        ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx, access_type, ra);
+        ret = do_ld_beN(env, &l.page[0], 0, l.mmu_idx,
+                        access_type, l.memop, ra);
+        ret = do_ld_beN(env, &l.page[1], ret, l.mmu_idx,
+                        access_type, l.memop, ra);
         if ((l.memop & MO_BSWAP) == MO_LE) {
             ret = bswap64(ret);
         }
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index a52c7ef826..ec721e5097 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -296,6 +296,8 @@ static void *cpu_mmu_lookup(CPUArchState *env, target_ulong addr,
     return ret;
 }
 
+#include "ldst_atomicity.c.inc"
+
 uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr,
                     MemOpIdx oi, uintptr_t ra)
 {
@@ -318,10 +320,10 @@ uint16_t cpu_ldw_be_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_BEUW);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_be_p(haddr);
+    ret = load_atom_2(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_be16(ret);
 }
 
 uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
@@ -332,10 +334,10 @@ uint32_t cpu_ldl_be_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_BEUL);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_be_p(haddr);
+    ret = load_atom_4(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_be32(ret);
 }
 
 uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
@@ -346,10 +348,10 @@ uint64_t cpu_ldq_be_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_BEUQ);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_be_p(haddr);
+    ret = load_atom_8(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_be64(ret);
 }
 
 uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
@@ -360,10 +362,10 @@ uint16_t cpu_ldw_le_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_LEUW);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = lduw_le_p(haddr);
+    ret = load_atom_2(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_le16(ret);
 }
 
 uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
@@ -374,10 +376,10 @@ uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_LEUL);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldl_le_p(haddr);
+    ret = load_atom_4(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_le32(ret);
 }
 
 uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
@@ -388,10 +390,10 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
 
     validate_memop(oi, MO_LEUQ);
     haddr = cpu_mmu_lookup(env, addr, oi, ra, MMU_DATA_LOAD);
-    ret = ldq_le_p(haddr);
+    ret = load_atom_8(env, ra, haddr, get_memop(oi));
     clear_helper_retaddr();
     qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
-    return ret;
+    return cpu_to_le64(ret);
 }
 
 Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
diff --git a/accel/tcg/ldst_atomicity.c.inc b/accel/tcg/ldst_atomicity.c.inc
new file mode 100644
index 0000000000..decc9a2a16
--- /dev/null
+++ b/accel/tcg/ldst_atomicity.c.inc
@@ -0,0 +1,546 @@
+/*
+ * Routines common to user and system emulation of load/store.
+ *
+ *  Copyright (c) 2022 Linaro, Ltd.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifdef CONFIG_ATOMIC64
+# define HAVE_al8          true
+#else
+# define HAVE_al8          false
+#endif
+#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
+
+#if defined(CONFIG_ATOMIC128)
+# define HAVE_al16_fast    true
+#else
+# define HAVE_al16_fast    false
+#endif
+
+/**
+ * required_atomicity:
+ *
+ * Return the lg2 bytes of atomicity required by @memop for @p.
+ * If the operation must be split into two operations to be
+ * examined separately for atomicity, return -lg2.
+ */
+static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
+{
+    int atmax = memop & MO_ATMAX_MASK;
+    int size = memop & MO_SIZE;
+    unsigned tmp;
+
+    if (atmax == MO_ATMAX_SIZE) {
+        atmax = size;
+    } else {
+        atmax >>= MO_ATMAX_SHIFT;
+    }
+
+    switch (memop & MO_ATOM_MASK) {
+    case MO_ATOM_IFALIGN:
+        tmp = (1 << atmax) - 1;
+        if (p & tmp) {
+            return MO_8;
+        }
+        break;
+    case MO_ATOM_NONE:
+        return MO_8;
+    case MO_ATOM_SUBALIGN:
+        tmp = p & -p;
+        if (tmp != 0 && tmp < atmax) {
+            atmax = tmp;
+        }
+        break;
+    case MO_ATOM_WITHIN16:
+        tmp = p & 15;
+        if (tmp + (1 << size) <= 16) {
+            atmax = size;
+        } else if (atmax < size && tmp + (1 << atmax) != 16) {
+            /*
+             * Paired load/store, where the pairs aren't aligned.
+             * One of the two must still be handled atomically.
+             */
+            atmax = -atmax;
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    /*
+     * Here we have the architectural atomicity of the operation.
+     * However, when executing in a serial context, we need no extra
+     * host atomicity in order to avoid racing.  This reduction
+     * avoids looping with cpu_loop_exit_atomic.
+     */
+    if (cpu_in_serial_context(env_cpu(env))) {
+        return MO_8;
+    }
+    return atmax;
+}
+
+/**
+ * load_atomic2:
+ * @pv: host address
+ *
+ * Atomically load 2 aligned bytes from @pv.
+ */
+static inline uint16_t load_atomic2(void *pv)
+{
+    uint16_t *p = __builtin_assume_aligned(pv, 2);
+    return qatomic_read(p);
+}
+
+/**
+ * load_atomic4:
+ * @pv: host address
+ *
+ * Atomically load 4 aligned bytes from @pv.
+ */
+static inline uint32_t load_atomic4(void *pv)
+{
+    uint32_t *p = __builtin_assume_aligned(pv, 4);
+    return qatomic_read(p);
+}
+
+/**
+ * load_atomic8:
+ * @pv: host address
+ *
+ * Atomically load 8 aligned bytes from @pv.
+ */
+static inline uint64_t load_atomic8(void *pv)
+{
+    uint64_t *p = __builtin_assume_aligned(pv, 8);
+
+    qemu_build_assert(HAVE_al8);
+    return qatomic_read__nocheck(p);
+}
+
+/**
+ * load_atomic16:
+ * @pv: host address
+ *
+ * Atomically load 16 aligned bytes from @pv.
+ */
+static inline Int128 load_atomic16(void *pv)
+{
+#ifdef CONFIG_ATOMIC128
+    __uint128_t *p = __builtin_assume_aligned(pv, 16);
+    Int128Alias r;
+
+    r.u = qatomic_read__nocheck(p);
+    return r.s;
+#else
+    qemu_build_not_reached();
+#endif
+}
+
+/**
+ * load_atomic8_or_exit:
+ * @env: cpu context
+ * @ra: host unwind address
+ * @pv: host address
+ *
+ * Atomically load 8 aligned bytes from @pv.
+ * If this is not possible, longjmp out to restart serially.
+ */
+static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
+{
+    uint64_t *p = __builtin_assume_aligned(pv, 8);
+
+    if (HAVE_al8) {
+        return load_atomic8(p);
+    }
+
+#ifdef CONFIG_USER_ONLY
+    /*
+     * If the page is not writable, then assume the value is immutable
+     * and requires no locking.  This ignores the case of MAP_SHARED with
+     * another process, because the fallback start_exclusive solution
+     * provides no protection across processes.
+     */
+    if (!page_check_range(h2g(p), 8, PAGE_WRITE)) {
+        return *p;
+    }
+#endif
+
+    /* Ultimate fallback: re-execute in serial context. */
+    cpu_loop_exit_atomic(env_cpu(env), ra);
+}
+
+/**
+ * load_atomic16_or_exit:
+ * @env: cpu context
+ * @ra: host unwind address
+ * @pv: host address
+ *
+ * Atomically load 16 aligned bytes from @pv.
+ * If this is not possible, longjmp out to restart serially.
+ */
+static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
+{
+    Int128 *p = __builtin_assume_aligned(pv, 16);
+
+    if (HAVE_al16_fast) {
+        return load_atomic16(p);
+    }
+
+#ifdef CONFIG_USER_ONLY
+    /*
+     * We can only use cmpxchg to emulate a load if the page is writable.
+     * If the page is not writable, then assume the value is immutable
+     * and requires no locking.  This ignores the case of MAP_SHARED with
+     * another process, because the fallback start_exclusive solution
+     * provides no protection across processes.
+     */
+    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
+        return *p;
+    }
+#endif
+
+    /*
+     * In system mode all guest pages are writable, and for user-only
+     * we have just checked writability.  Try cmpxchg.
+     */
+#if defined(CONFIG_CMPXCHG128)
+    /* Swap 0 with 0, with the side-effect of returning the old value. */
+    {
+        Int128Alias r;
+        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
+        return r.s;
+    }
+#endif
+
+    /* Ultimate fallback: re-execute in serial context. */
+    cpu_loop_exit_atomic(env_cpu(env), ra);
+}
+
+/**
+ * load_atom_extract_al4x2:
+ * @pv: host address
+ *
+ * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
+ */
+static uint32_t load_atom_extract_al4x2(void *pv)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int sh = (pi & 3) * 8;
+    uint32_t a, b;
+
+    pv = (void *)(pi & ~3);
+    a = load_atomic4(pv);
+    b = load_atomic4(pv + 4);
+
+    if (HOST_BIG_ENDIAN) {
+        return (a << sh) | (b >> (-sh & 31));
+    } else {
+        return (a >> sh) | (b << (-sh & 31));
+    }
+}
+
+/**
+ * load_atom_extract_al8x2:
+ * @pv: host address
+ *
+ * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
+ */
+static uint64_t load_atom_extract_al8x2(void *pv)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int sh = (pi & 7) * 8;
+    uint64_t a, b;
+
+    pv = (void *)(pi & ~7);
+    a = load_atomic8(pv);
+    b = load_atomic8(pv + 8);
+
+    if (HOST_BIG_ENDIAN) {
+        return (a << sh) | (b >> (-sh & 63));
+    } else {
+        return (a >> sh) | (b << (-sh & 63));
+    }
+}
+
+/**
+ * load_atom_extract_al8:
+ * @pv: host address
+ * @s: object size in bytes, @s <= 4.
+ *
+ * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
+ * not cross an 8-byte boundary.  This means that we can perform an atomic
+ * 8-byte load and extract.
+ * The value is returned in the low bits of a uint32_t.
+ */
+static uint32_t load_atom_extract_al8(void *pv, int s)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int o = pi & 7;
+    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
+
+    pv = (void *)(pi & ~7);
+    return load_atomic8(pv) >> shr;
+}
+
+/**
+ * load_atom_extract_al16_or_exit:
+ * @env: cpu context
+ * @ra: host unwind address
+ * @p: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Atomically load @s bytes from @p, when p % 16 < 8
+ * and p % 16 + s > 8.  I.e. does not cross a 16-byte
+ * boundary, but *does* cross an 8-byte boundary.
+ * This is the slow version, so we must have eliminated
+ * any faster load_atom_extract_al8 case.
+ *
+ * If this is not possible, longjmp out to restart serially.
+ */
+static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
+                                               void *pv, int s)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int o = pi & 7;
+    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
+    Int128 r;
+
+    /*
+     * Note constraints above: p & 8 must be clear.
+     * Provoke SIGBUS if possible otherwise.
+     */
+    pv = (void *)(pi & ~7);
+    r = load_atomic16_or_exit(env, ra, pv);
+
+    r = int128_urshift(r, shr);
+    return int128_getlo(r);
+}
+
+/**
+ * load_atom_extract_al16_or_al8:
+ * @p: host address
+ * @s: object size in bytes, @s <= 8.
+ *
+ * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
+ * cross a 16-byte boundary then the access must be 16-byte atomic,
+ * otherwise the access must be 8-byte atomic.
+ */
+static inline uint64_t load_atom_extract_al16_or_al8(void *pv, int s)
+{
+#if defined(CONFIG_ATOMIC128)
+    uintptr_t pi = (uintptr_t)pv;
+    int o = pi & 7;
+    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
+    __uint128_t r;
+
+    pv = (void *)(pi & ~7);
+    if (pi & 8) {
+        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
+        uint64_t a = qatomic_read__nocheck(p8);
+        uint64_t b = qatomic_read__nocheck(p8 + 1);
+
+        if (HOST_BIG_ENDIAN) {
+            r = ((__uint128_t)a << 64) | b;
+        } else {
+            r = ((__uint128_t)b << 64) | a;
+        }
+    } else {
+        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
+        r = qatomic_read__nocheck(p16);
+    }
+    return r >> shr;
+#else
+    qemu_build_not_reached();
+#endif
+}
+
+/**
+ * load_atom_4_by_2:
+ * @pv: host address
+ *
+ * Load 4 bytes from @pv, with two 2-byte atomic loads.
+ */
+static inline uint32_t load_atom_4_by_2(void *pv)
+{
+    uint32_t a = load_atomic2(pv);
+    uint32_t b = load_atomic2(pv + 2);
+
+    if (HOST_BIG_ENDIAN) {
+        return (a << 16) | b;
+    } else {
+        return (b << 16) | a;
+    }
+}
+
+/**
+ * load_atom_8_by_2:
+ * @pv: host address
+ *
+ * Load 8 bytes from @pv, with four 2-byte atomic loads.
+ */
+static inline uint64_t load_atom_8_by_2(void *pv)
+{
+    uint32_t a = load_atom_4_by_2(pv);
+    uint32_t b = load_atom_4_by_2(pv + 4);
+
+    if (HOST_BIG_ENDIAN) {
+        return ((uint64_t)a << 32) | b;
+    } else {
+        return ((uint64_t)b << 32) | a;
+    }
+}
+
+/**
+ * load_atom_8_by_4:
+ * @pv: host address
+ *
+ * Load 8 bytes from @pv, with two 4-byte atomic loads.
+ */
+static inline uint64_t load_atom_8_by_4(void *pv)
+{
+    uint32_t a = load_atomic4(pv);
+    uint32_t b = load_atomic4(pv + 4);
+
+    if (HOST_BIG_ENDIAN) {
+        return ((uint64_t)a << 32) | b;
+    } else {
+        return ((uint64_t)b << 32) | a;
+    }
+}
+
+/**
+ * load_atom_2:
+ * @p: host address
+ * @memop: the full memory op
+ *
+ * Load 2 bytes from @p, honoring the atomicity of @memop.
+ */
+static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
+                            void *pv, MemOp memop)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int atmax;
+
+    if (likely((pi & 1) == 0)) {
+        return load_atomic2(pv);
+    }
+    if (HAVE_al16_fast) {
+        return load_atom_extract_al16_or_al8(pv, 2);
+    }
+
+    atmax = required_atomicity(env, pi, memop);
+    switch (atmax) {
+    case MO_8:
+        return lduw_he_p(pv);
+    case MO_16:
+        /* The only case remaining is MO_ATOM_WITHIN16. */
+        if (!HAVE_al8_fast && (pi & 3) == 1) {
+            /* Big or little endian, we want the middle two bytes. */
+            return load_atomic4(pv - 1) >> 8;
+        }
+        if (unlikely((pi & 15) != 7)) {
+            return load_atom_extract_al8(pv, 2);
+        }
+        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/**
+ * load_atom_4:
+ * @p: host address
+ * @memop: the full memory op
+ *
+ * Load 4 bytes from @p, honoring the atomicity of @memop.
+ */
+static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
+                            void *pv, MemOp memop)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int atmax;
+
+    if (likely((pi & 3) == 0)) {
+        return load_atomic4(pv);
+    }
+    if (HAVE_al16_fast) {
+        return load_atom_extract_al16_or_al8(pv, 4);
+    }
+
+    atmax = required_atomicity(env, pi, memop);
+    switch (atmax) {
+    case MO_8:
+    case MO_16:
+    case -MO_16:
+        /*
+         * For MO_ATOM_IFALIGN, this is more atomicity than required,
+         * but it's trivially supported on all hosts, better than 4
+         * individual byte loads (when the host requires alignment),
+         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
+         */
+        return load_atom_extract_al4x2(pv);
+    case MO_32:
+        if (!(pi & 4)) {
+            return load_atom_extract_al8(pv, 4);
+        }
+        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
+    default:
+        g_assert_not_reached();
+    }
+}
+
+/**
+ * load_atom_8:
+ * @p: host address
+ * @memop: the full memory op
+ *
+ * Load 8 bytes from @p, honoring the atomicity of @memop.
+ */
+static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
+                            void *pv, MemOp memop)
+{
+    uintptr_t pi = (uintptr_t)pv;
+    int atmax;
+
+    /*
+     * If the host does not support 8-byte atomics, wait until we have
+     * examined the atomicity parameters below.
+     */
+    if (HAVE_al8 && likely((pi & 7) == 0)) {
+        return load_atomic8(pv);
+    }
+    if (HAVE_al16_fast) {
+        return load_atom_extract_al16_or_al8(pv, 8);
+    }
+
+    atmax = required_atomicity(env, pi, memop);
+    if (atmax == MO_64) {
+        if (!HAVE_al8 && (pi & 7) == 0) {
+            return load_atomic8_or_exit(env, ra, pv);
+        }
+        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
+    }
+    if (HAVE_al8_fast) {
+        return load_atom_extract_al8x2(pv);
+    }
+    switch (atmax) {
+    case MO_8:
+        return ldq_he_p(pv);
+    case MO_16:
+        return load_atom_8_by_2(pv);
+    case MO_32:
+        return load_atom_8_by_4(pv);
+    case -MO_32:
+        if (HAVE_al8) {
+            return load_atom_extract_al8x2(pv);
+        }
+        cpu_loop_exit_atomic(env_cpu(env), ra);
+    default:
+        g_assert_not_reached();
+    }
+}
-- 
2.34.1



Thread overview: 48+ messages
2022-11-18  9:47 [PATCH for-8.0 00/29] tcg: Improve atomicity support Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 01/29] include/qemu/cpuid: Introduce xgetbv_low Richard Henderson
2022-11-21 12:15   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 02/29] include/exec/memop: Add bits describing atomicity Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 03/29] accel/tcg: Add cpu_in_serial_context Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 04/29] accel/tcg: Introduce tlb_read_idx Richard Henderson
2022-11-21 12:25   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 05/29] accel/tcg: Reorg system mode load helpers Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 06/29] accel/tcg: Reorg system mode store helpers Richard Henderson
2022-11-18  9:47 ` Richard Henderson [this message]
2022-11-22 14:35   ` [PATCH for-8.0 07/29] accel/tcg: Honor atomicity of loads Peter Maydell
2022-11-22 18:04     ` Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 08/29] accel/tcg: Honor atomicity of stores Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 09/29] tcg/tci: Use cpu_{ld,st}_mmu Richard Henderson
2022-11-21 12:40   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 10/29] tcg: Unify helper_{be,le}_{ld,st}* Richard Henderson
2022-11-21 12:48   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 11/29] accel/tcg: Implement helper_{ld, st}*_mmu for user-only Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 12/29] tcg: Add 128-bit guest memory primitives Richard Henderson
2022-11-22  3:30   ` Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 13/29] meson: Detect atomic128 support with optimization Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 14/29] tcg/i386: Add have_atomic16 Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 15/29] include/qemu/int128: Add vector type to Int128Alias Richard Henderson
2022-11-21 23:45   ` Philippe Mathieu-Daudé
2022-11-22 18:21   ` Philippe Mathieu-Daudé
2022-11-22 18:31     ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 16/29] accel/tcg: Use have_atomic16 in ldst_atomicity.c.inc Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 17/29] tcg/aarch64: Add have_lse, have_lse2 Richard Henderson
2022-11-21 23:10   ` Philippe Mathieu-Daudé
2022-11-21 23:14     ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 18/29] accel/tcg: Add aarch64 specific support in ldst_atomicity Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 19/29] tcg: Introduce TCG_OPF_TYPE_MASK Richard Henderson
2022-11-21 16:12   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 20/29] tcg: Add INDEX_op_qemu_{ld,st}_i128 Richard Henderson
2022-11-21 22:59   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 21/29] tcg/i386: Introduce tcg_out_mov2 Richard Henderson
2022-11-21 16:21   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 22/29] tcg/i386: Introduce tcg_out_testi Richard Henderson
2022-11-21 16:22   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 23/29] tcg/i386: Use full load/store helpers in user-only mode Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 24/29] tcg/i386: Replace is64 with type in qemu_ld/st routines Richard Henderson
2022-11-21 16:27   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 25/29] tcg/i386: Mark Win64 call-saved vector regs as reserved Richard Henderson
2022-11-21 16:28   ` Philippe Mathieu-Daudé
2022-11-18  9:47 ` [PATCH for-8.0 26/29] tcg/i386: Examine MemOp for atomicity and alignment Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 27/29] tcg/i386: Support 128-bit load/store with have_atomic16 Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 28/29] tcg/i386: Add vex_v argument to tcg_out_vex_modrm_pool Richard Henderson
2022-11-18  9:47 ` [PATCH for-8.0 29/29] tcg/i386: Honor 64-bit atomicity in 32-bit mode Richard Henderson
