* [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
@ 2020-07-24 19:51 Shu-Chun Weng
  2020-07-27 18:51 ` Richard Henderson
  0 siblings, 1 reply; 4+ messages in thread
From: Shu-Chun Weng @ 2020-07-24 19:51 UTC (permalink / raw)
  To: richard.henderson, pbonzini, alex.bennee; +Cc: Shu-Chun Weng, qemu-devel

This change has no functional effect.

There is a potential link error with "undefined symbol:
qemu_build_not_reached" due to how `store_helper` is structured.
This does not reproduce at current QEMU head, but was reproducible at
v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.

The current function structure is:

    inline QEMU_ALWAYS_INLINE
    store_memop() {
        switch () {
            ...
        default:
            qemu_build_not_reached();
        }
    }
    inline QEMU_ALWAYS_INLINE
    store_helper() {
        ...
        if (span_two_pages_or_io) {
            ...
            helper_ret_stb_mmu();
        }
        store_memop();
    }
    helper_ret_stb_mmu() {
        store_helper();
    }

Both `store_memop` and `store_helper` need to be inlined and to allow
constant propagation in order to eliminate the `qemu_build_not_reached`
call. QEMU_ALWAYS_INLINE is a valiant effort to make that happen, but
inlining is still not guaranteed. With the failing combination, the
compiler instead inlines the `helper_ret_stb_mmu` call inside
`store_helper`, making the latter self-recursive, which prevents
constant propagation.
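
For context, the link error arises because `qemu_build_not_reached` ends
up as a declaration with no definition under this toolchain: the build
only links if the optimizer proves every call to it dead and deletes it,
which is exactly what the inlining and constant propagation above are
meant to guarantee. A minimal standalone sketch of the pattern follows
(illustrative names only, not QEMU's actual declarations):

    /* Declared but intentionally never defined: the final link succeeds
     * only if the compiler eliminates every call at build time. */
    extern void build_time_not_reached(void);

    static inline int memop_size_of(int op)
    {
        switch (op) {
        case 0:
            return 1;
        case 1:
            return 2;
        default:
            /* Only reachable if 'op' is not known at compile time. */
            build_time_not_reached();
            return 0;
        }
    }

    int byte_store_size(void)
    {
        /* 'op' is a constant, so the default branch folds away and no
         * reference to the undefined symbol reaches the object file. */
        return memop_size_of(0);
    }

If constant propagation fails -- for example because `store_helper`
becomes self-recursive as described above -- the call to the undefined
symbol survives and the link fails with this kind of "undefined symbol"
error.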

The new structure is:

    inline QEMU_ALWAYS_INLINE
    store_memop() {
        switch () {
            ...
        default:
            qemu_build_not_reached();
        }
    }
    inline QEMU_ALWAYS_INLINE
    store_helper_size_aligned() {
        ...
        if (span_two_pages_or_io) {
            return false;
        }
        store_memop();
        return true;
    }
    inline QEMU_ALWAYS_INLINE
    store_helper() {
        if (store_helper_size_aligned()) {
            return;
        }
        helper_ret_stb_mmu();
    }
    helper_ret_stb_mmu() {
        store_helper_size_aligned();
    }

Since a byte store cannot span more than one page, this is a no-op.
Moreover, there is no longer any recursion, which makes the code more
robust against potential compiler optimizations.

Signed-off-by: Shu-Chun Weng <scw@google.com>
---
v1: https://patchew.org/QEMU/20200318062057.224953-1-scw@google.com/

v1 -> v2:
  Restructure the function instead of marking `helper_ret_stb_mmu` noinline.

 accel/tcg/cputlb.c | 158 ++++++++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 66 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index d370aedb47..14324f08d2 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2010,9 +2010,10 @@ store_memop(void *haddr, uint64_t val, MemOp op)
     }
 }
 
-static inline void QEMU_ALWAYS_INLINE
-store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
-             TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
+/* Returns false if the store is not size-aligned and no store is done. */
+static inline bool QEMU_ALWAYS_INLINE
+store_helper_size_aligned(CPUArchState *env, target_ulong addr, uint64_t val,
+                          TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
 {
     uintptr_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
@@ -2048,7 +2049,7 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
 
         /* For anything that is unaligned, recurse through byte stores.  */
         if ((addr & (size - 1)) != 0) {
-            goto do_unaligned_access;
+            return false;
         }
 
         iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
@@ -2066,12 +2067,12 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
         if (tlb_addr & TLB_MMIO) {
             io_writex(env, iotlbentry, mmu_idx, val, addr, retaddr,
                       op ^ (need_swap * MO_BSWAP));
-            return;
+            return true;
         }
 
         /* Ignore writes to ROM.  */
         if (unlikely(tlb_addr & TLB_DISCARD_WRITE)) {
-            return;
+            return true;
         }
 
         /* Handle clean RAM pages.  */
@@ -2091,82 +2092,107 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
         } else {
             store_memop(haddr, val, op);
         }
-        return;
+        return true;
     }
 
-    /* Handle slow unaligned access (it spans two pages or IO).  */
     if (size > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
                      >= TARGET_PAGE_SIZE)) {
-        int i;
-        uintptr_t index2;
-        CPUTLBEntry *entry2;
-        target_ulong page2, tlb_addr2;
-        size_t size2;
+        /* Slow unaligned access (it spans two pages or IO).  */
+        return false;
+    }
 
-    do_unaligned_access:
-        /*
-         * Ensure the second page is in the TLB.  Note that the first page
-         * is already guaranteed to be filled, and that the second page
-         * cannot evict the first.
-         */
-        page2 = (addr + size) & TARGET_PAGE_MASK;
-        size2 = (addr + size) & ~TARGET_PAGE_MASK;
-        index2 = tlb_index(env, mmu_idx, page2);
-        entry2 = tlb_entry(env, mmu_idx, page2);
-        tlb_addr2 = tlb_addr_write(entry2);
-        if (!tlb_hit_page(tlb_addr2, page2)) {
-            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
-                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
-                         mmu_idx, retaddr);
-                index2 = tlb_index(env, mmu_idx, page2);
-                entry2 = tlb_entry(env, mmu_idx, page2);
-            }
-            tlb_addr2 = tlb_addr_write(entry2);
-        }
+    haddr = (void *)((uintptr_t)addr + entry->addend);
+    store_memop(haddr, val, op);
+    return true;
+}
 
-        /*
-         * Handle watchpoints.  Since this may trap, all checks
-         * must happen before any store.
-         */
-        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), page2, size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
+static inline void QEMU_ALWAYS_INLINE
+store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+             TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
+{
+    uintptr_t mmu_idx;
+    uintptr_t index;
+    CPUTLBEntry *entry;
+    target_ulong tlb_addr;
+    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+    size_t size;
+    int i;
+    uintptr_t index2;
+    CPUTLBEntry *entry2;
+    target_ulong page2, tlb_addr2;
+    size_t size2;
 
-        /*
-         * XXX: not efficient, but simple.
-         * This loop must go in the forward direction to avoid issues
-         * with self-modifying code in Windows 64-bit.
-         */
-        for (i = 0; i < size; ++i) {
-            uint8_t val8;
-            if (memop_big_endian(op)) {
-                /* Big-endian extract.  */
-                val8 = val >> (((size - 1) * 8) - (i * 8));
-            } else {
-                /* Little-endian extract.  */
-                val8 = val >> (i * 8);
-            }
-            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
-        }
+    if (store_helper_size_aligned(env, addr, val, oi, retaddr, op)) {
         return;
     }
 
-    haddr = (void *)((uintptr_t)addr + entry->addend);
-    store_memop(haddr, val, op);
+    mmu_idx = get_mmuidx(oi);
+
+    size = memop_size(op);
+    index = tlb_index(env, mmu_idx, addr);
+    entry = tlb_entry(env, mmu_idx, addr);
+    tlb_addr = tlb_addr_write(entry);
+
+    /* Handle slow unaligned access (it spans two pages or IO).  */
+    /*
+     * Ensure the second page is in the TLB.  Note that the first page
+     * is already guaranteed to be filled, and that the second page
+     * cannot evict the first.
+     */
+    page2 = (addr + size) & TARGET_PAGE_MASK;
+    size2 = (addr + size) & ~TARGET_PAGE_MASK;
+    index2 = tlb_index(env, mmu_idx, page2);
+    entry2 = tlb_entry(env, mmu_idx, page2);
+    tlb_addr2 = tlb_addr_write(entry2);
+    if (!tlb_hit_page(tlb_addr2, page2)) {
+        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
+            index2 = tlb_index(env, mmu_idx, page2);
+            entry2 = tlb_entry(env, mmu_idx, page2);
+        }
+        tlb_addr2 = tlb_addr_write(entry2);
+    }
+
+    /*
+     * Handle watchpoints.  Since this may trap, all checks
+     * must happen before any store.
+     */
+    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), page2, size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+
+    /*
+     * XXX: not efficient, but simple.
+     * This loop must go in the forward direction to avoid issues
+     * with self-modifying code in Windows 64-bit.
+     */
+    for (i = 0; i < size; ++i) {
+        uint8_t val8;
+        if (memop_big_endian(op)) {
+            /* Big-endian extract.  */
+            val8 = val >> (((size - 1) * 8) - (i * 8));
+        } else {
+            /* Little-endian extract.  */
+            val8 = val >> (i * 8);
+        }
+        helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+    }
 }
 
 void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
                         TCGMemOpIdx oi, uintptr_t retaddr)
 {
-    store_helper(env, addr, val, oi, retaddr, MO_UB);
+    /* Byte store is always size-aligned. */
+    store_helper_size_aligned(env, addr, val, oi, retaddr, MO_UB);
 }
 
 void helper_le_stw_mmu(CPUArchState *env, target_ulong addr, uint16_t val,
-- 
2.28.0.rc0.142.g3c755180ce-goog




* Re: [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
  2020-07-24 19:51 [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations Shu-Chun Weng
@ 2020-07-27 18:51 ` Richard Henderson
  2020-07-27 20:01   ` Shu-Chun Weng
  2020-07-28 14:56   ` Alex Bennée
  0 siblings, 2 replies; 4+ messages in thread
From: Richard Henderson @ 2020-07-27 18:51 UTC (permalink / raw)
  To: scw; +Cc: alex.bennee, qemu-devel

On 7/24/20 12:51 PM, Shu-Chun Weng wrote:
> This change has no functional effect.
> 
> There is a potential link error with "undefined symbol:
> qemu_build_not_reached" due to how `store_helper` is structured.
> This does not reproduce at current QEMU head, but was reproducible at
> v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.

Thanks for the hint -- so far I had not been able to reproduce the
problem with any of clang 10, 11, or head (12), with default options.

> The current function structure is:
> 
>     inline QEMU_ALWAYS_INLINE
>     store_memop() {
>         switch () {
>             ...
>         default:
>             qemu_build_not_reached();
>         }
>     }
>     inline QEMU_ALWAYS_INLINE
>     store_helper() {
>         ...
>         if (span_two_pages_or_io) {
>             ...
>             helper_ret_stb_mmu();
>         }
>         store_memop();
>     }
>     helper_ret_stb_mmu() {
>         store_helper();
>     }
...
> The new structure is:
> 
>     inline QEMU_ALWAYS_INLINE
>     store_memop() {
>         switch () {
>             ...
>         default:
>             qemu_build_not_reached();
>         }
>     }
>     inline QEMU_ALWAYS_INLINE
>     store_helper_size_aligned() {
>         ...
>         if (span_two_pages_or_io) {
>             return false;
>         }
>         store_memop();
>         return true;
>     }
>     inline QEMU_ALWAYS_INLINE
>     store_helper() {
>         if (store_helper_size_aligned()) {
>             return;
>         }
>         helper_ret_stb_mmu();
>     }
>     helper_ret_stb_mmu() {
>         store_helper_size_aligned();
>     }

Reasonable, I guess.

I did some experimenting though, and if I pull out the unaligned
portion into a noinline function, I can save about 6k code size.

Thoughts?


r~


diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 5698292749..7e603d6666 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2009,6 +2009,80 @@ store_memop(void *haddr, uint64_t val, MemOp op)
     }
 }
 
+static void __attribute__((noinline))
+store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
+                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
+                       bool big_endian)
+{
+    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
+    uintptr_t index, index2;
+    CPUTLBEntry *entry, *entry2;
+    target_ulong page2, tlb_addr, tlb_addr2;
+    TCGMemOpIdx oi;
+    size_t size2;
+    int i;
+
+    /*
+     * Ensure the second page is in the TLB.  Note that the first page
+     * is already guaranteed to be filled, and that the second page
+     * cannot evict the first.
+     */
+    page2 = (addr + size) & TARGET_PAGE_MASK;
+    size2 = (addr + size) & ~TARGET_PAGE_MASK;
+    index2 = tlb_index(env, mmu_idx, page2);
+    entry2 = tlb_entry(env, mmu_idx, page2);
+
+    tlb_addr2 = tlb_addr_write(entry2);
+    if (!tlb_hit_page(tlb_addr2, page2)) {
+        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
+            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
+                     mmu_idx, retaddr);
+            index2 = tlb_index(env, mmu_idx, page2);
+            entry2 = tlb_entry(env, mmu_idx, page2);
+        }
+        tlb_addr2 = tlb_addr_write(entry2);
+    }
+
+    index = tlb_index(env, mmu_idx, addr);
+    entry = tlb_entry(env, mmu_idx, addr);
+    tlb_addr = tlb_addr_write(entry);
+
+    /*
+     * Handle watchpoints.  Since this may trap, all checks
+     * must happen before any store.
+     */
+    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
+        cpu_check_watchpoint(env_cpu(env), page2, size2,
+                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
+                             BP_MEM_WRITE, retaddr);
+    }
+
+    /*
+     * XXX: not efficient, but simple.
+     * This loop must go in the forward direction to avoid issues
+     * with self-modifying code in Windows 64-bit.
+     */
+    oi = make_memop_idx(MO_UB, mmu_idx);
+    if (big_endian) {
+        for (i = 0; i < size; ++i) {
+            /* Big-endian extract.  */
+            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    } else {
+        for (i = 0; i < size; ++i) {
+            /* Little-endian extract.  */
+            uint8_t val8 = val >> (i * 8);
+            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
+        }
+    }
+}
+
 static inline void QEMU_ALWAYS_INLINE
 store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
              TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
@@ -2097,64 +2171,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     if (size > 1
         && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
                      >= TARGET_PAGE_SIZE)) {
-        int i;
-        uintptr_t index2;
-        CPUTLBEntry *entry2;
-        target_ulong page2, tlb_addr2;
-        size_t size2;
-
     do_unaligned_access:
-        /*
-         * Ensure the second page is in the TLB.  Note that the first page
-         * is already guaranteed to be filled, and that the second page
-         * cannot evict the first.
-         */
-        page2 = (addr + size) & TARGET_PAGE_MASK;
-        size2 = (addr + size) & ~TARGET_PAGE_MASK;
-        index2 = tlb_index(env, mmu_idx, page2);
-        entry2 = tlb_entry(env, mmu_idx, page2);
-        tlb_addr2 = tlb_addr_write(entry2);
-        if (!tlb_hit_page(tlb_addr2, page2)) {
-            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
-                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
-                         mmu_idx, retaddr);
-                index2 = tlb_index(env, mmu_idx, page2);
-                entry2 = tlb_entry(env, mmu_idx, page2);
-            }
-            tlb_addr2 = tlb_addr_write(entry2);
-        }
-
-        /*
-         * Handle watchpoints.  Since this may trap, all checks
-         * must happen before any store.
-         */
-        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
-            cpu_check_watchpoint(env_cpu(env), page2, size2,
-                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
-                                 BP_MEM_WRITE, retaddr);
-        }
-
-        /*
-         * XXX: not efficient, but simple.
-         * This loop must go in the forward direction to avoid issues
-         * with self-modifying code in Windows 64-bit.
-         */
-        for (i = 0; i < size; ++i) {
-            uint8_t val8;
-            if (memop_big_endian(op)) {
-                /* Big-endian extract.  */
-                val8 = val >> (((size - 1) * 8) - (i * 8));
-            } else {
-                /* Little-endian extract.  */
-                val8 = val >> (i * 8);
-            }
-            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
-        }
+        store_helper_unaligned(env, addr, val, retaddr, size,
+                               mmu_idx, memop_big_endian(op));
         return;
     }
 
@@ -2162,8 +2181,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
     store_memop(haddr, val, op);
 }
 
-void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
-                        TCGMemOpIdx oi, uintptr_t retaddr)
+void __attribute__((noinline))
+helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
+                   TCGMemOpIdx oi, uintptr_t retaddr)
 {
     store_helper(env, addr, val, oi, retaddr, MO_UB);
 }
-- 
2.25.1




* Re: [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
  2020-07-27 18:51 ` Richard Henderson
@ 2020-07-27 20:01   ` Shu-Chun Weng
  2020-07-28 14:56   ` Alex Bennée
  1 sibling, 0 replies; 4+ messages in thread
From: Shu-Chun Weng @ 2020-07-27 20:01 UTC (permalink / raw)
  To: Richard Henderson; +Cc: alex.bennee, qemu-devel


That sounds like a much better solution -- I'm all for code size reduction.
I still don't understand the function well enough to feel confident about a
more invasive change, hence the no-op change here.

I'd be happy to see the unaligned access outlined as you suggested.

Shu-Chun

On Mon, Jul 27, 2020 at 11:51 AM Richard Henderson <richard.henderson@linaro.org> wrote:

> On 7/24/20 12:51 PM, Shu-Chun Weng wrote:
> > This change has no functional effect.
> >
> > There is a potential link error with "undefined symbol:
> > qemu_build_not_reached" due to how `store_helper` is structured.
> > This does not reproduce at current QEMU head, but was reproducible at
> > v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.
>
> Thanks for the hint -- so far I had not been able to reproduce the
> problem with any of clang 10, 11, or head (12), with default options.
>
> > The current function structure is:
> >
> >     inline QEMU_ALWAYS_INLINE
> >     store_memop() {
> >         switch () {
> >             ...
> >         default:
> >             qemu_build_not_reached();
> >         }
> >     }
> >     inline QEMU_ALWAYS_INLINE
> >     store_helper() {
> >         ...
> >         if (span_two_pages_or_io) {
> >             ...
> >             helper_ret_stb_mmu();
> >         }
> >         store_memop();
> >     }
> >     helper_ret_stb_mmu() {
> >         store_helper();
> >     }
> ...
> > The new structure is:
> >
> >     inline QEMU_ALWAYS_INLINE
> >     store_memop() {
> >         switch () {
> >             ...
> >         default:
> >             qemu_build_not_reached();
> >         }
> >     }
> >     inline QEMU_ALWAYS_INLINE
> >     store_helper_size_aligned() {
> >         ...
> >         if (span_two_pages_or_io) {
> >             return false;
> >         }
> >         store_memop();
> >         return true;
> >     }
> >     inline QEMU_ALWAYS_INLINE
> >     store_helper() {
> >         if (store_helper_size_aligned()) {
> >             return;
> >         }
> >         helper_ret_stb_mmu();
> >     }
> >     helper_ret_stb_mmu() {
> >         store_helper_size_aligned();
> >     }
>
> Reasonable, I guess.
>
> I did some experimenting though, and if I pull out the unaligned
> portion into a noinline function, I can save about 6k code size.
>
> Thoughts?
>
>
> r~
>
>
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 5698292749..7e603d6666 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -2009,6 +2009,80 @@ store_memop(void *haddr, uint64_t val, MemOp op)
>      }
>  }
>
> +static void __attribute__((noinline))
> +store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
> +                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
> +                       bool big_endian)
> +{
> +    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
> +    uintptr_t index, index2;
> +    CPUTLBEntry *entry, *entry2;
> +    target_ulong page2, tlb_addr, tlb_addr2;
> +    TCGMemOpIdx oi;
> +    size_t size2;
> +    int i;
> +
> +    /*
> +     * Ensure the second page is in the TLB.  Note that the first page
> +     * is already guaranteed to be filled, and that the second page
> +     * cannot evict the first.
> +     */
> +    page2 = (addr + size) & TARGET_PAGE_MASK;
> +    size2 = (addr + size) & ~TARGET_PAGE_MASK;
> +    index2 = tlb_index(env, mmu_idx, page2);
> +    entry2 = tlb_entry(env, mmu_idx, page2);
> +
> +    tlb_addr2 = tlb_addr_write(entry2);
> +    if (!tlb_hit_page(tlb_addr2, page2)) {
> +        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> +            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> +                     mmu_idx, retaddr);
> +            index2 = tlb_index(env, mmu_idx, page2);
> +            entry2 = tlb_entry(env, mmu_idx, page2);
> +        }
> +        tlb_addr2 = tlb_addr_write(entry2);
> +    }
> +
> +    index = tlb_index(env, mmu_idx, addr);
> +    entry = tlb_entry(env, mmu_idx, addr);
> +    tlb_addr = tlb_addr_write(entry);
> +
> +    /*
> +     * Handle watchpoints.  Since this may trap, all checks
> +     * must happen before any store.
> +     */
> +    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), page2, size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +
> +    /*
> +     * XXX: not efficient, but simple.
> +     * This loop must go in the forward direction to avoid issues
> +     * with self-modifying code in Windows 64-bit.
> +     */
> +    oi = make_memop_idx(MO_UB, mmu_idx);
> +    if (big_endian) {
> +        for (i = 0; i < size; ++i) {
> +            /* Big-endian extract.  */
> +            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    } else {
> +        for (i = 0; i < size; ++i) {
> +            /* Little-endian extract.  */
> +            uint8_t val8 = val >> (i * 8);
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    }
> +}
> +
>  static inline void QEMU_ALWAYS_INLINE
>  store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>               TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
> @@ -2097,64 +2171,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      if (size > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -        uintptr_t index2;
> -        CPUTLBEntry *entry2;
> -        target_ulong page2, tlb_addr2;
> -        size_t size2;
> -
>      do_unaligned_access:
> -        /*
> -         * Ensure the second page is in the TLB.  Note that the first page
> -         * is already guaranteed to be filled, and that the second page
> -         * cannot evict the first.
> -         */
> -        page2 = (addr + size) & TARGET_PAGE_MASK;
> -        size2 = (addr + size) & ~TARGET_PAGE_MASK;
> -        index2 = tlb_index(env, mmu_idx, page2);
> -        entry2 = tlb_entry(env, mmu_idx, page2);
> -        tlb_addr2 = tlb_addr_write(entry2);
> -        if (!tlb_hit_page(tlb_addr2, page2)) {
> -            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> -                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> -                         mmu_idx, retaddr);
> -                index2 = tlb_index(env, mmu_idx, page2);
> -                entry2 = tlb_entry(env, mmu_idx, page2);
> -            }
> -            tlb_addr2 = tlb_addr_write(entry2);
> -        }
> -
> -        /*
> -         * Handle watchpoints.  Since this may trap, all checks
> -         * must happen before any store.
> -         */
> -        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), page2, size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -
> -        /*
> -         * XXX: not efficient, but simple.
> -         * This loop must go in the forward direction to avoid issues
> -         * with self-modifying code in Windows 64-bit.
> -         */
> -        for (i = 0; i < size; ++i) {
> -            uint8_t val8;
> -            if (memop_big_endian(op)) {
> -                /* Big-endian extract.  */
> -                val8 = val >> (((size - 1) * 8) - (i * 8));
> -            } else {
> -                /* Little-endian extract.  */
> -                val8 = val >> (i * 8);
> -            }
> -            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> -        }
> +        store_helper_unaligned(env, addr, val, retaddr, size,
> +                               mmu_idx, memop_big_endian(op));
>          return;
>      }
>
> @@ -2162,8 +2181,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      store_memop(haddr, val, op);
>  }
>
> -void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> -                        TCGMemOpIdx oi, uintptr_t retaddr)
> +void __attribute__((noinline))
> +helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> +                   TCGMemOpIdx oi, uintptr_t retaddr)
>  {
>      store_helper(env, addr, val, oi, retaddr, MO_UB);
>  }
> --
> 2.25.1
>
>


* Re: [PATCH v2] cputlb: Make store_helper less fragile to compiler optimizations
  2020-07-27 18:51 ` Richard Henderson
  2020-07-27 20:01   ` Shu-Chun Weng
@ 2020-07-28 14:56   ` Alex Bennée
  1 sibling, 0 replies; 4+ messages in thread
From: Alex Bennée @ 2020-07-28 14:56 UTC (permalink / raw)
  To: Richard Henderson; +Cc: scw, qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> On 7/24/20 12:51 PM, Shu-Chun Weng wrote:
>> This change has no functional effect.
>> 
>> There is a potential link error with "undefined symbol:
>> qemu_build_not_reached" due to how `store_helper` is structured.
>> This does not reproduce at current QEMU head, but was reproducible at
>> v4.2.0 with `clang-10 -O2 -fexperimental-new-pass-manager`.
>
> Thanks for the hint -- so far I had not been able to reproduce the
> problem with any of clang 10, 11, or head (12), with default options.
>
>> The current function structure is:
>> 
>>     inline QEMU_ALWAYS_INLINE
>>     store_memop() {
>>         switch () {
>>             ...
>>         default:
>>             qemu_build_not_reached();
>>         }
>>     }
>>     inline QEMU_ALWAYS_INLINE
>>     store_helper() {
>>         ...
>>         if (span_two_pages_or_io) {
>>             ...
>>             helper_ret_stb_mmu();
>>         }
>>         store_memop();
>>     }
>>     helper_ret_stb_mmu() {
>>         store_helper();
>>     }
> ...
>> The new structure is:
>> 
>>     inline QEMU_ALWAYS_INLINE
>>     store_memop() {
>>         switch () {
>>             ...
>>         default:
>>             qemu_build_not_reached();
>>         }
>>     }
>>     inline QEMU_ALWAYS_INLINE
>>     store_helper_size_aligned() {
>>         ...
>>         if (span_two_pages_or_io) {
>>             return false;
>>         }
>>         store_memop();
>>         return true;
>>     }
>>     inline QEMU_ALWAYS_INLINE
>>     store_helper() {
>>         if (store_helper_size_aligned()) {
>>             return;
>>         }
>>         helper_ret_stb_mmu();
>>     }
>>     helper_ret_stb_mmu() {
>>         store_helper_size_aligned();
>>     }
>
> Reasonable, I guess.
>
> I did some experimenting though, and if I pull out the unaligned
> portion into a noinline function, I can save about 6k code size.
>
> Thoughts?

I think on balance I prefer having the unaligned helper out of line.
AFAICT they both perform just as well and it must surely help to have
the uncommon case both out of the hotpath and shared with the other
implementations.

For this version:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

>
>
> r~
>
>
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 5698292749..7e603d6666 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -2009,6 +2009,80 @@ store_memop(void *haddr, uint64_t val, MemOp op)
>      }
>  }
>  
> +static void __attribute__((noinline))
> +store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
> +                       uintptr_t retaddr, size_t size, uintptr_t mmu_idx,
> +                       bool big_endian)
> +{
> +    const size_t tlb_off = offsetof(CPUTLBEntry, addr_write);
> +    uintptr_t index, index2;
> +    CPUTLBEntry *entry, *entry2;
> +    target_ulong page2, tlb_addr, tlb_addr2;
> +    TCGMemOpIdx oi;
> +    size_t size2;
> +    int i;
> +
> +    /*
> +     * Ensure the second page is in the TLB.  Note that the first page
> +     * is already guaranteed to be filled, and that the second page
> +     * cannot evict the first.
> +     */
> +    page2 = (addr + size) & TARGET_PAGE_MASK;
> +    size2 = (addr + size) & ~TARGET_PAGE_MASK;
> +    index2 = tlb_index(env, mmu_idx, page2);
> +    entry2 = tlb_entry(env, mmu_idx, page2);
> +
> +    tlb_addr2 = tlb_addr_write(entry2);
> +    if (!tlb_hit_page(tlb_addr2, page2)) {
> +        if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> +            tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> +                     mmu_idx, retaddr);
> +            index2 = tlb_index(env, mmu_idx, page2);
> +            entry2 = tlb_entry(env, mmu_idx, page2);
> +        }
> +        tlb_addr2 = tlb_addr_write(entry2);
> +    }
> +
> +    index = tlb_index(env, mmu_idx, addr);
> +    entry = tlb_entry(env, mmu_idx, addr);
> +    tlb_addr = tlb_addr_write(entry);
> +
> +    /*
> +     * Handle watchpoints.  Since this may trap, all checks
> +     * must happen before any store.
> +     */
> +    if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +    if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> +        cpu_check_watchpoint(env_cpu(env), page2, size2,
> +                             env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> +                             BP_MEM_WRITE, retaddr);
> +    }
> +
> +    /*
> +     * XXX: not efficient, but simple.
> +     * This loop must go in the forward direction to avoid issues
> +     * with self-modifying code in Windows 64-bit.
> +     */
> +    oi = make_memop_idx(MO_UB, mmu_idx);
> +    if (big_endian) {
> +        for (i = 0; i < size; ++i) {
> +            /* Big-endian extract.  */
> +            uint8_t val8 = val >> (((size - 1) * 8) - (i * 8));
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    } else {
> +        for (i = 0; i < size; ++i) {
> +            /* Little-endian extract.  */
> +            uint8_t val8 = val >> (i * 8);
> +            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> +        }
> +    }
> +}
> +
>  static inline void QEMU_ALWAYS_INLINE
>  store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>               TCGMemOpIdx oi, uintptr_t retaddr, MemOp op)
> @@ -2097,64 +2171,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      if (size > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + size - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -        uintptr_t index2;
> -        CPUTLBEntry *entry2;
> -        target_ulong page2, tlb_addr2;
> -        size_t size2;
> -
>      do_unaligned_access:
> -        /*
> -         * Ensure the second page is in the TLB.  Note that the first page
> -         * is already guaranteed to be filled, and that the second page
> -         * cannot evict the first.
> -         */
> -        page2 = (addr + size) & TARGET_PAGE_MASK;
> -        size2 = (addr + size) & ~TARGET_PAGE_MASK;
> -        index2 = tlb_index(env, mmu_idx, page2);
> -        entry2 = tlb_entry(env, mmu_idx, page2);
> -        tlb_addr2 = tlb_addr_write(entry2);
> -        if (!tlb_hit_page(tlb_addr2, page2)) {
> -            if (!victim_tlb_hit(env, mmu_idx, index2, tlb_off, page2)) {
> -                tlb_fill(env_cpu(env), page2, size2, MMU_DATA_STORE,
> -                         mmu_idx, retaddr);
> -                index2 = tlb_index(env, mmu_idx, page2);
> -                entry2 = tlb_entry(env, mmu_idx, page2);
> -            }
> -            tlb_addr2 = tlb_addr_write(entry2);
> -        }
> -
> -        /*
> -         * Handle watchpoints.  Since this may trap, all checks
> -         * must happen before any store.
> -         */
> -        if (unlikely(tlb_addr & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), addr, size - size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -        if (unlikely(tlb_addr2 & TLB_WATCHPOINT)) {
> -            cpu_check_watchpoint(env_cpu(env), page2, size2,
> -                                 env_tlb(env)->d[mmu_idx].iotlb[index2].attrs,
> -                                 BP_MEM_WRITE, retaddr);
> -        }
> -
> -        /*
> -         * XXX: not efficient, but simple.
> -         * This loop must go in the forward direction to avoid issues
> -         * with self-modifying code in Windows 64-bit.
> -         */
> -        for (i = 0; i < size; ++i) {
> -            uint8_t val8;
> -            if (memop_big_endian(op)) {
> -                /* Big-endian extract.  */
> -                val8 = val >> (((size - 1) * 8) - (i * 8));
> -            } else {
> -                /* Little-endian extract.  */
> -                val8 = val >> (i * 8);
> -            }
> -            helper_ret_stb_mmu(env, addr + i, val8, oi, retaddr);
> -        }
> +        store_helper_unaligned(env, addr, val, retaddr, size,
> +                               mmu_idx, memop_big_endian(op));
>          return;
>      }
>  
> @@ -2162,8 +2181,9 @@ store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
>      store_memop(haddr, val, op);
>  }
>  
> -void helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> -                        TCGMemOpIdx oi, uintptr_t retaddr)
> +void __attribute__((noinline))
> +helper_ret_stb_mmu(CPUArchState *env, target_ulong addr, uint8_t val,
> +                   TCGMemOpIdx oi, uintptr_t retaddr)
>  {
>      store_helper(env, addr, val, oi, retaddr, MO_UB);
>  }


-- 
Alex Bennée


