* [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates
@ 2018-12-13 23:58 Benjamin Herrenschmidt
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg Benjamin Herrenschmidt
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-13 23:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson, Paolo Bonzini, Benjamin Herrenschmidt

On some architectures, PTE updates for dirty and changed bits need
to be performed atomically. This adds a couple of address_space_cmpxchg*
helpers for that purpose.
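
As a rough sketch of the intended call pattern (based on how the i386 patch in
this series uses the 32-bit helper; illustrative only, not part of this change,
and PG_ACCESSED_MASK is borrowed from x86 purely as an example bit):

    MemTxResult result;
    uint32_t old_le = cpu_to_le32(pte);
    uint32_t new_le = cpu_to_le32(pte | PG_ACCESSED_MASK);
    uint32_t ret;

    ret = address_space_cmpxchgl_notdirty(cs->as, pte_addr, old_le, new_le,
                                          MEMTXATTRS_UNSPECIFIED, &result);
    if (result != MEMTX_OK) {
        /* PTE not in plain RAM, fall back to a non-atomic update */
    } else if (ret != old_le) {
        /* lost the race with another vCPU, re-read the PTE and retry */
    }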

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 include/exec/memory_ldst.inc.h |  6 +++
 memory_ldst.inc.c              | 78 ++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/include/exec/memory_ldst.inc.h b/include/exec/memory_ldst.inc.h
index 272c20f02e..f3cfa7e9a6 100644
--- a/include/exec/memory_ldst.inc.h
+++ b/include/exec/memory_ldst.inc.h
@@ -28,6 +28,12 @@ extern uint64_t glue(address_space_ldq, SUFFIX)(ARG1_DECL,
     hwaddr addr, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
+extern uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs,
+    MemTxResult *result);
+extern uint32_t glue(address_space_cmpxchgq_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint64_t old, uint64_t new, MemTxAttrs attrs,
+    MemTxResult *result);
 extern void glue(address_space_stw, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result);
 extern void glue(address_space_stl, SUFFIX)(ARG1_DECL,
diff --git a/memory_ldst.inc.c b/memory_ldst.inc.c
index acf865b900..7ab6de37ba 100644
--- a/memory_ldst.inc.c
+++ b/memory_ldst.inc.c
@@ -320,6 +320,84 @@ void glue(address_space_stl_notdirty, SUFFIX)(ARG1_DECL,
     RCU_READ_UNLOCK();
 }
 
+/* This is meant to be used for atomic PTE updates under MT-TCG */
+uint32_t glue(address_space_cmpxchgl_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint32_t old, uint32_t new, MemTxAttrs attrs, MemTxResult *result)
+{
+    uint8_t *ptr;
+    MemoryRegion *mr;
+    hwaddr l = 4;
+    hwaddr addr1;
+    MemTxResult r;
+    uint8_t dirty_log_mask;
+
+    /* Must test result */
+    assert(result);
+
+    RCU_READ_LOCK();
+    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
+    if (l < 4 || !memory_access_is_direct(mr, true)) {
+        r = MEMTX_ERROR;
+    } else {
+        uint32_t orig = old;
+
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+        old = atomic_cmpxchg(ptr, orig, new);
+
+        if (old == orig) {
+            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                                4, dirty_log_mask);
+        }
+        r = MEMTX_OK;
+    }
+    *result = r;
+    RCU_READ_UNLOCK();
+
+    return old;
+}
+
+#ifdef CONFIG_ATOMIC64
+/* This is meant to be used for atomic PTE updates under MT-TCG */
+uint32_t glue(address_space_cmpxchgq_notdirty, SUFFIX)(ARG1_DECL,
+    hwaddr addr, uint64_t old, uint64_t new, MemTxAttrs attrs, MemTxResult *result)
+{
+    uint8_t *ptr;
+    MemoryRegion *mr;
+    hwaddr l = 8;
+    hwaddr addr1;
+    MemTxResult r;
+    uint8_t dirty_log_mask;
+
+    /* Must test result */
+    assert(result);
+
+    RCU_READ_LOCK();
+    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
+    if (l < 8 || !memory_access_is_direct(mr, true)) {
+        r = MEMTX_ERROR;
+    } else {
+        uint32_t orig = old;
+
+        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
+        old = atomic_cmpxchg(ptr, orig, new);
+
+        if (old == orig) {
+            dirty_log_mask = memory_region_get_dirty_log_mask(mr);
+            dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
+            cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr,
+                                                8, dirty_log_mask);
+        }
+        r = MEMTX_OK;
+    }
+    *result = r;
+    RCU_READ_UNLOCK();
+
+    return old;
+}
+#endif /* CONFIG_ATOMIC64 */
+
 /* warning: addr must be aligned */
 static inline void glue(address_space_stl_internal, SUFFIX)(ARG1_DECL,
     hwaddr addr, uint32_t val, MemTxAttrs attrs,
-- 
2.19.2


* [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg
  2018-12-13 23:58 [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Benjamin Herrenschmidt
@ 2018-12-13 23:58 ` Benjamin Herrenschmidt
  2018-12-14  0:05   ` Benjamin Herrenschmidt
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 3/3] ppc: Fix radix RC updates Benjamin Herrenschmidt
  2018-12-14  3:01 ` [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Richard Henderson
  2 siblings, 1 reply; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-13 23:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson, Paolo Bonzini, Benjamin Herrenschmidt

Afaik, this isn't well documented (at least it wasn't when I last looked)
but OSes such as Linux rely on this behaviour:

The HW updates to the page tables need to be done atomically with the
checking of the present bit (and other permissions).

This is what allows Linux to do simple xchg of PTEs with 0 and assume
the value read has "final" stable dirty and accessed bits (the TLB
invalidation is deferred).
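
As a hedged illustration of that guest-side pattern (a sketch only, not actual
kernel code; xchg(), pte_dirty() and set_page_dirty() are used loosely here):

    pte_t old = xchg(ptep, 0);   /* atomically clear the PTE                  */
    if (pte_dirty(old)) {        /* the accessed/dirty bits read here must    */
        set_page_dirty(page);    /* already be final, no HW update may land   */
    }                            /* after the xchg; the TLB flush is deferred */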

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 target/i386/excp_helper.c | 104 +++++++++++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 24 deletions(-)

diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
index 49231f6b69..93fc24c011 100644
--- a/target/i386/excp_helper.c
+++ b/target/i386/excp_helper.c
@@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
 #else
 
+static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
+                                    uint64_t orig_entry, uint32_t bits)
+{
+    uint64_t new_entry = orig_entry | bits;
+
+    /* Write the updated bottom 32-bits */
+    if (qemu_tcg_mttcg_enabled()) {
+        uint32_t old_le = cpu_to_le32(orig_entry);
+        uint32_t new_le = cpu_to_le32(new_entry);
+        MemTxResult result;
+        uint32_t old_ret;
+
+        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
+                                                  old_le, new_le,
+                                                  MEMTXATTRS_UNSPECIFIED,
+                                                  &result);
+        if (result == MEMTX_OK) {
+            if (old_ret != old_le && old_ret != new_le) {
+                new_entry = 0;
+            }
+            return new_entry;
+        }
+
+        /* Do we need to support this case where PTEs aren't in RAM ?
+         *
+         * For now fallback to non-atomic case
+         */
+    }
+
+    x86_stl_phys_notdirty(cs, addr, new_entry);
+
+    return new_entry;
+}
+
 static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
                         int *prot)
 {
     CPUX86State *env = &X86_CPU(cs)->env;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint64_t ptep, pte;
     uint64_t exit_info_1 = 0;
     target_ulong pde_addr, pte_addr;
@@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         return gphys;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
                 goto do_fault_rsvd;
             }
             if (!(pml4e & PG_ACCESSED_MASK)) {
-                pml4e |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+                if (!pml4e) {
+                    goto restart;
+                }
             }
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
@@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
             }
             ptep &= pdpe ^ PG_NX_MASK;
             if (!(pdpe & PG_ACCESSED_MASK)) {
-                pdpe |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+                if (!pdpe) {
+                    goto restart;
+                }
             }
             if (pdpe & PG_PSE_MASK) {
                 /* 1 GB page */
@@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
         pte = x86_ldq_phys(cs, pte_addr);
@@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
     int error_code = 0;
     int is_dirty, prot, page_size, is_write, is_user;
     hwaddr paddr;
-    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t rsvd_mask;
     uint32_t page_offset;
     target_ulong vaddr;
 
@@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         goto do_mapping;
     }
 
+ restart:
+    rsvd_mask = PG_HI_RSVD_MASK;
     if (!(env->efer & MSR_EFER_NXE)) {
         rsvd_mask |= PG_NX_MASK;
     }
@@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
                     goto do_fault_rsvd;
                 }
                 if (!(pml5e & PG_ACCESSED_MASK)) {
-                    pml5e |= PG_ACCESSED_MASK;
-                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
+                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
+                    if (!pml5e) {
+                        goto restart;
+                    }
                 }
                 ptep = pml5e ^ PG_NX_MASK;
             } else {
@@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
                 goto do_fault_rsvd;
             }
             if (!(pml4e & PG_ACCESSED_MASK)) {
-                pml4e |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
+                if (!pml4e) {
+                    goto restart;
+                }
             }
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
@@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             }
             ptep &= pdpe ^ PG_NX_MASK;
             if (!(pdpe & PG_ACCESSED_MASK)) {
-                pdpe |= PG_ACCESSED_MASK;
-                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
+                if (!pdpe) {
+                    goto restart;
+                }
             }
             if (pdpe & PG_PSE_MASK) {
                 /* 1 GB page */
@@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         /* 4 KB page */
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
         pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
             a20_mask;
@@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
 
         if (!(pde & PG_ACCESSED_MASK)) {
-            pde |= PG_ACCESSED_MASK;
-            x86_stl_phys_notdirty(cs, pde_addr, pde);
+            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
+            if (!pde) {
+                goto restart;
+            }
         }
 
         /* page directory entry */
@@ -634,11 +690,11 @@ do_check_protect_pse36:
     /* yes, it can! */
     is_dirty = is_write && !(pte & PG_DIRTY_MASK);
     if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
-        pte |= PG_ACCESSED_MASK;
-        if (is_dirty) {
-            pte |= PG_DIRTY_MASK;
+        pte = update_entry(cs, pte_addr, pte,
+                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
+        if (!pte) {
+            goto restart;
         }
-        x86_stl_phys_notdirty(cs, pte_addr, pte);
     }
 
     if (!(pte & PG_DIRTY_MASK)) {
-- 
2.19.2


* [Qemu-devel] [PATCH 3/3] ppc: Fix radix RC updates
  2018-12-13 23:58 [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Benjamin Herrenschmidt
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg Benjamin Herrenschmidt
@ 2018-12-13 23:58 ` Benjamin Herrenschmidt
  2018-12-14  0:03   ` Benjamin Herrenschmidt
  2018-12-14  3:01 ` [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Richard Henderson
  2 siblings, 1 reply; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-13 23:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson, Paolo Bonzini, Benjamin Herrenschmidt

They should be atomic for MTTCG. Note: a real POWER9 core doesn't actually
implement atomic PTE updates; it always faults so that SW can handle it. Only
the nest MMU (used by some accelerator devices and GPUs) implements
those HW updates.

However, the architecture does allow the core to do it, and doing so
in TCG is faster than letting the guest do it.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 target/ppc/cpu.h         |  1 +
 target/ppc/mmu-radix64.c | 70 +++++++++++++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index ab68abe8a2..afdef2af2f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -493,6 +493,7 @@ struct ppc_slb_t {
 #define DSISR_AMR                0x00200000
 /* Unsupported Radix Tree Configuration */
 #define DSISR_R_BADCONFIG        0x00080000
+#define DSISR_ATOMIC_RC          0x00040000
 
 /* SRR1 error code fields */
 
diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index ab76cbc835..dba95aabdc 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -28,6 +28,15 @@
 #include "mmu-radix64.h"
 #include "mmu-book3s-v3.h"
 
+static inline bool ppc_radix64_hw_rc_updates(CPUPPCState *env)
+{
+#ifdef CONFIG_ATOMIC64
+    return true;
+#else
+    return !qemu_tcg_mttcg_enabled();
+#endif
+}
+
 static bool ppc_radix64_get_fully_qualified_addr(CPUPPCState *env, vaddr eaddr,
                                                  uint64_t *lpid, uint64_t *pid)
 {
@@ -120,11 +129,18 @@ static bool ppc_radix64_check_prot(PowerPCCPU *cpu, int rwx, uint64_t pte,
         return true;
     }
 
+    /* Check RC bits if necessary */
+    if (!ppc_radix64_hw_rc_updates(env)) {
+        if (!(pte & R_PTE_R) || ((rwx == 1) && !(pte & R_PTE_C))) {
+            *fault_cause |= DSISR_ATOMIC_RC;
+            return true;
+        }
+    }
+
     return false;
 }
 
-static void ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte,
-                               hwaddr pte_addr, int *prot)
+static uint64_t ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte, hwaddr pte_addr)
 {
     CPUState *cs = CPU(cpu);
     uint64_t npte;
@@ -133,17 +149,38 @@ static void ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte,
 
     if (rwx == 1) { /* Store/Write */
         npte |= R_PTE_C; /* Set change bit */
-    } else {
-        /*
-         * Treat the page as read-only for now, so that a later write
-         * will pass through this function again to set the C bit.
-         */
-        *prot &= ~PAGE_WRITE;
     }
+    if (pte == npte) {
+        return pte;
+    }
+
+#ifdef CONFIG_ATOMIC64
+    if (qemu_tcg_mttcg_enabled()) {
+        uint64_t old_be = cpu_to_be32(pte);
+        uint64_t new_be = cpu_to_be32(npte);
+        MemTxResult result;
+        uint64_t old_ret;
+
+        old_ret = address_space_cmpxchgq_notdirty(cs->as, pte_addr,
+                                                  old_be, new_be,
+                                                  MEMTXATTRS_UNSPECIFIED,
+                                                  &result);
+        if (result == MEMTX_OK) {
+            if (old_ret != old_be && old_ret != new_be) {
+                return 0;
+            }
+            return npte;
+        }
 
-    if (pte ^ npte) { /* If pte has changed then write it back */
-        stq_phys(cs->as, pte_addr, npte);
+        /* Do we need to support this case where PTEs aren't in RAM ?
+         *
+         * For now fallback to non-atomic case
+         */
     }
+#endif
+
+    stq_phys(cs->as, pte_addr, npte);
+    return npte;
 }
 
 static uint64_t ppc_radix64_walk_tree(PowerPCCPU *cpu, vaddr eaddr,
@@ -234,6 +271,7 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx,
 
     /* Walk Radix Tree from Process Table Entry to Convert EA to RA */
     page_size = PRTBE_R_GET_RTS(prtbe0);
+ restart:
     pte = ppc_radix64_walk_tree(cpu, eaddr & R_EADDR_MASK,
                                 prtbe0 & PRTBE_R_RPDB, prtbe0 & PRTBE_R_RPDS,
                                 &raddr, &page_size, &fault_cause, &pte_addr);
@@ -244,8 +282,16 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx,
     }
 
     /* Update Reference and Change Bits */
-    ppc_radix64_set_rc(cpu, rwx, pte, pte_addr, &prot);
-
+    if (ppc_radix64_hw_rc_updates(env)) {
+        pte = ppc_radix64_set_rc(cpu, rwx, pte, pte_addr);
+        if (!pte) {
+            goto restart;
+        }
+    }
+    /* If the page doesn't have C, treat it as read only */
+    if (!(pte & R_PTE_C)) {
+        prot &= ~PAGE_WRITE;
+    }
     tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK,
                  prot, mmu_idx, 1UL << page_size);
     return 0;
-- 
2.19.2


* Re: [Qemu-devel] [PATCH 3/3] ppc: Fix radix RC updates
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 3/3] ppc: Fix radix RC updates Benjamin Herrenschmidt
@ 2018-12-14  0:03   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-14  0:03 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson, Paolo Bonzini

On Fri, 2018-12-14 at 10:58 +1100, Benjamin Herrenschmidt wrote:
> They should be atomic for MTTCG. Note: a real POWER9 core doesn't actually
> implement atomic PTE updates; it always faults so that SW can handle it. Only
> the nest MMU (used by some accelerator devices and GPUs) implements
> those HW updates.
> 
> However, the architecture does allow the core to do it, and doing so
> in TCG is faster than letting the guest do it.

Note: the ppc hash MMU needs fixes too, but of a different nature. I have
some queued up as well; as-is they are entangled with other ppc target
fixes, so I'll send them separately via David.

Cheers,
Ben.

> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  target/ppc/cpu.h         |  1 +
>  target/ppc/mmu-radix64.c | 70 +++++++++++++++++++++++++++++++++-------
>  2 files changed, 59 insertions(+), 12 deletions(-)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index ab68abe8a2..afdef2af2f 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -493,6 +493,7 @@ struct ppc_slb_t {
>  #define DSISR_AMR                0x00200000
>  /* Unsupported Radix Tree Configuration */
>  #define DSISR_R_BADCONFIG        0x00080000
> +#define DSISR_ATOMIC_RC          0x00040000
>  
>  /* SRR1 error code fields */
>  
> diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
> index ab76cbc835..dba95aabdc 100644
> --- a/target/ppc/mmu-radix64.c
> +++ b/target/ppc/mmu-radix64.c
> @@ -28,6 +28,15 @@
>  #include "mmu-radix64.h"
>  #include "mmu-book3s-v3.h"
>  
> +static inline bool ppc_radix64_hw_rc_updates(CPUPPCState *env)
> +{
> +#ifdef CONFIG_ATOMIC64
> +    return true;
> +#else
> +    return !qemu_tcg_mttcg_enabled();
> +#endif
> +}
> +
>  static bool ppc_radix64_get_fully_qualified_addr(CPUPPCState *env, vaddr eaddr,
>                                                   uint64_t *lpid, uint64_t *pid)
>  {
> @@ -120,11 +129,18 @@ static bool ppc_radix64_check_prot(PowerPCCPU *cpu, int rwx, uint64_t pte,
>          return true;
>      }
>  
> +    /* Check RC bits if necessary */
> +    if (!ppc_radix64_hw_rc_updates(env)) {
> +        if (!(pte & R_PTE_R) || ((rwx == 1) && !(pte & R_PTE_C))) {
> +            *fault_cause |= DSISR_ATOMIC_RC;
> +            return true;
> +        }
> +    }
> +
>      return false;
>  }
>  
> -static void ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte,
> -                               hwaddr pte_addr, int *prot)
> +static uint64_t ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte, hwaddr pte_addr)
>  {
>      CPUState *cs = CPU(cpu);
>      uint64_t npte;
> @@ -133,17 +149,38 @@ static void ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, uint64_t pte,
>  
>      if (rwx == 1) { /* Store/Write */
>          npte |= R_PTE_C; /* Set change bit */
> -    } else {
> -        /*
> -         * Treat the page as read-only for now, so that a later write
> -         * will pass through this function again to set the C bit.
> -         */
> -        *prot &= ~PAGE_WRITE;
>      }
> +    if (pte == npte) {
> +        return pte;
> +    }
> +
> +#ifdef CONFIG_ATOMIC64
> +    if (qemu_tcg_mttcg_enabled()) {
> +        uint64_t old_be = cpu_to_be32(pte);
> +        uint64_t new_be = cpu_to_be32(npte);
> +        MemTxResult result;
> +        uint64_t old_ret;
> +
> +        old_ret = address_space_cmpxchgq_notdirty(cs->as, pte_addr,
> +                                                  old_be, new_be,
> +                                                  MEMTXATTRS_UNSPECIFIED,
> +                                                  &result);
> +        if (result == MEMTX_OK) {
> +            if (old_ret != old_be && old_ret != new_be) {
> +                return 0;
> +            }
> +            return npte;
> +        }
>  
> -    if (pte ^ npte) { /* If pte has changed then write it back */
> -        stq_phys(cs->as, pte_addr, npte);
> +        /* Do we need to support this case where PTEs aren't in RAM ?
> +         *
> +         * For now fallback to non-atomic case
> +         */
>      }
> +#endif
> +
> +    stq_phys(cs->as, pte_addr, npte);
> +    return npte;
>  }
>  
>  static uint64_t ppc_radix64_walk_tree(PowerPCCPU *cpu, vaddr eaddr,
> @@ -234,6 +271,7 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx,
>  
>      /* Walk Radix Tree from Process Table Entry to Convert EA to RA */
>      page_size = PRTBE_R_GET_RTS(prtbe0);
> + restart:
>      pte = ppc_radix64_walk_tree(cpu, eaddr & R_EADDR_MASK,
>                                  prtbe0 & PRTBE_R_RPDB, prtbe0 & PRTBE_R_RPDS,
>                                  &raddr, &page_size, &fault_cause, &pte_addr);
> @@ -244,8 +282,16 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr eaddr, int rwx,
>      }
>  
>      /* Update Reference and Change Bits */
> -    ppc_radix64_set_rc(cpu, rwx, pte, pte_addr, &prot);
> -
> +    if (ppc_radix64_hw_rc_updates(env)) {
> +        pte = ppc_radix64_set_rc(cpu, rwx, pte, pte_addr);
> +        if (!pte) {
> +            goto restart;
> +        }
> +    }
> +    /* If the page doesn't have C, treat it as read only */
> +    if (!(pte & R_PTE_C)) {
> +        prot &= ~PAGE_WRITE;
> +    }
>      tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr & TARGET_PAGE_MASK,
>                   prot, mmu_idx, 1UL << page_size);
>      return 0;


* Re: [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg Benjamin Herrenschmidt
@ 2018-12-14  0:05   ` Benjamin Herrenschmidt
  2018-12-14 11:11       ` [Qemu-riscv] " Philippe Mathieu-Daudé
  0 siblings, 1 reply; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-14  0:05 UTC (permalink / raw)
  To: Palmer Dabbelt; +Cc: Richard Henderson, Paolo Bonzini, qemu-devel

Note to RISC-V folks: you may want to adapt your code to do the same as
this, especially since, afaik, what you do today is endian-broken if host
and guest endianness differ.
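
Concretely, the compare-and-swap has to operate on the PTE's raw in-memory
(guest-endian) bytes, so host-order values must be converted first. The
pattern the i386 patch quoted below uses is roughly:

    uint32_t old_le = cpu_to_le32(orig_entry);
    uint32_t new_le = cpu_to_le32(orig_entry | bits);

    old_ret = address_space_cmpxchgl_notdirty(cs->as, addr, old_le, new_le,
                                              MEMTXATTRS_UNSPECIFIED, &result);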

Cheers,
Ben. 

On Fri, 2018-12-14 at 10:58 +1100, Benjamin Herrenschmidt wrote:
> Afaik, this isn't well documented (at least it wasn't when I last looked)
> but OSes such as Linux rely on this behaviour:
> 
> The HW updates to the page tables need to be done atomically with the
> checking of the present bit (and other permissions).
> 
> This is what allows Linux to do simple xchg of PTEs with 0 and assume
> the value read has "final" stable dirty and accessed bits (the TLB
> invalidation is deferred).
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  target/i386/excp_helper.c | 104 +++++++++++++++++++++++++++++---------
>  1 file changed, 80 insertions(+), 24 deletions(-)
> 
> diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
> index 49231f6b69..93fc24c011 100644
> --- a/target/i386/excp_helper.c
> +++ b/target/i386/excp_helper.c
> @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>  
>  #else
>  
> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
> +                                    uint64_t orig_entry, uint32_t bits)
> +{
> +    uint64_t new_entry = orig_entry | bits;
> +
> +    /* Write the updated bottom 32-bits */
> +    if (qemu_tcg_mttcg_enabled()) {
> +        uint32_t old_le = cpu_to_le32(orig_entry);
> +        uint32_t new_le = cpu_to_le32(new_entry);
> +        MemTxResult result;
> +        uint32_t old_ret;
> +
> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
> +                                                  old_le, new_le,
> +                                                  MEMTXATTRS_UNSPECIFIED,
> +                                                  &result);
> +        if (result == MEMTX_OK) {
> +            if (old_ret != old_le && old_ret != new_le) {
> +                new_entry = 0;
> +            }
> +            return new_entry;
> +        }
> +
> +        /* Do we need to support this case where PTEs aren't in RAM ?
> +         *
> +         * For now fallback to non-atomic case
> +         */
> +    }
> +
> +    x86_stl_phys_notdirty(cs, addr, new_entry);
> +
> +    return new_entry;
> +}
> +
>  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>                          int *prot)
>  {
>      CPUX86State *env = &X86_CPU(cs)->env;
> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> +    uint64_t rsvd_mask;
>      uint64_t ptep, pte;
>      uint64_t exit_info_1 = 0;
>      target_ulong pde_addr, pte_addr;
> @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          return gphys;
>      }
>  
> + restart:
> +    rsvd_mask = PG_HI_RSVD_MASK;
>      if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
>          rsvd_mask |= PG_NX_MASK;
>      }
> @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>                  goto do_fault_rsvd;
>              }
>              if (!(pml4e & PG_ACCESSED_MASK)) {
> -                pml4e |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> +                if (!pml4e) {
> +                    goto restart;
> +                }
>              }
>              ptep &= pml4e ^ PG_NX_MASK;
>              pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
> @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>              }
>              ptep &= pdpe ^ PG_NX_MASK;
>              if (!(pdpe & PG_ACCESSED_MASK)) {
> -                pdpe |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> +                if (!pdpe) {
> +                    goto restart;
> +                }
>              }
>              if (pdpe & PG_PSE_MASK) {
>                  /* 1 GB page */
> @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          }
>          /* 4 KB page */
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>          pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
>          pte = x86_ldq_phys(cs, pte_addr);
> @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>          }
>  
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>  
>          /* page directory entry */
> @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>      int error_code = 0;
>      int is_dirty, prot, page_size, is_write, is_user;
>      hwaddr paddr;
> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> +    uint64_t rsvd_mask;
>      uint32_t page_offset;
>      target_ulong vaddr;
>  
> @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          goto do_mapping;
>      }
>  
> + restart:
> +    rsvd_mask = PG_HI_RSVD_MASK;
>      if (!(env->efer & MSR_EFER_NXE)) {
>          rsvd_mask |= PG_NX_MASK;
>      }
> @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>                      goto do_fault_rsvd;
>                  }
>                  if (!(pml5e & PG_ACCESSED_MASK)) {
> -                    pml5e |= PG_ACCESSED_MASK;
> -                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
> +                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
> +                    if (!pml5e) {
> +                        goto restart;
> +                    }
>                  }
>                  ptep = pml5e ^ PG_NX_MASK;
>              } else {
> @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>                  goto do_fault_rsvd;
>              }
>              if (!(pml4e & PG_ACCESSED_MASK)) {
> -                pml4e |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
> +                if (!pml4e) {
> +                    goto restart;
> +                }
>              }
>              ptep &= pml4e ^ PG_NX_MASK;
>              pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
> @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>              }
>              ptep &= pdpe ^ PG_NX_MASK;
>              if (!(pdpe & PG_ACCESSED_MASK)) {
> -                pdpe |= PG_ACCESSED_MASK;
> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
> +                if (!pdpe) {
> +                    goto restart;
> +                }
>              }
>              if (pdpe & PG_PSE_MASK) {
>                  /* 1 GB page */
> @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          }
>          /* 4 KB page */
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>          pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
>              a20_mask;
> @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>          }
>  
>          if (!(pde & PG_ACCESSED_MASK)) {
> -            pde |= PG_ACCESSED_MASK;
> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
> +            if (!pde) {
> +                goto restart;
> +            }
>          }
>  
>          /* page directory entry */
> @@ -634,11 +690,11 @@ do_check_protect_pse36:
>      /* yes, it can! */
>      is_dirty = is_write && !(pte & PG_DIRTY_MASK);
>      if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
> -        pte |= PG_ACCESSED_MASK;
> -        if (is_dirty) {
> -            pte |= PG_DIRTY_MASK;
> +        pte = update_entry(cs, pte_addr, pte,
> +                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
> +        if (!pte) {
> +            goto restart;
>          }
> -        x86_stl_phys_notdirty(cs, pte_addr, pte);
>      }
>  
>      if (!(pte & PG_DIRTY_MASK)) {


* Re: [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates
  2018-12-13 23:58 [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Benjamin Herrenschmidt
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg Benjamin Herrenschmidt
  2018-12-13 23:58 ` [Qemu-devel] [PATCH 3/3] ppc: Fix radix RC updates Benjamin Herrenschmidt
@ 2018-12-14  3:01 ` Richard Henderson
  2018-12-14  3:54   ` Benjamin Herrenschmidt
  2 siblings, 1 reply; 9+ messages in thread
From: Richard Henderson @ 2018-12-14  3:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, qemu-devel; +Cc: Paolo Bonzini

On 12/13/18 5:58 PM, Benjamin Herrenschmidt wrote:
> +#ifdef CONFIG_ATOMIC64
> +/* This is meant to be used for atomic PTE updates under MT-TCG */
> +uint32_t glue(address_space_cmpxchgq_notdirty, SUFFIX)(ARG1_DECL,
> +    hwaddr addr, uint64_t old, uint64_t new, MemTxAttrs attrs, MemTxResult *result)
> +{
> +    uint8_t *ptr;
> +    MemoryRegion *mr;
> +    hwaddr l = 8;
> +    hwaddr addr1;
> +    MemTxResult r;
> +    uint8_t dirty_log_mask;
> +
> +    /* Must test result */
> +    assert(result);
> +
> +    RCU_READ_LOCK();
> +    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> +    if (l < 8 || !memory_access_is_direct(mr, true)) {
> +        r = MEMTX_ERROR;
> +    } else {
> +        uint32_t orig = old;
> +
> +        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> +        old = atomic_cmpxchg(ptr, orig, new);
> +

I think you need atomic_cmpxchg__nocheck here.

Failure would be with a 32-bit host that supports ATOMIC64.
E.g. i686.
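
For context, this is roughly what the comments around these macros describe
(paraphrased rather than quoted, and the cast below is only illustrative):

    /* atomic_cmpxchg() carries a build-time assertion that the operand does
     * not exceed ATOMIC_REG_SIZE, i.e. what the host can do in a native
     * register; atomic_cmpxchg__nocheck() performs the same compare-exchange
     * without that assertion.  On a 32-bit host such as i686 the check
     * rejects a 64-bit operand even though CONFIG_ATOMIC64 is set, so the
     * 64-bit helper here would want something like: */
    old = atomic_cmpxchg__nocheck((uint64_t *)ptr, orig, new);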


r~


* Re: [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates
  2018-12-14  3:01 ` [Qemu-devel] [PATCH 1/3] memory_ldst: Add atomic ops for PTE updates Richard Henderson
@ 2018-12-14  3:54   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 9+ messages in thread
From: Benjamin Herrenschmidt @ 2018-12-14  3:54 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: Paolo Bonzini

On Thu, 2018-12-13 at 21:01 -0600, Richard Henderson wrote:
> On 12/13/18 5:58 PM, Benjamin Herrenschmidt wrote:
> > +#ifdef CONFIG_ATOMIC64
> > +/* This is meant to be used for atomic PTE updates under MT-TCG */
> > +uint32_t glue(address_space_cmpxchgq_notdirty, SUFFIX)(ARG1_DECL,
> > +    hwaddr addr, uint64_t old, uint64_t new, MemTxAttrs attrs, MemTxResult *result)
> > +{
> > +    uint8_t *ptr;
> > +    MemoryRegion *mr;
> > +    hwaddr l = 8;
> > +    hwaddr addr1;
> > +    MemTxResult r;
> > +    uint8_t dirty_log_mask;
> > +
> > +    /* Must test result */
> > +    assert(result);
> > +
> > +    RCU_READ_LOCK();
> > +    mr = TRANSLATE(addr, &addr1, &l, true, attrs);
> > +    if (l < 8 || !memory_access_is_direct(mr, true)) {
> > +        r = MEMTX_ERROR;
> > +    } else {
> > +        uint32_t orig = old;
> > +
> > +        ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
> > +        old = atomic_cmpxchg(ptr, orig, new);
> > +
> 
> I think you need atomic_cmpxchg__nocheck here.
> 
> Failure would be with a 32-bit host that supports ATOMIC64.
> E.g. i686.

I'm confused by this and the comments around the definition of
ATOMIC_REG_SIZE :)

So would we have CONFIG_ATOMIC64 in that case, and if so, why, when all the
atomic_* end up barfing?

Or rather, why set CONFIG_ATOMIC64 if we ought not to use 64-bit
atomics?

Also we should probably define ATOMIC_REG_SIZE to 8 for ppc64...

Cheers
Ben.
> 
> r~


* Re: [Qemu-devel] [PATCH 2/3] i386: Atomically update PTEs with mttcg
  2018-12-14  0:05   ` Benjamin Herrenschmidt
@ 2018-12-14 11:11       ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 9+ messages in thread
From: Philippe Mathieu-Daudé @ 2018-12-14 11:11 UTC (permalink / raw)
  To: Michael Clark, Palmer Dabbelt
  Cc: Paolo Bonzini, Richard Henderson, qemu-devel, qemu-riscv

On 12/14/18 1:05 AM, Benjamin Herrenschmidt wrote:
> Note to RISC-V folks: you may want to adapt your code to do the same as
> this, especially since, afaik, what you do today is endian-broken if host
> and guest endianness differ.

Cc'ing the RISC-V list.

> Cheers,
> Ben. 
> 
> On Fri, 2018-12-14 at 10:58 +1100, Benjamin Herrenschmidt wrote:
>> Afaik, this isn't well documented (at least it wasn't when I last looked)
>> but OSes such as Linux rely on this behaviour:
>>
>> The HW updates to the page tables need to be done atomically with the
>> checking of the present bit (and other permissions).
>>
>> This is what allows Linux to do simple xchg of PTEs with 0 and assume
>> the value read has "final" stable dirty and accessed bits (the TLB
>> invalidation is deferred).
>>
>> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> ---
>>  target/i386/excp_helper.c | 104 +++++++++++++++++++++++++++++---------
>>  1 file changed, 80 insertions(+), 24 deletions(-)
>>
>> diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
>> index 49231f6b69..93fc24c011 100644
>> --- a/target/i386/excp_helper.c
>> +++ b/target/i386/excp_helper.c
>> @@ -157,11 +157,45 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>  
>>  #else
>>  
>> +static inline uint64_t update_entry(CPUState *cs, target_ulong addr,
>> +                                    uint64_t orig_entry, uint32_t bits)
>> +{
>> +    uint64_t new_entry = orig_entry | bits;
>> +
>> +    /* Write the updated bottom 32-bits */
>> +    if (qemu_tcg_mttcg_enabled()) {
>> +        uint32_t old_le = cpu_to_le32(orig_entry);
>> +        uint32_t new_le = cpu_to_le32(new_entry);
>> +        MemTxResult result;
>> +        uint32_t old_ret;
>> +
>> +        old_ret = address_space_cmpxchgl_notdirty(cs->as, addr,
>> +                                                  old_le, new_le,
>> +                                                  MEMTXATTRS_UNSPECIFIED,
>> +                                                  &result);
>> +        if (result == MEMTX_OK) {
>> +            if (old_ret != old_le && old_ret != new_le) {
>> +                new_entry = 0;
>> +            }
>> +            return new_entry;
>> +        }
>> +
>> +        /* Do we need to support this case where PTEs aren't in RAM ?
>> +         *
>> +         * For now fallback to non-atomic case
>> +         */
>> +    }
>> +
>> +    x86_stl_phys_notdirty(cs, addr, new_entry);
>> +
>> +    return new_entry;
>> +}
>> +
>>  static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>                          int *prot)
>>  {
>>      CPUX86State *env = &X86_CPU(cs)->env;
>> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>> +    uint64_t rsvd_mask;
>>      uint64_t ptep, pte;
>>      uint64_t exit_info_1 = 0;
>>      target_ulong pde_addr, pte_addr;
>> @@ -172,6 +206,8 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>          return gphys;
>>      }
>>  
>> + restart:
>> +    rsvd_mask = PG_HI_RSVD_MASK;
>>      if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
>>          rsvd_mask |= PG_NX_MASK;
>>      }
>> @@ -198,8 +234,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>                  goto do_fault_rsvd;
>>              }
>>              if (!(pml4e & PG_ACCESSED_MASK)) {
>> -                pml4e |= PG_ACCESSED_MASK;
>> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
>> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
>> +                if (!pml4e) {
>> +                    goto restart;
>> +                }
>>              }
>>              ptep &= pml4e ^ PG_NX_MASK;
>>              pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
>> @@ -213,8 +251,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>              }
>>              ptep &= pdpe ^ PG_NX_MASK;
>>              if (!(pdpe & PG_ACCESSED_MASK)) {
>> -                pdpe |= PG_ACCESSED_MASK;
>> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
>> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
>> +                if (!pdpe) {
>> +                    goto restart;
>> +                }
>>              }
>>              if (pdpe & PG_PSE_MASK) {
>>                  /* 1 GB page */
>> @@ -256,8 +296,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>          }
>>          /* 4 KB page */
>>          if (!(pde & PG_ACCESSED_MASK)) {
>> -            pde |= PG_ACCESSED_MASK;
>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>> +            if (!pde) {
>> +                goto restart;
>> +            }
>>          }
>>          pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
>>          pte = x86_ldq_phys(cs, pte_addr);
>> @@ -295,8 +337,10 @@ static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>          }
>>  
>>          if (!(pde & PG_ACCESSED_MASK)) {
>> -            pde |= PG_ACCESSED_MASK;
>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>> +            if (!pde) {
>> +                goto restart;
>> +            }
>>          }
>>  
>>          /* page directory entry */
>> @@ -376,7 +420,7 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>      int error_code = 0;
>>      int is_dirty, prot, page_size, is_write, is_user;
>>      hwaddr paddr;
>> -    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>> +    uint64_t rsvd_mask;
>>      uint32_t page_offset;
>>      target_ulong vaddr;
>>  
>> @@ -401,6 +445,8 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>          goto do_mapping;
>>      }
>>  
>> + restart:
>> +    rsvd_mask = PG_HI_RSVD_MASK;
>>      if (!(env->efer & MSR_EFER_NXE)) {
>>          rsvd_mask |= PG_NX_MASK;
>>      }
>> @@ -436,8 +482,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>                      goto do_fault_rsvd;
>>                  }
>>                  if (!(pml5e & PG_ACCESSED_MASK)) {
>> -                    pml5e |= PG_ACCESSED_MASK;
>> -                    x86_stl_phys_notdirty(cs, pml5e_addr, pml5e);
>> +                    pml5e = update_entry(cs, pml5e_addr, pml5e, PG_ACCESSED_MASK);
>> +                    if (!pml5e) {
>> +                        goto restart;
>> +                    }
>>                  }
>>                  ptep = pml5e ^ PG_NX_MASK;
>>              } else {
>> @@ -456,8 +504,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>                  goto do_fault_rsvd;
>>              }
>>              if (!(pml4e & PG_ACCESSED_MASK)) {
>> -                pml4e |= PG_ACCESSED_MASK;
>> -                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
>> +                pml4e = update_entry(cs, pml4e_addr, pml4e, PG_ACCESSED_MASK);
>> +                if (!pml4e) {
>> +                    goto restart;
>> +                }
>>              }
>>              ptep &= pml4e ^ PG_NX_MASK;
>>              pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
>> @@ -472,8 +522,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>              }
>>              ptep &= pdpe ^ PG_NX_MASK;
>>              if (!(pdpe & PG_ACCESSED_MASK)) {
>> -                pdpe |= PG_ACCESSED_MASK;
>> -                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
>> +                pdpe = update_entry(cs, pdpe_addr, pdpe, PG_ACCESSED_MASK);
>> +                if (!pdpe) {
>> +                    goto restart;
>> +                }
>>              }
>>              if (pdpe & PG_PSE_MASK) {
>>                  /* 1 GB page */
>> @@ -520,8 +572,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>          }
>>          /* 4 KB page */
>>          if (!(pde & PG_ACCESSED_MASK)) {
>> -            pde |= PG_ACCESSED_MASK;
>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>> +            if (!pde) {
>> +                goto restart;
>> +            }
>>          }
>>          pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
>>              a20_mask;
>> @@ -563,8 +617,10 @@ int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
>>          }
>>  
>>          if (!(pde & PG_ACCESSED_MASK)) {
>> -            pde |= PG_ACCESSED_MASK;
>> -            x86_stl_phys_notdirty(cs, pde_addr, pde);
>> +            pde = update_entry(cs, pde_addr, pde, PG_ACCESSED_MASK);
>> +            if (!pde) {
>> +                goto restart;
>> +            }
>>          }
>>  
>>          /* page directory entry */
>> @@ -634,11 +690,11 @@ do_check_protect_pse36:
>>      /* yes, it can! */
>>      is_dirty = is_write && !(pte & PG_DIRTY_MASK);
>>      if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
>> -        pte |= PG_ACCESSED_MASK;
>> -        if (is_dirty) {
>> -            pte |= PG_DIRTY_MASK;
>> +        pte = update_entry(cs, pte_addr, pte,
>> +                           PG_ACCESSED_MASK | (is_dirty ? PG_DIRTY_MASK : 0));
>> +        if (!pte) {
>> +            goto restart;
>>          }
>> -        x86_stl_phys_notdirty(cs, pte_addr, pte);
>>      }
>>  
>>      if (!(pte & PG_DIRTY_MASK)) {
> 
> 


