All of lore.kernel.org
 help / color / mirror / Atom feed
From: Richard Henderson <richard.henderson@linaro.org>
To: qemu-devel@nongnu.org
Cc: alex.bennee@linaro.org, laurent@vivier.eu, pbonzini@redhat.com,
	imp@bsdimp.com, f4bug@amsat.org
Subject: [PATCH 22/24] accel/tcg: Use interval tree for user-only page tracking
Date: Wed,  5 Oct 2022 20:11:11 -0700	[thread overview]
Message-ID: <20221006031113.1139454-23-richard.henderson@linaro.org> (raw)
In-Reply-To: <20221006031113.1139454-1-richard.henderson@linaro.org>

Finish weaning user-only away from PageDesc.

Using an interval tree to track page permissions means that
we can represent very large regions efficiently.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/internal.h           |   4 +-
 accel/tcg/tb-maint.c           |  20 +-
 accel/tcg/user-exec.c          | 614 ++++++++++++++++++++++-----------
 tests/tcg/multiarch/test-vma.c |  22 ++
 4 files changed, 451 insertions(+), 209 deletions(-)
 create mode 100644 tests/tcg/multiarch/test-vma.c

diff --git a/accel/tcg/internal.h b/accel/tcg/internal.h
index 250f0daac9..c7e157d1cd 100644
--- a/accel/tcg/internal.h
+++ b/accel/tcg/internal.h
@@ -24,9 +24,7 @@
 #endif
 
 typedef struct PageDesc {
-#ifdef CONFIG_USER_ONLY
-    unsigned long flags;
-#else
+#ifndef CONFIG_USER_ONLY
     QemuSpin lock;
     /* list of TBs intersecting this ram page */
     uintptr_t first_tb;
diff --git a/accel/tcg/tb-maint.c b/accel/tcg/tb-maint.c
index 14e8e47a6a..694440cb4a 100644
--- a/accel/tcg/tb-maint.c
+++ b/accel/tcg/tb-maint.c
@@ -68,15 +68,23 @@ static void page_flush_tb(void)
 /* Call with mmap_lock held. */
 static void tb_page_add(TranslationBlock *tb, PageDesc *p1, PageDesc *p2)
 {
-    /* translator_loop() must have made all TB pages non-writable */
-    assert(!(p1->flags & PAGE_WRITE));
-    if (p2) {
-        assert(!(p2->flags & PAGE_WRITE));
-    }
+    target_ulong addr;
+    int flags;
 
     assert_memory_lock();
-
     tb->itree.last = tb->itree.start + tb->size - 1;
+
+    /* translator_loop() must have made all TB pages non-writable */
+    addr = tb_page_addr0(tb);
+    flags = page_get_flags(addr);
+    assert(!(flags & PAGE_WRITE));
+
+    addr = tb_page_addr1(tb);
+    if (addr != -1) {
+        flags = page_get_flags(addr);
+        assert(!(flags & PAGE_WRITE));
+    }
+
     interval_tree_insert(&tb->itree, &tb_root);
 }
 
diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index b6050e2bfe..3b47a9e26a 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -135,106 +135,61 @@ bool handle_sigsegv_accerr_write(CPUState *cpu, sigset_t *old_set,
     }
 }
 
-/*
- * Walks guest process memory "regions" one by one
- * and calls callback function 'fn' for each region.
- */
-struct walk_memory_regions_data {
-    walk_memory_regions_fn fn;
-    void *priv;
-    target_ulong start;
-    int prot;
-};
+typedef struct PageFlagsNode {
+    IntervalTreeNode itree;
+    int flags;
+} PageFlagsNode;
 
-static int walk_memory_regions_end(struct walk_memory_regions_data *data,
-                                   target_ulong end, int new_prot)
+static IntervalTreeRoot pageflags_root;
+
+static PageFlagsNode *pageflags_find(target_ulong start, target_long last)
 {
-    if (data->start != -1u) {
-        int rc = data->fn(data->priv, data->start, end, data->prot);
-        if (rc != 0) {
-            return rc;
-        }
-    }
+    IntervalTreeNode *n;
 
-    data->start = (new_prot ? end : -1u);
-    data->prot = new_prot;
-
-    return 0;
+    n = interval_tree_iter_first(&pageflags_root, start, last);
+    return n ? container_of(n, PageFlagsNode, itree) : NULL;
 }
 
-static int walk_memory_regions_1(struct walk_memory_regions_data *data,
-                                 target_ulong base, int level, void **lp)
+static PageFlagsNode *pageflags_next(PageFlagsNode *p, target_ulong start,
+                                     target_long last)
 {
-    target_ulong pa;
-    int i, rc;
+    IntervalTreeNode *n;
 
-    if (*lp == NULL) {
-        return walk_memory_regions_end(data, base, 0);
-    }
-
-    if (level == 0) {
-        PageDesc *pd = *lp;
-
-        for (i = 0; i < V_L2_SIZE; ++i) {
-            int prot = pd[i].flags;
-
-            pa = base | (i << TARGET_PAGE_BITS);
-            if (prot != data->prot) {
-                rc = walk_memory_regions_end(data, pa, prot);
-                if (rc != 0) {
-                    return rc;
-                }
-            }
-        }
-    } else {
-        void **pp = *lp;
-
-        for (i = 0; i < V_L2_SIZE; ++i) {
-            pa = base | ((target_ulong)i <<
-                (TARGET_PAGE_BITS + V_L2_BITS * level));
-            rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
-            if (rc != 0) {
-                return rc;
-            }
-        }
-    }
-
-    return 0;
+    n = interval_tree_iter_next(&p->itree, start, last);
+    return n ? container_of(n, PageFlagsNode, itree) : NULL;
 }
 
 int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
 {
-    struct walk_memory_regions_data data;
-    uintptr_t i, l1_sz = v_l1_size;
+    IntervalTreeNode *n;
+    int rc = 0;
 
-    data.fn = fn;
-    data.priv = priv;
-    data.start = -1u;
-    data.prot = 0;
+    mmap_lock();
+    for (n = interval_tree_iter_first(&pageflags_root, 0, -1);
+         n != NULL;
+         n = interval_tree_iter_next(n, 0, -1)) {
+        PageFlagsNode *p = container_of(n, PageFlagsNode, itree);
 
-    for (i = 0; i < l1_sz; i++) {
-        target_ulong base = i << (v_l1_shift + TARGET_PAGE_BITS);
-        int rc = walk_memory_regions_1(&data, base, v_l2_levels, l1_map + i);
+        rc = fn(priv, n->start, n->last + 1, p->flags);
         if (rc != 0) {
-            return rc;
+            break;
         }
     }
+    mmap_unlock();
 
-    return walk_memory_regions_end(&data, 0, 0);
+    return rc;
 }
 
 static int dump_region(void *priv, target_ulong start,
-    target_ulong end, unsigned long prot)
+                       target_ulong end, unsigned long prot)
 {
     FILE *f = (FILE *)priv;
 
-    (void) fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx
-        " "TARGET_FMT_lx" %c%c%c\n",
-        start, end, end - start,
-        ((prot & PAGE_READ) ? 'r' : '-'),
-        ((prot & PAGE_WRITE) ? 'w' : '-'),
-        ((prot & PAGE_EXEC) ? 'x' : '-'));
-
+    fprintf(f, TARGET_FMT_lx"-"TARGET_FMT_lx" "TARGET_FMT_lx" %c%c%c\n",
+            start, end, end - start,
+            ((prot & PAGE_READ) ? 'r' : '-'),
+            ((prot & PAGE_WRITE) ? 'w' : '-'),
+            ((prot & PAGE_EXEC) ? 'x' : '-'));
     return 0;
 }
 
@@ -242,22 +197,134 @@ static int dump_region(void *priv, target_ulong start,
 void page_dump(FILE *f)
 {
     const int length = sizeof(target_ulong) * 2;
-    (void) fprintf(f, "%-*s %-*s %-*s %s\n",
+
+    fprintf(f, "%-*s %-*s %-*s %s\n",
             length, "start", length, "end", length, "size", "prot");
     walk_memory_regions(f, dump_region);
 }
 
 int page_get_flags(target_ulong address)
 {
-    PageDesc *p;
+    PageFlagsNode *p = pageflags_find(address, address);
 
-    p = page_find(address >> TARGET_PAGE_BITS);
+    /*
+     * See util/interval-tree.c re lockless lookups: no false positives but
+     * there are false negatives.  If we find nothing, retry with the mmap
+     * lock acquired.
+     */
     if (!p) {
-        return 0;
+        if (have_mmap_lock()) {
+            return 0;
+        }
+        mmap_lock();
+        p = pageflags_find(address, address);
+        mmap_unlock();
+        if (!p) {
+            return 0;
+        }
     }
     return p->flags;
 }
 
+/* A subroutine of page_set_flags: insert a new node for [start,last]. */
+static void pageflags_create(target_ulong start, target_ulong last, int flags)
+{
+    PageFlagsNode *p = g_new(PageFlagsNode, 1);
+
+    p->itree.start = start;
+    p->itree.last = last;
+    p->flags = flags;
+    interval_tree_insert(&p->itree, &pageflags_root);
+}
+
+/* A subroutine of page_set_flags: remove everything in [start,last]. */
+static bool pageflags_unset(target_ulong start, target_ulong last)
+{
+    bool inval_tb = false;
+
+    while (true) {
+        PageFlagsNode *p = pageflags_find(start, last);
+        target_ulong p_last;
+
+        if (!p) {
+            break;
+        }
+
+        if (p->flags & PAGE_EXEC) {
+            inval_tb = true;
+        }
+
+        interval_tree_remove(&p->itree, &pageflags_root);
+        p_last = p->itree.last;
+
+        if (p->itree.start < start) {
+            /* Truncate the node from the end, or split out the middle. */
+            p->itree.last = start - 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+            if (last < p_last) {
+                pageflags_create(last + 1, p_last, p->flags);
+                break;
+            }
+        } else if (p_last <= last) {
+            /* Range completely covers node -- remove it. */
+            g_free(p);
+        } else {
+            /* Truncate the node from the start. */
+            p->itree.start = last + 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+            break;
+        }
+    }
+
+    return inval_tb;
+}
+
+/*
+ * A subroutine of page_set_flags: nothing overlaps [start,last],
+ * but check adjacent mappings and maybe merge into a single range.
+ */
+static void pageflags_create_merge(target_ulong start, target_ulong last,
+                                   int flags)
+{
+    PageFlagsNode *next = NULL, *prev = NULL;
+
+    if (start > 0) {
+        prev = pageflags_find(start - 1, start - 1);
+        if (prev) {
+            if (prev->flags == flags) {
+                interval_tree_remove(&prev->itree, &pageflags_root);
+            } else {
+                prev = NULL;
+            }
+        }
+    }
+    if (last + 1 != 0) {
+        next = pageflags_find(last + 1, last + 1);
+        if (next) {
+            if (next->flags == flags) {
+                interval_tree_remove(&next->itree, &pageflags_root);
+            } else {
+                next = NULL;
+            }
+        }
+    }
+
+    if (prev) {
+        if (next) {
+            prev->itree.last = next->itree.last;
+            g_free(next);
+        } else {
+            prev->itree.last = last;
+        }
+        interval_tree_insert(&prev->itree, &pageflags_root);
+    } else if (next) {
+        next->itree.start = start;
+        interval_tree_insert(&next->itree, &pageflags_root);
+    } else {
+        pageflags_create(start, last, flags);
+    }
+}
+
 /*
  * Allow the target to decide if PAGE_TARGET_[12] may be reset.
  * By default, they are not kept.
@@ -267,6 +334,146 @@ int page_get_flags(target_ulong address)
 #endif
 #define PAGE_STICKY  (PAGE_ANON | PAGE_PASSTHROUGH | PAGE_TARGET_STICKY)
 
+/* A subroutine of page_set_flags: add flags to [start,last]. */
+static bool pageflags_set_clear(target_ulong start, target_ulong last,
+                                int set_flags, int clear_flags)
+{
+    PageFlagsNode *p;
+    target_ulong p_start, p_last;
+    int p_flags, merge_flags;
+    bool inval_tb = false;
+
+ restart:
+    p = pageflags_find(start, last);
+    if (!p) {
+        if (set_flags) {
+            pageflags_create_merge(start, last, set_flags);
+        }
+        goto done;
+    }
+
+    p_start = p->itree.start;
+    p_last = p->itree.last;
+    p_flags = p->flags;
+    /* Using mprotect on a page does not change sticky bits. */
+    merge_flags = (p_flags & ~clear_flags) | set_flags;
+
+    /*
+     * Need to flush if an overlapping executable region
+     * removes exec, or adds write.
+     */
+    if ((p_flags & PAGE_EXEC)
+        && (!(merge_flags & PAGE_EXEC)
+            || (merge_flags & ~p_flags & PAGE_WRITE))) {
+        inval_tb = true;
+    }
+
+    /*
+     * If there is an exact range match, update and return without
+     * attempting to merge with adjacent regions.
+     */
+    if (start == p_start && last == p_last) {
+        if (merge_flags) {
+            p->flags = merge_flags;
+        } else {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            g_free(p);
+        }
+        goto done;
+    }
+
+    /*
+     * If sticky bits affect the original mapping, then we must be more
+     * careful about the existing intervals and the separate flags.
+     */
+    if (set_flags != merge_flags) {
+        if (p_start < start) {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            p->itree.last = start - 1;
+            interval_tree_insert(&p->itree, &pageflags_root);
+
+            if (last < p_last) {
+                if (merge_flags) {
+                    pageflags_create(start, last, merge_flags);
+                }
+                pageflags_create(last + 1, p_last, p_flags);
+            } else {
+                if (merge_flags) {
+                    pageflags_create(start, p_last, merge_flags);
+                }
+                if (p_last < last) {
+                    start = p_last + 1;
+                    goto restart;
+                }
+            }
+        } else {
+            if (start < p_start && set_flags) {
+                pageflags_create(start, p_start - 1, set_flags);
+            }
+            if (last < p_last) {
+                interval_tree_remove(&p->itree, &pageflags_root);
+                p->itree.start = last + 1;
+                interval_tree_insert(&p->itree, &pageflags_root);
+                if (merge_flags) {
+                    pageflags_create(start, last, merge_flags);
+                }
+            } else {
+                if (merge_flags) {
+                    p->flags = merge_flags;
+                } else {
+                    interval_tree_remove(&p->itree, &pageflags_root);
+                    g_free(p);
+                }
+                if (p_last < last) {
+                    start = p_last + 1;
+                    goto restart;
+                }
+            }
+        }
+        goto done;
+    }
+
+    /* If flags are not changing for this range, incorporate it. */
+    if (set_flags == p_flags) {
+        if (start < p_start) {
+            interval_tree_remove(&p->itree, &pageflags_root);
+            p->itree.start = start;
+            interval_tree_insert(&p->itree, &pageflags_root);
+        }
+        if (p_last < last) {
+            start = p_last + 1;
+            goto restart;
+        }
+        goto done;
+    }
+
+    /* Maybe split out head and/or tail ranges with the original flags. */
+    interval_tree_remove(&p->itree, &pageflags_root);
+    if (p_start < start) {
+        p->itree.last = start - 1;
+        interval_tree_insert(&p->itree, &pageflags_root);
+
+        if (p_last < last) {
+            goto restart;
+        }
+        if (last < p_last) {
+            pageflags_create(last + 1, p_last, p_flags);
+        }
+    } else if (last < p_last) {
+        p->itree.start = last + 1;
+        interval_tree_insert(&p->itree, &pageflags_root);
+    } else {
+        g_free(p);
+        goto restart;
+    }
+    if (set_flags) {
+        pageflags_create(start, last, set_flags);
+    }
+
+ done:
+    return inval_tb;
+}
+
 /*
  * Modify the flags of a page and invalidate the code if necessary.
  * The flag PAGE_WRITE_ORG is positioned automatically depending
@@ -274,49 +481,41 @@ int page_get_flags(target_ulong address)
  */
 void page_set_flags(target_ulong start, target_ulong end, int flags)
 {
-    target_ulong addr, len;
-    bool reset, inval_tb = false;
+    target_ulong last;
+    bool reset = false;
+    bool inval_tb = false;
 
     /* This function should never be called with addresses outside the
        guest address space.  If this assert fires, it probably indicates
        a missing call to h2g_valid.  */
-    assert(end - 1 <= GUEST_ADDR_MAX);
     assert(start < end);
+    assert(end - 1 <= GUEST_ADDR_MAX);
     /* Only set PAGE_ANON with new mappings. */
     assert(!(flags & PAGE_ANON) || (flags & PAGE_RESET));
     assert_memory_lock();
 
     start = start & TARGET_PAGE_MASK;
     end = TARGET_PAGE_ALIGN(end);
+    last = end - 1;
 
-    if (flags & PAGE_WRITE) {
-        flags |= PAGE_WRITE_ORG;
-    }
-    reset = !(flags & PAGE_VALID) || (flags & PAGE_RESET);
-    if (reset) {
-        page_reset_target_data(start, end);
-    }
-    flags &= ~PAGE_RESET;
-
-    for (addr = start, len = end - start;
-         len != 0;
-         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
-        PageDesc *p = page_find_alloc(addr >> TARGET_PAGE_BITS, true);
-
-        /*
-         * If the page was executable, but is reset, or is no longer
-         * executable, or has become writable, then invalidate any code.
-         */
-        if ((p->flags & PAGE_EXEC)
-            && (reset ||
-                !(flags & PAGE_EXEC) ||
-                (flags & ~p->flags & PAGE_WRITE))) {
-            inval_tb = true;
+    if (!(flags & PAGE_VALID)) {
+        flags = 0;
+    } else {
+        reset = flags & PAGE_RESET;
+        flags &= ~PAGE_RESET;
+        if (flags & PAGE_WRITE) {
+            flags |= PAGE_WRITE_ORG;
         }
-        /* Using mprotect on a page does not change sticky bits. */
-        p->flags = (reset ? 0 : p->flags & PAGE_STICKY) | flags;
     }
 
+    if (!flags || reset) {
+        page_reset_target_data(start, end);
+        inval_tb |= pageflags_unset(start, last);
+    }
+    if (flags) {
+        inval_tb |= pageflags_set_clear(start, last, flags,
+                                        ~(reset ? 0 : PAGE_STICKY));
+    }
     if (inval_tb) {
         tb_invalidate_phys_range(start, end);
     }
@@ -324,87 +523,89 @@ void page_set_flags(target_ulong start, target_ulong end, int flags)
 
 int page_check_range(target_ulong start, target_ulong len, int flags)
 {
-    PageDesc *p;
-    target_ulong end;
-    target_ulong addr;
-
-    /*
-     * This function should never be called with addresses outside the
-     * guest address space.  If this assert fires, it probably indicates
-     * a missing call to h2g_valid.
-     */
-    if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
-        assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
-    }
+    target_ulong last;
 
     if (len == 0) {
-        return 0;
-    }
-    if (start + len - 1 < start) {
-        /* We've wrapped around.  */
-        return -1;
+        return 0;  /* trivial length */
     }
 
-    /* must do before we loose bits in the next step */
-    end = TARGET_PAGE_ALIGN(start + len);
-    start = start & TARGET_PAGE_MASK;
+    last = start + len - 1;
+    if (last < start) {
+        return -1; /* wrap around */
+    }
+
+    while (true) {
+        PageFlagsNode *p = pageflags_find(start, last);
+        int missing;
 
-    for (addr = start, len = end - start;
-         len != 0;
-         len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
-        p = page_find(addr >> TARGET_PAGE_BITS);
         if (!p) {
-            return -1;
+            return -1; /* entire region invalid */
         }
-        if (!(p->flags & PAGE_VALID)) {
-            return -1;
+        if (start < p->itree.start) {
+            return -1; /* initial bytes invalid */
         }
 
-        if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
-            return -1;
+        missing = flags & ~p->flags;
+        if (missing & PAGE_READ) {
+            return -1; /* page not readable */
         }
-        if (flags & PAGE_WRITE) {
+        if (missing & PAGE_WRITE) {
             if (!(p->flags & PAGE_WRITE_ORG)) {
+                return -1; /* page not writable */
+            }
+            /* Asking about writable, but has been protected: undo. */
+            if (!page_unprotect(start, 0)) {
                 return -1;
             }
-            /* unprotect the page if it was put read-only because it
-               contains translated code */
-            if (!(p->flags & PAGE_WRITE)) {
-                if (!page_unprotect(addr, 0)) {
-                    return -1;
-                }
+            /* TODO: page_unprotect should take a range, not a single page. */
+            if (last - start < TARGET_PAGE_SIZE) {
+                return 0; /* ok */
             }
+            start += TARGET_PAGE_SIZE;
+            continue;
         }
+
+        if (last <= p->itree.last) {
+            return 0; /* ok */
+        }
+        start = p->itree.last + 1;
     }
-    return 0;
 }
 
-void page_protect(tb_page_addr_t page_addr)
+void page_protect(tb_page_addr_t address)
 {
-    target_ulong addr;
-    PageDesc *p;
+    PageFlagsNode *p;
+    target_ulong start, last;
     int prot;
 
-    p = page_find(page_addr >> TARGET_PAGE_BITS);
-    if (p && (p->flags & PAGE_WRITE)) {
-        /*
-         * Force the host page as non writable (writes will have a page fault +
-         * mprotect overhead).
-         */
-        page_addr &= qemu_host_page_mask;
-        prot = 0;
-        for (addr = page_addr; addr < page_addr + qemu_host_page_size;
-             addr += TARGET_PAGE_SIZE) {
+    assert_memory_lock();
 
-            p = page_find(addr >> TARGET_PAGE_BITS);
-            if (!p) {
-                continue;
-            }
+    if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+        start = address & TARGET_PAGE_MASK;
+        last = start + TARGET_PAGE_SIZE - 1;
+    } else {
+        start = address & qemu_host_page_mask;
+        last = start + qemu_host_page_size - 1;
+    }
+
+    p = pageflags_find(start, last);
+    if (!p) {
+        return;
+    }
+    prot = p->flags;
+
+    if (unlikely(p->itree.last < last)) {
+        /* More than one protection region covers the one host page. */
+        assert(TARGET_PAGE_SIZE < qemu_host_page_size);
+        while ((p = pageflags_next(p, start, last)) != NULL) {
             prot |= p->flags;
-            p->flags &= ~PAGE_WRITE;
         }
-        mprotect(g2h_untagged(page_addr), qemu_host_page_size,
-                 (prot & PAGE_BITS) & ~PAGE_WRITE);
+    }
+
+    if (prot & PAGE_WRITE) {
+        pageflags_set_clear(start, last, 0, PAGE_WRITE);
+        mprotect(g2h_untagged(start), qemu_host_page_size,
+                 prot & (PAGE_READ | PAGE_EXEC) ? PROT_READ : PROT_NONE);
     }
 }
 
@@ -417,10 +618,8 @@ void page_protect(tb_page_addr_t page_addr)
  */
 int page_unprotect(target_ulong address, uintptr_t pc)
 {
-    unsigned int prot;
+    PageFlagsNode *p;
     bool current_tb_invalidated;
-    PageDesc *p;
-    target_ulong host_start, host_end, addr;
 
     /*
      * Technically this isn't safe inside a signal handler.  However we
@@ -429,40 +628,54 @@ int page_unprotect(target_ulong address, uintptr_t pc)
      */
     mmap_lock();
 
-    p = page_find(address >> TARGET_PAGE_BITS);
-    if (!p) {
+    p = pageflags_find(address, address);
+
+    /* If this address was not really writable, nothing to do. */
+    if (!p || !(p->flags & PAGE_WRITE_ORG)) {
         mmap_unlock();
         return 0;
     }
 
-    /*
-     * If the page was really writable, then we change its
-     * protection back to writable.
-     */
-    if (p->flags & PAGE_WRITE_ORG) {
-        current_tb_invalidated = false;
-        if (p->flags & PAGE_WRITE) {
-            /*
-             * If the page is actually marked WRITE then assume this is because
-             * this thread raced with another one which got here first and
-             * set the page to PAGE_WRITE and did the TB invalidate for us.
-             */
+    current_tb_invalidated = false;
+    if (p->flags & PAGE_WRITE) {
+        /*
+         * If the page is actually marked WRITE then assume this is because
+         * this thread raced with another one which got here first and
+         * set the page to PAGE_WRITE and did the TB invalidate for us.
+         */
 #ifdef TARGET_HAS_PRECISE_SMC
-            TranslationBlock *current_tb = tcg_tb_lookup(pc);
-            if (current_tb) {
-                current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
-            }
+        TranslationBlock *current_tb = tcg_tb_lookup(pc);
+        if (current_tb) {
+            current_tb_invalidated = tb_cflags(current_tb) & CF_INVALID;
+        }
 #endif
+    } else {
+        target_ulong start, len, i;
+        int prot;
+
+        if (qemu_host_page_size <= TARGET_PAGE_SIZE) {
+            start = address & TARGET_PAGE_MASK;
+            len = TARGET_PAGE_SIZE;
+            prot = p->flags | PAGE_WRITE;
+            pageflags_set_clear(start, start + len - 1, PAGE_WRITE, 0);
+            current_tb_invalidated = tb_invalidate_phys_page_unwind(start, pc);
         } else {
-            host_start = address & qemu_host_page_mask;
-            host_end = host_start + qemu_host_page_size;
-
+            start = address & qemu_host_page_mask;
+            len = qemu_host_page_size;
             prot = 0;
-            for (addr = host_start; addr < host_end; addr += TARGET_PAGE_SIZE) {
-                p = page_find(addr >> TARGET_PAGE_BITS);
-                p->flags |= PAGE_WRITE;
-                prot |= p->flags;
 
+            for (i = 0; i < len; i += TARGET_PAGE_SIZE) {
+                target_ulong addr = start + i;
+
+                p = pageflags_find(addr, addr);
+                if (p) {
+                    prot |= p->flags;
+                    if (p->flags & PAGE_WRITE_ORG) {
+                        prot |= PAGE_WRITE;
+                        pageflags_set_clear(addr, addr + TARGET_PAGE_SIZE - 1,
+                                            PAGE_WRITE, 0);
+                    }
+                }
                 /*
                  * Since the content will be modified, we must invalidate
                  * the corresponding translated code.
@@ -470,15 +683,16 @@ int page_unprotect(target_ulong address, uintptr_t pc)
                 current_tb_invalidated |=
                     tb_invalidate_phys_page_unwind(addr, pc);
             }
-            mprotect((void *)g2h_untagged(host_start), qemu_host_page_size,
-                     prot & PAGE_BITS);
         }
-        mmap_unlock();
-        /* If current TB was invalidated return to main loop */
-        return current_tb_invalidated ? 2 : 1;
+        if (prot & PAGE_EXEC) {
+            prot = (prot & ~PAGE_EXEC) | PAGE_READ;
+        }
+        mprotect((void *)g2h_untagged(start), len, prot & PAGE_BITS);
     }
     mmap_unlock();
-    return 0;
+
+    /* If current TB was invalidated return to main loop */
+    return current_tb_invalidated ? 2 : 1;
 }
 
 static int probe_access_internal(CPUArchState *env, target_ulong addr,
diff --git a/tests/tcg/multiarch/test-vma.c b/tests/tcg/multiarch/test-vma.c
new file mode 100644
index 0000000000..2893d60334
--- /dev/null
+++ b/tests/tcg/multiarch/test-vma.c
@@ -0,0 +1,22 @@
+/*
+ * Test very large vma allocations.
+ * The qemu out-of-memory condition was within the mmap syscall itself.
+ * If the syscall actually returns with MAP_FAILED, the test succeeded.
+ */
+#include <sys/mman.h>
+
+int main()
+{
+    int n = sizeof(size_t) == 4 ? 32 : 45;
+
+    for (int i = 28; i < n; i++) {
+        size_t l = (size_t)1 << i;
+        void *p = mmap(0, l, PROT_NONE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
+        if (p == MAP_FAILED) {
+            break;
+        }
+        munmap(p, l);
+    }
+    return 0;
+}
-- 
2.34.1



  parent reply	other threads:[~2022-10-06  3:28 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-06  3:10 [PATCH 00/24] accel/tcg: Rewrite user-only vma tracking Richard Henderson
2022-10-06  3:10 ` [PATCH 01/24] util: Add interval-tree.c Richard Henderson
2022-10-25  8:40   ` Alex Bennée
2022-10-25 10:36     ` Richard Henderson
2022-10-06  3:10 ` [PATCH 02/24] accel/tcg: Make page_alloc_target_data allocation constant Richard Henderson
2022-10-25  8:45   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 03/24] accel/tcg: Remove disabled debug in translate-all.c Richard Henderson
2022-10-25  8:46   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 04/24] accel/tcg: Split out PageDesc to internal.h Richard Henderson
2022-10-25  8:47   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 05/24] accel/tcg: Split out tb-maint.c Richard Henderson
2022-10-25  8:49   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 06/24] accel/tcg: Move assert_no_pages_locked to internal.h Richard Henderson
2022-10-25  8:49   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 07/24] accel/tcg: Drop cpu_get_tb_cpu_state from TARGET_HAS_PRECISE_SMC Richard Henderson
2022-10-25  8:50   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 08/24] accel/tcg: Remove duplicate store to tb->page_addr[] Richard Henderson
2022-10-25  9:12   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 09/24] accel/tcg: Introduce tb_{set_}page_addr{0,1} Richard Henderson
2022-10-25  9:47   ` Alex Bennée
2022-10-06  3:10 ` [PATCH 10/24] accel/tcg: Rename tb_invalidate_phys_page Richard Henderson
2022-10-25  9:49   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 11/24] accel/tcg: Rename tb_invalidate_phys_page_range and drop end parameter Richard Henderson
2022-10-25 13:21   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 12/24] accel/tcg: Unify declarations of tb_invalidate_phys_range Richard Henderson
2022-10-25 13:24   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 13/24] accel/tcg: Use tb_invalidate_phys_page in page_set_flags Richard Henderson
2022-10-06  3:11 ` [PATCH 14/24] accel/tcg: Call tb_invalidate_phys_page for PAGE_RESET Richard Henderson
2022-10-25 15:42   ` Alex Bennée
2022-10-25 20:55     ` Richard Henderson
2022-10-06  3:11 ` [PATCH 15/24] accel/tcg: Use interval tree for TBs in user-only mode Richard Henderson
2022-10-25 15:58   ` Alex Bennée
2022-10-25 21:19     ` Richard Henderson
2022-10-06  3:11 ` [PATCH 16/24] accel/tcg: Use page_reset_target_data in page_set_flags Richard Henderson
2022-10-25 16:12   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 17/24] accel/tcg: Use tb_invalidate_phys_range " Richard Henderson
2022-10-25 16:14   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 18/24] accel/tcg: Move TARGET_PAGE_DATA_SIZE impl to user-exec.c Richard Henderson
2022-10-25 16:15   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 19/24] accel/tcg: Simplify page_get/alloc_target_data Richard Henderson
2022-10-25 16:19   ` Alex Bennée
2022-10-06  3:11 ` [PATCH 20/24] accel/tcg: Use interval tree for TARGET_PAGE_DATA_SIZE Richard Henderson
2022-10-25 19:30   ` Alex Bennée
2022-10-25 21:29     ` Richard Henderson
2022-10-06  3:11 ` [PATCH 21/24] accel/tcg: Move page_{get,set}_flags to user-exec.c Richard Henderson
2022-10-26 12:35   ` [PATCH 21/24] accel/tcg: Move page_{get, set}_flags " Alex Bennée
2022-10-06  3:11 ` Richard Henderson [this message]
2022-10-26 13:36   ` [PATCH 22/24] accel/tcg: Use interval tree for user-only page tracking Alex Bennée
2022-10-27 10:20     ` Richard Henderson
2022-10-27 15:59       ` Alex Bennée
2022-10-27 21:38         ` Richard Henderson
2022-10-06  3:11 ` [PATCH 23/24] accel/tcg: Move PageDesc tree into tb-maint.c for system Richard Henderson
2022-10-06  3:11 ` [PATCH 24/24] accel/tcg: Move remainder of page locking to tb-maint.c Richard Henderson
2022-10-26 13:48   ` Alex Bennée
2022-10-24 23:05 ` [PATCH 00/24] accel/tcg: Rewrite user-only vma tracking Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221006031113.1139454-23-richard.henderson@linaro.org \
    --to=richard.henderson@linaro.org \
    --cc=alex.bennee@linaro.org \
    --cc=f4bug@amsat.org \
    --cc=imp@bsdimp.com \
    --cc=laurent@vivier.eu \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.