All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide
@ 2013-11-11 16:40 Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 1/5] split definitions for exec.c and translate-all.c radix trees Michael S. Tsirkin
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:40 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Luiz Capitulino

At the moment, exec ignores high bits in each address,
for efficiency.
This is incorrect: devices can do full 64 bit DMA, it's
only the CPU that is limited by target address space.
Using full 64 bit addresses was clocked at a 12% performance
hit on a microbenchmark.
To solve, teach pagetables to skip bits at any level
and not just the lowest level.

This should solve the performance problem (only one line
of code changed on the data path).
I'm still trying to figure out how to measure speed
properly with TCG, sending this out for early feedback and flames.

Michael S. Tsirkin (3):
  exec: replace leaf with skip
  exec: extend skip field to 3 bits
  exec: memory radix tree page level compression

Paolo Bonzini (2):
  split definitions for exec.c and translate-all.c radix trees
  exec: make address spaces 64-bit wide

 translate-all.h |   7 ----
 exec.c          | 117 +++++++++++++++++++++++++++++++++++++++++++++++---------
 translate-all.c |  32 +++++++++-------
 3 files changed, 117 insertions(+), 39 deletions(-)

-- 
MST

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Qemu-devel] [PATCH RFC 1/5] split definitions for exec.c and translate-all.c radix trees
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
@ 2013-11-11 16:40 ` Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 2/5] exec: make address spaces 64-bit wide Michael S. Tsirkin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:40 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini

From: Paolo Bonzini <pbonzini@redhat.com>

The exec.c and translate-all.c radix trees are quite different, and
the exec.c one in particular is not limited to the CPU---it can be
used also by devices that do DMA, and in that case the address space
is not limited to TARGET_PHYS_ADDR_SPACE_BITS bits.

We want to make exec.c's radix trees 64-bit wide.  As a first step,
stop sharing the constants between exec.c and translate-all.c.
exec.c gets P_L2_* constants, translate-all.c gets V_L2_*, for
consistency with the existing V_L1_* symbols.  Though actually
in the softmmu case translate-all.c is also indexed by physical
addresses...

This patch has no semantic change.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 translate-all.h |  7 -------
 exec.c          | 29 +++++++++++++++++++++--------
 translate-all.c | 32 ++++++++++++++++++--------------
 3 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/translate-all.h b/translate-all.h
index 5c38819..f7e5932 100644
--- a/translate-all.h
+++ b/translate-all.h
@@ -19,13 +19,6 @@
 #ifndef TRANSLATE_ALL_H
 #define TRANSLATE_ALL_H
 
-/* Size of the L2 (and L3, etc) page tables.  */
-#define L2_BITS 10
-#define L2_SIZE (1 << L2_BITS)
-
-#define P_L2_LEVELS \
-    (((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / L2_BITS) + 1)
-
 /* translate-all.c */
 void tb_invalidate_phys_page_fast(tb_page_addr_t start, int len);
 void cpu_unlink_tb(CPUState *cpu);
diff --git a/exec.c b/exec.c
index b453713..9e2fc4b 100644
--- a/exec.c
+++ b/exec.c
@@ -88,7 +88,15 @@ struct PhysPageEntry {
     uint16_t ptr : 15;
 };
 
-typedef PhysPageEntry Node[L2_SIZE];
+/* Size of the L2 (and L3, etc) page tables.  */
+#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+
+#define P_L2_BITS 10
+#define P_L2_SIZE (1 << P_L2_BITS)
+
+#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
+
+typedef PhysPageEntry Node[P_L2_SIZE];
 
 struct AddressSpaceDispatch {
     /* This is a multi-level map on the physical address space.
@@ -155,7 +163,7 @@ static uint16_t phys_map_node_alloc(void)
     ret = next_map.nodes_nb++;
     assert(ret != PHYS_MAP_NODE_NIL);
     assert(ret != next_map.nodes_nb_alloc);
-    for (i = 0; i < L2_SIZE; ++i) {
+    for (i = 0; i < P_L2_SIZE; ++i) {
         next_map.nodes[ret][i].is_leaf = 0;
         next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
     }
@@ -168,13 +176,13 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
 {
     PhysPageEntry *p;
     int i;
-    hwaddr step = (hwaddr)1 << (level * L2_BITS);
+    hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 
     if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
         lp->ptr = phys_map_node_alloc();
         p = next_map.nodes[lp->ptr];
         if (level == 0) {
-            for (i = 0; i < L2_SIZE; i++) {
+            for (i = 0; i < P_L2_SIZE; i++) {
                 p[i].is_leaf = 1;
                 p[i].ptr = PHYS_SECTION_UNASSIGNED;
             }
@@ -182,9 +190,9 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
     } else {
         p = next_map.nodes[lp->ptr];
     }
-    lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
+    lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 
-    while (*nb && lp < &p[L2_SIZE]) {
+    while (*nb && lp < &p[P_L2_SIZE]) {
         if ((*index & (step - 1)) == 0 && *nb >= step) {
             lp->is_leaf = true;
             lp->ptr = leaf;
@@ -218,7 +226,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
             return &sections[PHYS_SECTION_UNASSIGNED];
         }
         p = nodes[lp.ptr];
-        lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
+        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
     }
     return &sections[lp.ptr];
 }
@@ -1741,7 +1749,12 @@ void address_space_destroy_dispatch(AddressSpace *as)
 static void memory_map_init(void)
 {
     system_memory = g_malloc(sizeof(*system_memory));
-    memory_region_init(system_memory, NULL, "system", INT64_MAX);
+
+    assert(ADDR_SPACE_BITS <= 64);
+
+    memory_region_init(system_memory, NULL, "system",
+                       ADDR_SPACE_BITS == 64 ?
+                       UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
     address_space_init(&address_space_memory, system_memory, "memory");
 
     system_io = g_malloc(sizeof(*system_io));
diff --git a/translate-all.c b/translate-all.c
index aeda54d..1c63d78 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -96,12 +96,16 @@ typedef struct PageDesc {
 # define L1_MAP_ADDR_SPACE_BITS  TARGET_VIRT_ADDR_SPACE_BITS
 #endif
 
+/* Size of the L2 (and L3, etc) page tables.  */
+#define V_L2_BITS 10
+#define V_L2_SIZE (1 << V_L2_BITS)
+
 /* The bits remaining after N lower levels of page tables.  */
 #define V_L1_BITS_REM \
-    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+    ((L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % V_L2_BITS)
 
 #if V_L1_BITS_REM < 4
-#define V_L1_BITS  (V_L1_BITS_REM + L2_BITS)
+#define V_L1_BITS  (V_L1_BITS_REM + V_L2_BITS)
 #else
 #define V_L1_BITS  V_L1_BITS_REM
 #endif
@@ -395,18 +399,18 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
     lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
 
     /* Level 2..N-1.  */
-    for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+    for (i = V_L1_SHIFT / V_L2_BITS - 1; i > 0; i--) {
         void **p = *lp;
 
         if (p == NULL) {
             if (!alloc) {
                 return NULL;
             }
-            ALLOC(p, sizeof(void *) * L2_SIZE);
+            ALLOC(p, sizeof(void *) * V_L2_SIZE);
             *lp = p;
         }
 
-        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
+        lp = p + ((index >> (i * V_L2_BITS)) & (V_L2_SIZE - 1));
     }
 
     pd = *lp;
@@ -414,13 +418,13 @@ static PageDesc *page_find_alloc(tb_page_addr_t index, int alloc)
         if (!alloc) {
             return NULL;
         }
-        ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
+        ALLOC(pd, sizeof(PageDesc) * V_L2_SIZE);
         *lp = pd;
     }
 
 #undef ALLOC
 
-    return pd + (index & (L2_SIZE - 1));
+    return pd + (index & (V_L2_SIZE - 1));
 }
 
 static inline PageDesc *page_find(tb_page_addr_t index)
@@ -655,14 +659,14 @@ static void page_flush_tb_1(int level, void **lp)
     if (level == 0) {
         PageDesc *pd = *lp;
 
-        for (i = 0; i < L2_SIZE; ++i) {
+        for (i = 0; i < V_L2_SIZE; ++i) {
             pd[i].first_tb = NULL;
             invalidate_page_bitmap(pd + i);
         }
     } else {
         void **pp = *lp;
 
-        for (i = 0; i < L2_SIZE; ++i) {
+        for (i = 0; i < V_L2_SIZE; ++i) {
             page_flush_tb_1(level - 1, pp + i);
         }
     }
@@ -673,7 +677,7 @@ static void page_flush_tb(void)
     int i;
 
     for (i = 0; i < V_L1_SIZE; i++) {
-        page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+        page_flush_tb_1(V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
     }
 }
 
@@ -1600,7 +1604,7 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
     if (level == 0) {
         PageDesc *pd = *lp;
 
-        for (i = 0; i < L2_SIZE; ++i) {
+        for (i = 0; i < V_L2_SIZE; ++i) {
             int prot = pd[i].flags;
 
             pa = base | (i << TARGET_PAGE_BITS);
@@ -1614,9 +1618,9 @@ static int walk_memory_regions_1(struct walk_memory_regions_data *data,
     } else {
         void **pp = *lp;
 
-        for (i = 0; i < L2_SIZE; ++i) {
+        for (i = 0; i < V_L2_SIZE; ++i) {
             pa = base | ((abi_ulong)i <<
-                (TARGET_PAGE_BITS + L2_BITS * level));
+                (TARGET_PAGE_BITS + V_L2_BITS * level));
             rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
             if (rc != 0) {
                 return rc;
@@ -1639,7 +1643,7 @@ int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
 
     for (i = 0; i < V_L1_SIZE; i++) {
         int rc = walk_memory_regions_1(&data, (abi_ulong)i << V_L1_SHIFT,
-                                       V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+                                       V_L1_SHIFT / V_L2_BITS - 1, l1_map + i);
 
         if (rc != 0) {
             return rc;
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [Qemu-devel] [PATCH RFC 2/5] exec: make address spaces 64-bit wide
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 1/5] split definitions for exec.c and translate-all.c radix trees Michael S. Tsirkin
@ 2013-11-11 16:40 ` Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 3/5] exec: replace leaf with skip Michael S. Tsirkin
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:40 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini, Luiz Capitulino

From: Paolo Bonzini <pbonzini@redhat.com>

As an alternative to commit 818f86b (exec: limit system memory
size, 2013-11-04) let's just make all address spaces 64-bit wide.
This eliminates problems with phys_page_find ignoring bits above
TARGET_PHYS_ADDR_SPACE_BITS and address_space_translate_internal
consequently messing up the computations.

In Luiz's reported crash, at startup gdb attempts to read from address
0xffffffffffffffe6 to 0xffffffffffffffff inclusive.  The region it gets
is the newly introduced master abort region, which is as big as the PCI
address space (see pci_bus_init).  Due to a typo that's only 2^63-1,
not 2^64.  But we get it anyway because phys_page_find ignores the upper
bits of the physical address.  In address_space_translate_internal then

    diff = int128_sub(section->mr->size, int128_make64(addr));
    *plen = int128_get64(int128_min(diff, int128_make64(*plen)));

diff becomes negative, and int128_get64 booms.

The size of the PCI address space region should be fixed anyway.

Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 exec.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/exec.c b/exec.c
index 9e2fc4b..d5ce3da 100644
--- a/exec.c
+++ b/exec.c
@@ -89,7 +89,7 @@ struct PhysPageEntry {
 };
 
 /* Size of the L2 (and L3, etc) page tables.  */
-#define ADDR_SPACE_BITS TARGET_PHYS_ADDR_SPACE_BITS
+#define ADDR_SPACE_BITS 64
 
 #define P_L2_BITS 10
 #define P_L2_SIZE (1 << P_L2_BITS)
@@ -1750,11 +1750,7 @@ static void memory_map_init(void)
 {
     system_memory = g_malloc(sizeof(*system_memory));
 
-    assert(ADDR_SPACE_BITS <= 64);
-
-    memory_region_init(system_memory, NULL, "system",
-                       ADDR_SPACE_BITS == 64 ?
-                       UINT64_MAX : (0x1ULL << ADDR_SPACE_BITS));
+    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
     address_space_init(&address_space_memory, system_memory, "memory");
 
     system_io = g_malloc(sizeof(*system_io));
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [Qemu-devel] [PATCH RFC 3/5] exec: replace leaf with skip
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 1/5] split definitions for exec.c and translate-all.c radix trees Michael S. Tsirkin
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 2/5] exec: make address spaces 64-bit wide Michael S. Tsirkin
@ 2013-11-11 16:40 ` Michael S. Tsirkin
  2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 4/5] exec: extend skip field to 3 bits Michael S. Tsirkin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:40 UTC (permalink / raw)
  To: qemu-devel

In preparation for dynamic page support, rename is_leaf
field to skip, telling us how many bits to skip to next level.
Set to 0 for leaf.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 exec.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/exec.c b/exec.c
index d5ce3da..7e512db 100644
--- a/exec.c
+++ b/exec.c
@@ -83,8 +83,9 @@ int use_icount;
 typedef struct PhysPageEntry PhysPageEntry;
 
 struct PhysPageEntry {
-    uint16_t is_leaf : 1;
-     /* index into phys_sections (is_leaf) or phys_map_nodes (!is_leaf) */
+    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
+    uint16_t skip : 1;
+     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
     uint16_t ptr : 15;
 };
 
@@ -164,7 +165,7 @@ static uint16_t phys_map_node_alloc(void)
     assert(ret != PHYS_MAP_NODE_NIL);
     assert(ret != next_map.nodes_nb_alloc);
     for (i = 0; i < P_L2_SIZE; ++i) {
-        next_map.nodes[ret][i].is_leaf = 0;
+        next_map.nodes[ret][i].skip = 1;
         next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
     }
     return ret;
@@ -178,12 +179,12 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
     int i;
     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 
-    if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
+    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
         lp->ptr = phys_map_node_alloc();
         p = next_map.nodes[lp->ptr];
         if (level == 0) {
             for (i = 0; i < P_L2_SIZE; i++) {
-                p[i].is_leaf = 1;
+                p[i].skip = 0;
                 p[i].ptr = PHYS_SECTION_UNASSIGNED;
             }
         }
@@ -194,7 +195,7 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
 
     while (*nb && lp < &p[P_L2_SIZE]) {
         if ((*index & (step - 1)) == 0 && *nb >= step) {
-            lp->is_leaf = true;
+            lp->skip = 0;
             lp->ptr = leaf;
             *index += step;
             *nb -= step;
@@ -221,7 +222,7 @@ static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
     PhysPageEntry *p;
     int i;
 
-    for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
+    for (i = P_L2_LEVELS - 1; i >= 0 && lp.skip; i -= lp.skip) {
         if (lp.ptr == PHYS_MAP_NODE_NIL) {
             return &sections[PHYS_SECTION_UNASSIGNED];
         }
@@ -1644,7 +1645,7 @@ static void mem_begin(MemoryListener *listener)
     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
     AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
 
-    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
+    d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
     d->as = as;
     as->next_dispatch = d;
 }
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [Qemu-devel] [PATCH RFC 4/5] exec: extend skip field to 3 bits
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
                   ` (2 preceding siblings ...)
  2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 3/5] exec: replace leaf with skip Michael S. Tsirkin
@ 2013-11-11 16:41 ` Michael S. Tsirkin
  2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression Michael S. Tsirkin
  2013-11-11 16:59 ` [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Paolo Bonzini
  5 siblings, 0 replies; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:41 UTC (permalink / raw)
  To: qemu-devel

13 bits left for pointer, which seems enough.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 exec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/exec.c b/exec.c
index 7e512db..39f76ee 100644
--- a/exec.c
+++ b/exec.c
@@ -84,9 +84,9 @@ typedef struct PhysPageEntry PhysPageEntry;
 
 struct PhysPageEntry {
     /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
-    uint16_t skip : 1;
+    uint16_t skip : 3;
      /* index into phys_sections (!skip) or phys_map_nodes (skip) */
-    uint16_t ptr : 15;
+    uint16_t ptr : 13;
 };
 
 /* Size of the L2 (and L3, etc) page tables.  */
@@ -134,7 +134,7 @@ typedef struct PhysPageMap {
 static PhysPageMap *prev_map;
 static PhysPageMap next_map;
 
-#define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
+#define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 3)
 
 static void io_mem_init(void);
 static void memory_map_init(void);
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
                   ` (3 preceding siblings ...)
  2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 4/5] exec: extend skip field to 3 bits Michael S. Tsirkin
@ 2013-11-11 16:41 ` Michael S. Tsirkin
  2013-11-11 17:58   ` Eric Blake
  2013-11-11 16:59 ` [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Paolo Bonzini
  5 siblings, 1 reply; 8+ messages in thread
From: Michael S. Tsirkin @ 2013-11-11 16:41 UTC (permalink / raw)
  To: qemu-devel

At the moment, memory radix tree is already variable width, but it can
only skip the low bits of address.

This is efficient if we have huge memory regions but inefficient if we
are only using a tiny portion of the address space.

After we have built up the map, it's a simple matter to detect
configurations where a single L2 entry is valid.

We can then speed up the lookup by skipping one or more levels.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 exec.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/exec.c b/exec.c
index 39f76ee..3ec6c2c 100644
--- a/exec.c
+++ b/exec.c
@@ -216,6 +216,75 @@ static void phys_page_set(AddressSpaceDispatch *d,
     phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 }
 
+/* Compact a non leaf page entry. Simply detect that the entry has a single child,
+ * and update our entry so we can skip it and go directly to the destination.
+ */
+static void phys_page_compact(PhysPageEntry *lp, Node *nodes, unsigned long *compacted)
+{
+    unsigned valid_ptr = P_L2_SIZE;
+    int valid = 0;
+    PhysPageEntry *p;
+    int i;
+
+    if (lp->ptr == PHYS_MAP_NODE_NIL || test_and_set_bit(lp->ptr, compacted)) {
+        return;
+    }
+
+    set_bit(lp->ptr, compacted);
+
+    p = nodes[lp->ptr];
+    for (i = 0; i < P_L2_SIZE; i++) {
+        if (p[i].ptr == PHYS_MAP_NODE_NIL) {
+            continue;
+        }
+
+        valid_ptr = i;
+        valid++;
+        if (p[i].skip) {
+            phys_page_compact(&p[i], nodes, compacted);
+        }
+    }
+
+    /* We can only compress if there's only one child. */
+    if (valid != 1) {
+        return;
+    }
+
+    assert(valid_ptr < P_L2_SIZE);
+
+    /* Don't compress if it won't fit in the # of bits we have. */
+    if (lp->skip + p[valid_ptr].skip >= (1 << 3)) {
+        return;
+    }
+
+    lp->ptr = p[valid_ptr].ptr;
+    if (!p[valid_ptr].skip) {
+        /* If our only child is a leaf, make this a leaf. */
+        /* By design, we should have made this node a leaf to begin with so we
+         * should never reach here.
+         * But since it's so simple to handle this, let's do it just in case we
+         * change this rule.
+         */
+        lp->skip = 0;
+    } else {
+        lp->skip += p[valid_ptr].skip;
+    }
+}
+
+static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
+{
+    DECLARE_BITMAP(compacted, nodes_nb);
+    int i;
+
+    return;
+
+    for (i = 0; i < next_map.nodes_nb; ++i) {
+        if (d->phys_map.skip) {
+            phys_page_compact(&d->phys_map, d->nodes, compacted);
+        }
+    }
+}
+
 static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
                                            Node *nodes, MemoryRegionSection *sections)
 {
@@ -1659,6 +1728,8 @@ static void mem_commit(MemoryListener *listener)
     next->nodes = next_map.nodes;
     next->sections = next_map.sections;
 
+    phys_page_compact_all(next, next_map.nodes_nb);
+
     as->dispatch = next;
     g_free(cur);
 }
-- 
MST

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide
  2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
                   ` (4 preceding siblings ...)
  2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression Michael S. Tsirkin
@ 2013-11-11 16:59 ` Paolo Bonzini
  5 siblings, 0 replies; 8+ messages in thread
From: Paolo Bonzini @ 2013-11-11 16:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel, Luiz Capitulino

Il 11/11/2013 17:40, Michael S. Tsirkin ha scritto:
> At the moment, exec ignores high bits in each address,
> for efficiency.
> This is incorrect: devices can do full 64 bit DMA, it's
> only the CPU that is limited by target address space.
> Using full 64 bit addresses was clocked at 12% performance
> hit on a microbenchmark.
> To solve, teach pagetables to skip bits at any level
> and not just the lowest level.
> 
> This should solve the performance problem (only one line
> of code changed on the data path).
> I'm still trying to figure out how to measure speed
> properly with TCG, sending this out for early feedback and flames.

I used this:

x86_64-softmmu/qemu-system-x86_64 -kernel
../../kvm-unit-tests/x86/vmexit.flat  -serial stdio -device
isa-debug-exit,iobase=0xf4

with only one test enabled (I tried both inl_from_qemu and
inl_from_pmtimer) and with roughly the same inlining of the "inb %dx,
%al" instruction that you suggested earlier on the mailing list.

Paolo

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression
  2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression Michael S. Tsirkin
@ 2013-11-11 17:58   ` Eric Blake
  0 siblings, 0 replies; 8+ messages in thread
From: Eric Blake @ 2013-11-11 17:58 UTC (permalink / raw)
  To: Michael S. Tsirkin, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 616 bytes --]

On 11/11/2013 09:41 AM, Michael S. Tsirkin wrote:
> At the moment, memory radix tree is already variable width, but it can
> only skip the low bits of address.
> 
> This is efficient if we have huge memory regions but inefficient if we
> are only using a tiny portion of the address space.
> 
> After we have built up the map, it's a simple matter to detect
> configurations where a single L2 entry is valid.
> 
> We can them speed up the lookup by skipping one or more levels.

s/them/then/

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 621 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2013-11-11 17:58 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-11 16:40 [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Michael S. Tsirkin
2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 1/5] split definitions for exec.c and translate-all.c radix trees Michael S. Tsirkin
2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 2/5] exec: make address spaces 64-bit wide Michael S. Tsirkin
2013-11-11 16:40 ` [Qemu-devel] [PATCH RFC 3/5] exec: replace leaf with skip Michael S. Tsirkin
2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 4/5] exec: extend skip field to 3 bits Michael S. Tsirkin
2013-11-11 16:41 ` [Qemu-devel] [PATCH RFC 5/5] exec: memory radix tree page level compression Michael S. Tsirkin
2013-11-11 17:58   ` Eric Blake
2013-11-11 16:59 ` [Qemu-devel] [PATCH RFC 0/5] making address spaces 64 bit wide Paolo Bonzini

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.