From: Hongyan Xia <hongyxia@amazon.com>
To: <xen-devel@lists.xenproject.org>
Cc: "Wei Liu" <wl@xen.org>,
"Andrew Cooper" <andrew.cooper3@citrix.com>,
jgrall@amazon.com, "Hongyan Xia" <hongyxia@amazon.com>,
"Jan Beulich" <jbeulich@suse.com>,
"Roger Pau Monné" <roger.pau@citrix.com>
Subject: [Xen-devel] [PATCH v2] x86/domain_page: implement pure per-vCPU mapping infrastructure
Date: Thu, 6 Feb 2020 18:58:23 +0000
Message-ID: <4058e92ce21627731c49b588a95809dc0affd83a.1581015491.git.hongyxia@amazon.com>

Rewrite the mapcache to be purely per-vCPU instead of partially
per-vCPU and partially per-domain.

The existing mapcache implementation keeps per-vCPU hash tables for
caching, but also creates a per-domain region which is shared by all
vCPUs and protected by a per-domain lock. When the vCPU count of a
domain is large, the per-domain lock contention is high. Also, several
data structures are shared among all vCPUs, potentially creating
serious cache ping-pong effects. Performance degradation can be seen
on domains with >16 vCPUs.

This patch is needed to address these issues when we start relying on
the mapcache (e.g., when we do not have a direct map). It partitions
the per-domain region into per-vCPU regions so that no mapcache state
is shared among vCPUs. As a result, no locking is required and a much
simpler maphash design can be used. The performance improvement after
this patch is quite noticeable.
Benchmarks
(baseline uses the direct map instead of the mapcache in
map_domain_page; "old" is the existing mapcache and "new" is after
this patch):

Geekbench on a 32-vCPU guest:

    performance drop      old       new
    single core           0.04%     0.18%
    multi core            2.43%     0.08%

fio on a 32-vCPU guest, LVM atop 8*SSD:

    performance drop      old       new
                          3.05%     0.28%
There should be room for further optimisations, but this already
improves over the old mapcache by a lot.

In the new cache design, maphash entries themselves also occupy inuse
slots. The existing configuration of 16 slots per vCPU is no longer
sufficient to hold both the maphash and a nested page table walk.
Fortunately, the per-domain inuse and garbage bit vectors are removed
from the PERDOMAIN region, giving us enough room to expand the
available mapping slots, as the arithmetic below shows.
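
As a sanity check on the slot budget (assuming the in-tree values
MAPHASH_ENTRIES == 8 and CONFIG_PAGING_LEVELS == 4; worth verifying
against the headers), the BUILD_BUG_ON added in mapcache_vcpu_init()
works out to:

    MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES     = 32 - 8 = 24
    CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS = 4 * 4  = 16

so 24 transient slots remain per vCPU, enough for a full nested page
table walk with room to spare.
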
Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
---
Changed in v2:
- reword the commit message.
- code clean-up and style fixes.
- avoid initialising the mapcache in NDEBUG builds.
- move freeing of the maphash into pv_vcpu_destroy, because for now
  only PV domains have a mapcache.
- use is_idle_vcpu and is_hvm_vcpu in map_domain_page to filter out
  idle and HVM vCPUs, leaving only real PV domains on the mapcache
  path.
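
For completeness, here is the unmap side of the model sketched in the
commit message (again an illustration; the real unmap_domain_page()
reads the MFN back from the L1 PTE rather than taking it as a
parameter):

    /* Drop one reference on a previously mapped global slot. */
    static void model_unmap(struct vcache_model *vc, unsigned int vcpu_id,
                            unsigned int glb_idx, unsigned long mfn)
    {
        unsigned int idx = glb_idx - vcpu_id * V_ENTRIES;

        vc->refcnt[idx]--;
        /* Keep the slot claimed while the maphash still points at it;
         * otherwise a zero refcount frees it for reuse. */
        if ( vc->hash_mfn[HASHFN(mfn)] != mfn && !vc->refcnt[idx] )
            vc->inuse &= ~(1UL << idx);
    }
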
---
xen/arch/x86/domain.c | 2 -
xen/arch/x86/domain_page.c | 233 ++++++++++++-----------------------
xen/arch/x86/pv/domain.c | 1 +
xen/include/asm-x86/config.h | 2 +-
xen/include/asm-x86/domain.h | 30 +----
5 files changed, 84 insertions(+), 184 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f53ae5ff86..5622a26b5c 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -633,8 +633,6 @@ int arch_domain_create(struct domain *d,
}
else if ( is_pv_domain(d) )
{
- mapcache_domain_init(d);
-
if ( (rc = pv_domain_initialise(d)) != 0 )
goto fail;
}
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index dd32712d2f..bec5bd09ab 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -69,12 +69,11 @@ void __init mapcache_override_current(struct vcpu *v)
void *map_domain_page(mfn_t mfn)
{
- unsigned long flags;
- unsigned int idx, i;
+ unsigned long flags, *phashmfn;
+ unsigned int idx, glb_idx, *phashidx;
struct vcpu *v;
- struct mapcache_domain *dcache;
struct mapcache_vcpu *vcache;
- struct vcpu_maphash_entry *hashent;
+ void *ret;
#ifdef NDEBUG
if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
@@ -82,104 +81,60 @@ void *map_domain_page(mfn_t mfn)
#endif
v = mapcache_current_vcpu();
- if ( !v || !is_pv_vcpu(v) )
+ if ( !v || is_hvm_vcpu(v) || is_idle_vcpu(v) || !v->arch.pv.mapcache )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv.mapcache;
- vcache = &v->arch.pv.mapcache;
- if ( !dcache->inuse )
- return mfn_to_virt(mfn_x(mfn));
+ vcache = v->arch.pv.mapcache;
+ phashmfn = &vcache->hash_mfn[MAPHASH_HASHFN(mfn_x(mfn))];
+ phashidx = &vcache->hash_idx[MAPHASH_HASHFN(mfn_x(mfn))];
perfc_incr(map_domain_page_count);
local_irq_save(flags);
- hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
- if ( hashent->mfn == mfn_x(mfn) )
+ if ( *phashmfn != mfn_x(mfn) )
{
- idx = hashent->idx;
- ASSERT(idx < dcache->entries);
- hashent->refcnt++;
- ASSERT(hashent->refcnt);
- ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
- goto out;
- }
+ idx = find_first_zero_bit(vcache->inuse, MAPCACHE_VCPU_ENTRIES);
+ BUG_ON(idx >= MAPCACHE_VCPU_ENTRIES);
- spin_lock(&dcache->lock);
+ ASSERT(vcache->refcnt[idx] == 0);
+ __set_bit(idx, vcache->inuse);
- /* Has some other CPU caused a wrap? We must flush if so. */
- if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
- {
- vcache->shadow_epoch = dcache->epoch;
- if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
- {
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- }
- }
+ glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+ l1e_write(&MAPCACHE_L1ENT(glb_idx),
+ l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
+ ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
+ flush_tlb_one_local(ret);
+
+ /* Evict the entry from maphash. Clear inuse if its refcnt is 0. */
+ if ( *phashidx != MAPHASHENT_NOTINUSE && !vcache->refcnt[*phashidx] )
+ __clear_bit(*phashidx, vcache->inuse);
- idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
- if ( unlikely(idx >= dcache->entries) )
+ /* Add the new slot into the maphash. */
+ *phashmfn = mfn_x(mfn);
+ *phashidx = idx;
+ }
+ else /* We hit in the maphash, just return the entry. */
{
- unsigned long accum = 0, prev = 0;
-
- /* /First/, clean the garbage map and update the inuse list. */
- for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
- {
- accum |= prev;
- dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
- prev = ~dcache->inuse[i];
- }
-
- if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
- idx = find_first_zero_bit(dcache->inuse, dcache->entries);
- else
- {
- /* Replace a hash entry instead. */
- i = MAPHASH_HASHFN(mfn_x(mfn));
- do {
- hashent = &vcache->hash[i];
- if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
- {
- idx = hashent->idx;
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- hashent->idx = MAPHASHENT_NOTINUSE;
- hashent->mfn = ~0UL;
- break;
- }
- if ( ++i == MAPHASH_ENTRIES )
- i = 0;
- } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
- }
- BUG_ON(idx >= dcache->entries);
-
- /* /Second/, flush TLBs. */
- perfc_incr(domain_page_tlb_flush);
- flush_tlb_local();
- vcache->shadow_epoch = ++dcache->epoch;
- dcache->tlbflush_timestamp = tlbflush_current_time();
+ idx = *phashidx;
+ glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+ ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
}
- set_bit(idx, dcache->inuse);
- dcache->cursor = idx + 1;
+ vcache->refcnt[idx]++;
+ ASSERT(vcache->refcnt[idx]);
+ ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(glb_idx)) == mfn_x(mfn));
- spin_unlock(&dcache->lock);
-
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
-
- out:
local_irq_restore(flags);
- return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+ return ret;
}
void unmap_domain_page(const void *ptr)
{
- unsigned int idx;
+ unsigned int idx, glb_idx;
struct vcpu *v;
- struct mapcache_domain *dcache;
- unsigned long va = (unsigned long)ptr, mfn, flags;
- struct vcpu_maphash_entry *hashent;
+ struct mapcache_vcpu *vcache;
+ unsigned long va = (unsigned long)ptr, mfn, hashmfn, flags;
if ( va >= DIRECTMAP_VIRT_START )
return;
@@ -189,114 +144,78 @@ void unmap_domain_page(const void *ptr)
v = mapcache_current_vcpu();
ASSERT(v && is_pv_vcpu(v));
- dcache = &v->domain->arch.pv.mapcache;
- ASSERT(dcache->inuse);
+ vcache = v->arch.pv.mapcache;
+ ASSERT(vcache);
- idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
- mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
- hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
+ glb_idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
+ idx = glb_idx - v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
local_irq_save(flags);
- if ( hashent->idx == idx )
- {
- ASSERT(hashent->mfn == mfn);
- ASSERT(hashent->refcnt);
- hashent->refcnt--;
- }
- else if ( !hashent->refcnt )
- {
- if ( hashent->idx != MAPHASHENT_NOTINUSE )
- {
- /* /First/, zap the PTE. */
- ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
- hashent->mfn);
- l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(hashent->idx, dcache->garbage);
- }
-
- /* Add newly-freed mapping to the maphash. */
- hashent->mfn = mfn;
- hashent->idx = idx;
- }
- else
- {
- /* /First/, zap the PTE. */
- l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
- /* /Second/, mark as garbage. */
- set_bit(idx, dcache->garbage);
- }
+ mfn = l1e_get_pfn(MAPCACHE_L1ENT(glb_idx));
+ hashmfn = vcache->hash_mfn[MAPHASH_HASHFN(mfn)];
+
+ vcache->refcnt[idx]--;
+ /* If refcnt drops to 0, we only clear inuse when it's not in the maphash. */
+ if ( hashmfn != mfn && !vcache->refcnt[idx] )
+ __clear_bit(idx, vcache->inuse);
local_irq_restore(flags);
}
-int mapcache_domain_init(struct domain *d)
+int mapcache_vcpu_init(struct vcpu *v)
{
+ struct domain *d = v->domain;
struct mapcache_domain *dcache = &d->arch.pv.mapcache;
- unsigned int bitmap_pages;
-
- ASSERT(is_pv_domain(d));
+ unsigned long i;
+ unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
#ifdef NDEBUG
if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
return 0;
#endif
- BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
- MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
- bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
- dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
- dcache->garbage = dcache->inuse +
- (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
- spin_lock_init(&dcache->lock);
-
- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- 2 * bitmap_pages + 1,
- NIL(l1_pgentry_t *), NULL);
-}
-
-int mapcache_vcpu_init(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct mapcache_domain *dcache = &d->arch.pv.mapcache;
- unsigned long i;
- unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
- unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
-
- if ( !is_pv_vcpu(v) || !dcache->inuse )
+ if ( is_idle_vcpu(v) || is_hvm_vcpu(v) )
return 0;
+ BUILD_BUG_ON(MAPCACHE_VIRT_END > ARG_XLAT_VIRT_START);
+
if ( ents > dcache->entries )
{
+ int rc;
+
+ ASSERT(ents * PAGE_SIZE <= (PERDOMAIN_SLOT_MBYTES << 20));
+
/* Populate page tables. */
- int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
+ rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
NIL(l1_pgentry_t *), NULL);
- /* Populate bit maps. */
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- nr, NULL, NIL(struct page_info *));
- if ( !rc )
- rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
- nr, NULL, NIL(struct page_info *));
-
if ( rc )
return rc;
dcache->entries = ents;
}
- /* Mark all maphash entries as not in use. */
BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
+ /* MAPHASH_ENTRIES has to be power-of-two to make hashing work. */
+ BUILD_BUG_ON(MAPHASH_ENTRIES & (MAPHASH_ENTRIES - 1));
+ /*
+ * Since entries in the maphash also occupy inuse slots, we have to make
+ * sure MAPCACHE_VCPU_ENTRIES is large enough to accommodate both the
+ * maphash and a nested page table walk.
+ */
+ BUILD_BUG_ON(MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES <
+ CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS);
+
+ v->arch.pv.mapcache = xzalloc(struct mapcache_vcpu);
+ if ( !v->arch.pv.mapcache )
+ return -ENOMEM;
+
+ /* Mark all maphash entries as not in use. */
for ( i = 0; i < MAPHASH_ENTRIES; i++ )
{
- struct vcpu_maphash_entry *hashent = &v->arch.pv.mapcache.hash[i];
-
- hashent->mfn = ~0UL; /* never valid to map */
- hashent->idx = MAPHASHENT_NOTINUSE;
+ v->arch.pv.mapcache->hash_mfn[i] = ~0UL;
+ v->arch.pv.mapcache->hash_idx[i] = MAPHASHENT_NOTINUSE;
}
return 0;
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index c3473b9a47..be819ddfac 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -234,6 +234,7 @@ void pv_vcpu_destroy(struct vcpu *v)
pv_destroy_gdt_ldt_l1tab(v);
XFREE(v->arch.pv.trap_ctxt);
+ XFREE(v->arch.pv.mapcache);
}
int pv_vcpu_initialise(struct vcpu *v)
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index a34053c4c0..ef48190abf 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -296,7 +296,7 @@ extern unsigned long xen_phys_start;
(GDT_VIRT_START(v) + (64*1024))
/* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+#define MAPCACHE_VCPU_ENTRIES 32
#define MAPCACHE_ENTRIES (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
#define MAPCACHE_VIRT_START PERDOMAIN_VIRT_SLOT(1)
#define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + \
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a3ae5d9a20..367bba7110 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -40,35 +40,17 @@ struct trap_bounce {
#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
#define MAPHASHENT_NOTINUSE ((u32)~0U)
struct mapcache_vcpu {
- /* Shadow of mapcache_domain.epoch. */
- unsigned int shadow_epoch;
-
- /* Lock-free per-VCPU hash of recently-used mappings. */
- struct vcpu_maphash_entry {
- unsigned long mfn;
- uint32_t idx;
- uint32_t refcnt;
- } hash[MAPHASH_ENTRIES];
+ unsigned long hash_mfn[MAPHASH_ENTRIES];
+ uint32_t hash_idx[MAPHASH_ENTRIES];
+
+ uint8_t refcnt[MAPCACHE_VCPU_ENTRIES];
+ unsigned long inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)];
};
struct mapcache_domain {
- /* The number of array entries, and a cursor into the array. */
unsigned int entries;
- unsigned int cursor;
-
- /* Protects map_domain_page(). */
- spinlock_t lock;
-
- /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
- unsigned int epoch;
- u32 tlbflush_timestamp;
-
- /* Which mappings are in use, and which are garbage to reap next epoch? */
- unsigned long *inuse;
- unsigned long *garbage;
};
-int mapcache_domain_init(struct domain *);
int mapcache_vcpu_init(struct vcpu *);
void mapcache_override_current(struct vcpu *);
@@ -473,7 +455,7 @@ struct arch_domain
struct pv_vcpu
{
/* map_domain_page() mapping cache. */
- struct mapcache_vcpu mapcache;
+ struct mapcache_vcpu *mapcache;
unsigned int vgc_flags;
--
2.17.1
Thread overview: 13+ messages
2020-02-06 18:58 Hongyan Xia [this message]
2020-02-21 11:50 ` [Xen-devel] [PATCH v2] x86/domain_page: implement pure per-vCPU mapping infrastructure Wei Liu
2020-02-21 12:52 ` Xia, Hongyan
2020-02-21 13:02 ` Andrew Cooper
2020-02-21 14:40 ` Xia, Hongyan
2020-02-21 13:31 ` Jan Beulich
2020-02-21 14:36 ` Wei Liu
2020-02-21 14:55 ` Jan Beulich
2020-02-21 14:58 ` Wei Liu
2020-02-21 15:08 ` Jan Beulich
2020-02-21 14:52 ` Xia, Hongyan
2020-02-21 14:59 ` Jan Beulich
2020-02-21 14:39 ` Wei Liu