* [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
@ 2020-02-03 18:36 Hongyan Xia
  2020-02-04 12:08 ` Wei Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Hongyan Xia @ 2020-02-03 18:36 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, jgrall, Wei Liu, Roger Pau Monné

Rewrite the mapcache to be purely per-vCPU instead of partly per-vCPU
and partly per-domain.

This patch is needed to address performance issues when we start relying
on the mapcache, e.g., when we do not have a direct map. Currently, the
per-domain lock on the mapcache is a bottleneck on multi-core systems,
causing performance degradation and even functional regressions. This
patch makes the mapping structure per-vCPU and completely lockless.

Functional regression:

When a domain is run on more than 64 cores, FreeBSD 10 panics frequently
due to occasional simultaneous set_singleshot_timer hypercalls from too
many cores. Some cores will be blocked waiting on map_domain_page,
eventually failing to set a timer that is still in the future. FreeBSD
cannot handle this and panics. This was fixed in later FreeBSD releases
by handling -ETIME, but the degradation in timer performance is still a
big issue.

Performance regression:

Many benchmarks see a performance drop with a large core count. I have
run Geekbench on a 32-vCPU guest.

perf drop     old        new
single       0.04%      0.18%
multi        2.43%      0.08%

Removing the per-domain lock in the mapcache brings the multi-core
performance almost identical to using the direct map for mappings.

There should be room for further optimisations, but this already
improves over the old mapcache by a lot.

Note that entries in the maphash also occupy inuse slots. With 16 slots
per vCPU and a maphash capacity of 8, only another 8 would be available,
which is not enough for nested page table walks, so the number of slots
in config.h is increased.
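
(For illustration, assuming MAPHASH_ENTRIES is 8 and CONFIG_PAGING_LEVELS
is 4 as on current x86 builds, the slot budget works out as:

    maphash entries that may pin inuse slots: MAPHASH_ENTRIES        =  8
    slots a nested page table walk may need:  CONFIG_PAGING_LEVELS^2 = 16
    free slots with the old 16-entry value:   16 - 8                 =  8
    free slots with the new 32-entry value:   32 - 8                 = 24

so 32 slots per vCPU leave enough headroom, while 16 did not.)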

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
---
 xen/arch/x86/domain.c        |   5 +-
 xen/arch/x86/domain_page.c   | 229 +++++++++++------------------------
 xen/include/asm-x86/config.h |   2 +-
 xen/include/asm-x86/domain.h |  30 +----
 4 files changed, 80 insertions(+), 186 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f53ae5ff86..a278aa4678 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -445,6 +445,9 @@ void arch_vcpu_destroy(struct vcpu *v)
     xfree(v->arch.msrs);
     v->arch.msrs = NULL;
 
+    xfree(v->arch.pv.mapcache);
+    v->arch.pv.mapcache = NULL;
+
     if ( is_hvm_vcpu(v) )
         hvm_vcpu_destroy(v);
     else
@@ -633,8 +636,6 @@ int arch_domain_create(struct domain *d,
     }
     else if ( is_pv_domain(d) )
     {
-        mapcache_domain_init(d);
-
         if ( (rc = pv_domain_initialise(d)) != 0 )
             goto fail;
     }
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index dd32712d2f..52971d2ecc 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -69,12 +69,11 @@ void __init mapcache_override_current(struct vcpu *v)
 
 void *map_domain_page(mfn_t mfn)
 {
-    unsigned long flags;
-    unsigned int idx, i;
+    unsigned long flags, *phashmfn;
+    unsigned int idx, glb_idx, *phashidx, ohashidx;
     struct vcpu *v;
-    struct mapcache_domain *dcache;
     struct mapcache_vcpu *vcache;
-    struct vcpu_maphash_entry *hashent;
+    void *ret;
 
 #ifdef NDEBUG
     if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
@@ -82,104 +81,59 @@ void *map_domain_page(mfn_t mfn)
 #endif
 
     v = mapcache_current_vcpu();
-    if ( !v || !is_pv_vcpu(v) )
+    if ( !v || !is_pv_vcpu(v) || !v->arch.pv.mapcache )
         return mfn_to_virt(mfn_x(mfn));
 
-    dcache = &v->domain->arch.pv.mapcache;
-    vcache = &v->arch.pv.mapcache;
-    if ( !dcache->inuse )
-        return mfn_to_virt(mfn_x(mfn));
+    vcache = v->arch.pv.mapcache;
+    phashmfn = &vcache->hash_mfn[MAPHASH_HASHFN(mfn_x(mfn))];
+    phashidx = &vcache->hash_idx[MAPHASH_HASHFN(mfn_x(mfn))];
 
     perfc_incr(map_domain_page_count);
 
     local_irq_save(flags);
 
-    hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
-    if ( hashent->mfn == mfn_x(mfn) )
+    ohashidx = *phashidx;
+    if ( *phashmfn != mfn_x(mfn) )
     {
-        idx = hashent->idx;
-        ASSERT(idx < dcache->entries);
-        hashent->refcnt++;
-        ASSERT(hashent->refcnt);
-        ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
-        goto out;
-    }
+        idx = find_first_zero_bit(vcache->inuse, MAPCACHE_VCPU_ENTRIES);
+        BUG_ON(idx >= MAPCACHE_VCPU_ENTRIES);
 
-    spin_lock(&dcache->lock);
+        ASSERT(vcache->refcnt[idx] == 0);
+        __set_bit(idx, vcache->inuse);
 
-    /* Has some other CPU caused a wrap? We must flush if so. */
-    if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
-    {
-        vcache->shadow_epoch = dcache->epoch;
-        if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
-        {
-            perfc_incr(domain_page_tlb_flush);
-            flush_tlb_local();
-        }
-    }
+        glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+        l1e_write(&MAPCACHE_L1ENT(glb_idx),
+                  l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
+        ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
+        flush_tlb_one_local(ret);
+
+        if ( ohashidx != MAPHASHENT_NOTINUSE && !vcache->refcnt[ohashidx] )
+            __clear_bit(ohashidx, vcache->inuse);
 
-    idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
-    if ( unlikely(idx >= dcache->entries) )
+        *phashmfn = mfn_x(mfn);
+        *phashidx = idx;
+    }
+    else
     {
-        unsigned long accum = 0, prev = 0;
-
-        /* /First/, clean the garbage map and update the inuse list. */
-        for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
-        {
-            accum |= prev;
-            dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
-            prev = ~dcache->inuse[i];
-        }
-
-        if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
-            idx = find_first_zero_bit(dcache->inuse, dcache->entries);
-        else
-        {
-            /* Replace a hash entry instead. */
-            i = MAPHASH_HASHFN(mfn_x(mfn));
-            do {
-                hashent = &vcache->hash[i];
-                if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
-                {
-                    idx = hashent->idx;
-                    ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
-                    l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-                    hashent->idx = MAPHASHENT_NOTINUSE;
-                    hashent->mfn = ~0UL;
-                    break;
-                }
-                if ( ++i == MAPHASH_ENTRIES )
-                    i = 0;
-            } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
-        }
-        BUG_ON(idx >= dcache->entries);
-
-        /* /Second/, flush TLBs. */
-        perfc_incr(domain_page_tlb_flush);
-        flush_tlb_local();
-        vcache->shadow_epoch = ++dcache->epoch;
-        dcache->tlbflush_timestamp = tlbflush_current_time();
+        idx = ohashidx;
+        glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
+        ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);
     }
 
-    set_bit(idx, dcache->inuse);
-    dcache->cursor = idx + 1;
-
-    spin_unlock(&dcache->lock);
-
-    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
+    vcache->refcnt[idx]++;
+    ASSERT(vcache->refcnt[idx]);
+    ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(glb_idx)) == mfn_x(mfn));
 
- out:
     local_irq_restore(flags);
-    return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+    return ret;
 }
 
 void unmap_domain_page(const void *ptr)
 {
-    unsigned int idx;
+    unsigned int idx, glb_idx;
     struct vcpu *v;
-    struct mapcache_domain *dcache;
-    unsigned long va = (unsigned long)ptr, mfn, flags;
-    struct vcpu_maphash_entry *hashent;
+    struct mapcache_vcpu *vcache;
+    unsigned long va = (unsigned long)ptr, mfn, hashmfn, flags;
 
     if ( va >= DIRECTMAP_VIRT_START )
         return;
@@ -189,73 +143,21 @@ void unmap_domain_page(const void *ptr)
     v = mapcache_current_vcpu();
     ASSERT(v && is_pv_vcpu(v));
 
-    dcache = &v->domain->arch.pv.mapcache;
-    ASSERT(dcache->inuse);
-
-    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
-    mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
-    hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
+    vcache = v->arch.pv.mapcache;
+    ASSERT(vcache);
 
+    glb_idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
+    idx = glb_idx - v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
     local_irq_save(flags);
 
-    if ( hashent->idx == idx )
-    {
-        ASSERT(hashent->mfn == mfn);
-        ASSERT(hashent->refcnt);
-        hashent->refcnt--;
-    }
-    else if ( !hashent->refcnt )
-    {
-        if ( hashent->idx != MAPHASHENT_NOTINUSE )
-        {
-            /* /First/, zap the PTE. */
-            ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
-                   hashent->mfn);
-            l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
-            /* /Second/, mark as garbage. */
-            set_bit(hashent->idx, dcache->garbage);
-        }
-
-        /* Add newly-freed mapping to the maphash. */
-        hashent->mfn = mfn;
-        hashent->idx = idx;
-    }
-    else
-    {
-        /* /First/, zap the PTE. */
-        l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-        /* /Second/, mark as garbage. */
-        set_bit(idx, dcache->garbage);
-    }
-
-    local_irq_restore(flags);
-}
-
-int mapcache_domain_init(struct domain *d)
-{
-    struct mapcache_domain *dcache = &d->arch.pv.mapcache;
-    unsigned int bitmap_pages;
+    mfn = l1e_get_pfn(MAPCACHE_L1ENT(glb_idx));
+    hashmfn = vcache->hash_mfn[MAPHASH_HASHFN(mfn)];
 
-    ASSERT(is_pv_domain(d));
+    vcache->refcnt[idx]--;
+    if ( hashmfn != mfn && !vcache->refcnt[idx] )
+        __clear_bit(idx, vcache->inuse);
 
-#ifdef NDEBUG
-    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
-        return 0;
-#endif
-
-    BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
-                 MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
-    bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
-    dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
-    dcache->garbage = dcache->inuse +
-                      (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
-    spin_lock_init(&dcache->lock);
-
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
+    local_irq_restore(flags);
 }
 
 int mapcache_vcpu_init(struct vcpu *v)
@@ -264,39 +166,48 @@ int mapcache_vcpu_init(struct vcpu *v)
     struct mapcache_domain *dcache = &d->arch.pv.mapcache;
     unsigned long i;
     unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
-    unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
 
-    if ( !is_pv_vcpu(v) || !dcache->inuse )
+    if ( !is_pv_vcpu(v) )
         return 0;
 
+    BUILD_BUG_ON(MAPCACHE_VIRT_END > ARG_XLAT_VIRT_START);
+
     if ( ents > dcache->entries )
     {
+        int rc;
+
+        ASSERT(ents * PAGE_SIZE <= (PERDOMAIN_SLOT_MBYTES << 20));
+
         /* Populate page tables. */
-        int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
+        rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
                                           NIL(l1_pgentry_t *), NULL);
 
-        /* Populate bit maps. */
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                          nr, NULL, NIL(struct page_info *));
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
-                                          nr, NULL, NIL(struct page_info *));
-
         if ( rc )
             return rc;
 
         dcache->entries = ents;
     }
 
-    /* Mark all maphash entries as not in use. */
     BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
+    /* MAPHASH_ENTRIES has to be power-of-two to make hashing work. */
+    BUILD_BUG_ON(MAPHASH_ENTRIES & (MAPHASH_ENTRIES - 1));
+    /*
+     * Since entries in the maphash also occupy inuse slots, we have to make
+     * sure MAPCACHE_VCPU_ENTRIES is large enough to accommodate both the
+     * maphash and a nested page table walk.
+     */
+    BUILD_BUG_ON(MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES >=
+                 CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS);
+
+    v->arch.pv.mapcache = xzalloc(struct mapcache_vcpu);
+    if ( !v->arch.pv.mapcache )
+        return -ENOMEM;
+
+    /* Mark all maphash entries as not in use. */
     for ( i = 0; i < MAPHASH_ENTRIES; i++ )
     {
-        struct vcpu_maphash_entry *hashent = &v->arch.pv.mapcache.hash[i];
-
-        hashent->mfn = ~0UL; /* never valid to map */
-        hashent->idx = MAPHASHENT_NOTINUSE;
+        v->arch.pv.mapcache->hash_mfn[i] = ~0UL;
+        v->arch.pv.mapcache->hash_idx[i] = MAPHASHENT_NOTINUSE;
     }
 
     return 0;
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index d0cfbb70a8..4b2217151b 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -296,7 +296,7 @@ extern unsigned long xen_phys_start;
     (GDT_VIRT_START(v) + (64*1024))
 
 /* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES    (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+#define MAPCACHE_VCPU_ENTRIES    32
 #define MAPCACHE_ENTRIES         (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
 #define MAPCACHE_VIRT_START      PERDOMAIN_VIRT_SLOT(1)
 #define MAPCACHE_VIRT_END        (MAPCACHE_VIRT_START + \
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a3ae5d9a20..367bba7110 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -40,35 +40,17 @@ struct trap_bounce {
 #define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
 #define MAPHASHENT_NOTINUSE ((u32)~0U)
 struct mapcache_vcpu {
-    /* Shadow of mapcache_domain.epoch. */
-    unsigned int shadow_epoch;
-
-    /* Lock-free per-VCPU hash of recently-used mappings. */
-    struct vcpu_maphash_entry {
-        unsigned long mfn;
-        uint32_t      idx;
-        uint32_t      refcnt;
-    } hash[MAPHASH_ENTRIES];
+    unsigned long hash_mfn[MAPHASH_ENTRIES];
+    uint32_t hash_idx[MAPHASH_ENTRIES];
+
+    uint8_t refcnt[MAPCACHE_VCPU_ENTRIES];
+    unsigned long inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)];
 };
 
 struct mapcache_domain {
-    /* The number of array entries, and a cursor into the array. */
     unsigned int entries;
-    unsigned int cursor;
-
-    /* Protects map_domain_page(). */
-    spinlock_t lock;
-
-    /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
-    unsigned int epoch;
-    u32 tlbflush_timestamp;
-
-    /* Which mappings are in use, and which are garbage to reap next epoch? */
-    unsigned long *inuse;
-    unsigned long *garbage;
 };
 
-int mapcache_domain_init(struct domain *);
 int mapcache_vcpu_init(struct vcpu *);
 void mapcache_override_current(struct vcpu *);
 
@@ -473,7 +455,7 @@ struct arch_domain
 struct pv_vcpu
 {
     /* map_domain_page() mapping cache. */
-    struct mapcache_vcpu mapcache;
+    struct mapcache_vcpu *mapcache;
 
     unsigned int vgc_flags;
 
-- 
2.17.1



* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-03 18:36 [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure Hongyan Xia
@ 2020-02-04 12:08 ` Wei Liu
  2020-02-04 16:02   ` Xia, Hongyan
  2020-02-04 14:00 ` Wei Liu
  2020-02-04 14:17 ` Andrew Cooper
  2 siblings, 1 reply; 8+ messages in thread
From: Wei Liu @ 2020-02-04 12:08 UTC (permalink / raw)
  To: Hongyan Xia
  Cc: Wei Liu, Andrew Cooper, jgrall, Jan Beulich, xen-devel,
	Roger Pau Monné

Thanks, I welcome the effort to make Xen more scalable. :-)

On Mon, Feb 03, 2020 at 06:36:53PM +0000, Hongyan Xia wrote:
> Rewrite the mapcache to be purely per-vCPU instead of partly per-vCPU
> and partly per-domain.
> 
> This patch is needed to address performance issues when we start relying
> on the mapcache, e.g., when we do not have a direct map. Currently, the
> per-domain lock on the mapcache is a bottleneck for multicore, causing
> performance degradation and even functional regressions. This patch
> makes the mapping structure per-vCPU and completely lockless.
> 

When I see "regression", I think of something that was working before
but not anymore. I don't think that's the case for the following two
things. 

I would just classify them as bug and/or improvement.

> Functional regression:
> 
> When a domain is run on more than 64 cores, FreeBSD 10 panicks frequently
> due to occasional simultaneous set_singleshot_timer hypercalls from too
> many cores. Some cores will be blocked waiting on map_domain_page,
> eventually failing to set a timer in the future. FreeBSD cannot handle
> this and panicks. This was fixed in later FreeBSD releases by handling
> -ETIME, but still the degradation in timer performance is a big issue.
> 
> Performance regression:
> 
> Many benchmarks see a performance drop when having a large core count.
> I have done a Geekbench on a 32-vCPU guest.
> 
> perf drop     old        new
> single       0.04%      0.18%
> multi        2.43%      0.08%
> 
> Removing the per-domain lock in the mapcache brings the multi-core
> performance almost identical to using the direct map for mappings.
> 
> There should be room for futher optimisations, but this already
> improves over the old mapcache by a lot.
> 
> Note that entries in the maphash will occupy inuse slots. With 16 slots
> per vCPU and a maphash capacity of 8, we only have another 8 available,
> which is not enough for nested page table walks. We need to increase the
> number of slots in config.h.
> 
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>


As far as I can tell all vcpus still share the same per-domain mapcache
region. The difference is that the region is now divided into a
subregion for each vcpu to use.
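
For reference, the subregion arithmetic the patch uses (restating the
patch's own code, nothing new):

    /* Each vCPU owns MAPCACHE_VCPU_ENTRIES consecutive slots of the
     * shared per-domain MAPCACHE area. */
    glb_idx = idx + v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
    ret = (void *)MAPCACHE_VIRT_START + pfn_to_paddr(glb_idx);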

> ---
>  xen/arch/x86/domain.c        |   5 +-
>  xen/arch/x86/domain_page.c   | 229 +++++++++++------------------------
>  xen/include/asm-x86/config.h |   2 +-
>  xen/include/asm-x86/domain.h |  30 +----
>  4 files changed, 80 insertions(+), 186 deletions(-)
> 
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index f53ae5ff86..a278aa4678 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -445,6 +445,9 @@ void arch_vcpu_destroy(struct vcpu *v)
>      xfree(v->arch.msrs);
>      v->arch.msrs = NULL;
>  
> +    xfree(v->arch.pv.mapcache);
> +    v->arch.pv.mapcache = NULL;
> +

Keep in mind that accessing the union {pv, hvm} before knowing the
exact variant is dangerous.

Because the mapcache is initialised for PV only, it should be freed for
PV only. I think you need to put it in pv_vcpu_destroy.

If you don't want to do that because of asymmetry with
mapcache_vcpu_init, you may want to invent mapcache_vcpu_destroy for
your purpose.
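
Something along these lines (a hypothetical sketch of that suggestion,
not code from this patch; the name and placement are illustrative only):

    void mapcache_vcpu_destroy(struct vcpu *v)
    {
        if ( !is_pv_vcpu(v) )
            return;

        xfree(v->arch.pv.mapcache);
        v->arch.pv.mapcache = NULL;
    }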

(I will need to pull this patch to a branch to see the final incarnation
before I review further)

Wei.


* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-03 18:36 [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure Hongyan Xia
  2020-02-04 12:08 ` Wei Liu
@ 2020-02-04 14:00 ` Wei Liu
  2020-02-04 14:47   ` Xia, Hongyan
  2020-02-04 14:17 ` Andrew Cooper
  2 siblings, 1 reply; 8+ messages in thread
From: Wei Liu @ 2020-02-04 14:00 UTC (permalink / raw)
  To: Hongyan Xia
  Cc: Wei Liu, Andrew Cooper, jgrall, Jan Beulich, xen-devel,
	Roger Pau Monné

On Mon, Feb 03, 2020 at 06:36:53PM +0000, Hongyan Xia wrote:
[...]
> diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
> index dd32712d2f..52971d2ecc 100644
> --- a/xen/arch/x86/domain_page.c
> +++ b/xen/arch/x86/domain_page.c
> @@ -69,12 +69,11 @@ void __init mapcache_override_current(struct vcpu *v)
>  
>  void *map_domain_page(mfn_t mfn)
>  {
> -    unsigned long flags;
> -    unsigned int idx, i;
> +    unsigned long flags, *phashmfn;
> +    unsigned int idx, glb_idx, *phashidx, ohashidx;
>      struct vcpu *v;
> -    struct mapcache_domain *dcache;
>      struct mapcache_vcpu *vcache;
> -    struct vcpu_maphash_entry *hashent;
> +    void *ret;
>  
>  #ifdef NDEBUG
>      if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> @@ -82,104 +81,59 @@ void *map_domain_page(mfn_t mfn)
>  #endif
>  
>      v = mapcache_current_vcpu();
> -    if ( !v || !is_pv_vcpu(v) )
> +    if ( !v || !is_pv_vcpu(v) || !v->arch.pv.mapcache )
>          return mfn_to_virt(mfn_x(mfn));
>  
> -    dcache = &v->domain->arch.pv.mapcache;
> -    vcache = &v->arch.pv.mapcache;
> -    if ( !dcache->inuse )
> -        return mfn_to_virt(mfn_x(mfn));
> +    vcache = v->arch.pv.mapcache;
> +    phashmfn = &vcache->hash_mfn[MAPHASH_HASHFN(mfn_x(mfn))];
> +    phashidx = &vcache->hash_idx[MAPHASH_HASHFN(mfn_x(mfn))];
>  
>      perfc_incr(map_domain_page_count);
>  
>      local_irq_save(flags);
>  
> -    hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
> -    if ( hashent->mfn == mfn_x(mfn) )
> +    ohashidx = *phashidx;

This is a bit redundant because you never rewrite *phashidx until the
last minute. I think ohashidx can be removed, but it's up to you.

> +    if ( *phashmfn != mfn_x(mfn) )
>      {
> -        idx = hashent->idx;
> -        ASSERT(idx < dcache->entries);
> -        hashent->refcnt++;
> -        ASSERT(hashent->refcnt);
> -        ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
> -        goto out;
> -    }
> +        idx = find_first_zero_bit(vcache->inuse, MAPCACHE_VCPU_ENTRIES);
> +        BUG_ON(idx >= MAPCACHE_VCPU_ENTRIES);
>  
> -    spin_lock(&dcache->lock);
> +        ASSERT(vcache->refcnt[idx] == 0);
> +        __set_bit(idx, vcache->inuse);
>  
> -    /* Has some other CPU caused a wrap? We must flush if so. */
[...]
>  void unmap_domain_page(const void *ptr)
>  {
> -    unsigned int idx;
> +    unsigned int idx, glb_idx;
>      struct vcpu *v;
> -    struct mapcache_domain *dcache;
> -    unsigned long va = (unsigned long)ptr, mfn, flags;
> -    struct vcpu_maphash_entry *hashent;
> +    struct mapcache_vcpu *vcache;
> +    unsigned long va = (unsigned long)ptr, mfn, hashmfn, flags;
>  
>      if ( va >= DIRECTMAP_VIRT_START )
>          return;
> @@ -189,73 +143,21 @@ void unmap_domain_page(const void *ptr)
>      v = mapcache_current_vcpu();
>      ASSERT(v && is_pv_vcpu(v));
>  
> -    dcache = &v->domain->arch.pv.mapcache;
> -    ASSERT(dcache->inuse);
> -
> -    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
> -    mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
> -    hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
> +    vcache = v->arch.pv.mapcache;
> +    ASSERT(vcache);
>  
> +    glb_idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
> +    idx = glb_idx - v->vcpu_id * MAPCACHE_VCPU_ENTRIES;

Add a blank line here please.

>      local_irq_save(flags);
>  
> -    if ( hashent->idx == idx )
> -    {
> -        ASSERT(hashent->mfn == mfn);
> -        ASSERT(hashent->refcnt);
> -        hashent->refcnt--;
> -    }
> -    else if ( !hashent->refcnt )
> -    {
> -        if ( hashent->idx != MAPHASHENT_NOTINUSE )
> -        {
> -            /* /First/, zap the PTE. */
> -            ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
> -                   hashent->mfn);
> -            l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
> -            /* /Second/, mark as garbage. */
> -            set_bit(hashent->idx, dcache->garbage);
> -        }
> -
> -        /* Add newly-freed mapping to the maphash. */
> -        hashent->mfn = mfn;
> -        hashent->idx = idx;
> -    }
> -    else
> -    {
> -        /* /First/, zap the PTE. */
> -        l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
> -        /* /Second/, mark as garbage. */
> -        set_bit(idx, dcache->garbage);
> -    }
> -
> -    local_irq_restore(flags);
> -}
> -
> -int mapcache_domain_init(struct domain *d)
> -{
> -    struct mapcache_domain *dcache = &d->arch.pv.mapcache;
> -    unsigned int bitmap_pages;
> +    mfn = l1e_get_pfn(MAPCACHE_L1ENT(glb_idx));
> +    hashmfn = vcache->hash_mfn[MAPHASH_HASHFN(mfn)];
>  
> -    ASSERT(is_pv_domain(d));
> +    vcache->refcnt[idx]--;
> +    if ( hashmfn != mfn && !vcache->refcnt[idx] )
> +        __clear_bit(idx, vcache->inuse);
>  
> -#ifdef NDEBUG
> -    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> -        return 0;
> -#endif

Would be great if some form or shape of this check can be added to
mapcache_vcpu_init such that you don't unnecessarily initialise data
structures that will never be used.

> -
> -    BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
> -                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
> -                 MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
> -    bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
> -    dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
> -    dcache->garbage = dcache->inuse +
> -                      (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
> -
> -    spin_lock_init(&dcache->lock);
> -
> -    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
> -                                    2 * bitmap_pages + 1,
> -                                    NIL(l1_pgentry_t *), NULL);
> +    local_irq_restore(flags);
>  }
>  
[...]
> diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
> index d0cfbb70a8..4b2217151b 100644
> --- a/xen/include/asm-x86/config.h
> +++ b/xen/include/asm-x86/config.h
> @@ -296,7 +296,7 @@ extern unsigned long xen_phys_start;
>      (GDT_VIRT_START(v) + (64*1024))
>  
>  /* map_domain_page() map cache. The second per-domain-mapping sub-area. */
> -#define MAPCACHE_VCPU_ENTRIES    (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
> +#define MAPCACHE_VCPU_ENTRIES    32
>  #define MAPCACHE_ENTRIES         (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
>  #define MAPCACHE_VIRT_START      PERDOMAIN_VIRT_SLOT(1)
>  #define MAPCACHE_VIRT_END        (MAPCACHE_VIRT_START + \
> diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
> index a3ae5d9a20..367bba7110 100644
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -40,35 +40,17 @@ struct trap_bounce {
>  #define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
>  #define MAPHASHENT_NOTINUSE ((u32)~0U)
>  struct mapcache_vcpu {
> -    /* Shadow of mapcache_domain.epoch. */
> -    unsigned int shadow_epoch;
> -
> -    /* Lock-free per-VCPU hash of recently-used mappings. */
> -    struct vcpu_maphash_entry {
> -        unsigned long mfn;
> -        uint32_t      idx;
> -        uint32_t      refcnt;
> -    } hash[MAPHASH_ENTRIES];
> +    unsigned long hash_mfn[MAPHASH_ENTRIES];
> +    uint32_t hash_idx[MAPHASH_ENTRIES];
> +
> +    uint8_t refcnt[MAPCACHE_VCPU_ENTRIES];

What's the reason for breaking vcpu_maphash_entry into three distinct
arrays?

Would it make the code shorter/cleaner if you make everything
MAPCACHE_VCPU_ENTRIES? A vcpu now only needs to manage its own subregion
after all.

> +    unsigned long inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)];
>  };
>  
>  struct mapcache_domain {
> -    /* The number of array entries, and a cursor into the array. */
>      unsigned int entries;
> -    unsigned int cursor;
> -
> -    /* Protects map_domain_page(). */
> -    spinlock_t lock;
> -
> -    /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
> -    unsigned int epoch;
> -    u32 tlbflush_timestamp;
> -
> -    /* Which mappings are in use, and which are garbage to reap next epoch? */
> -    unsigned long *inuse;
> -    unsigned long *garbage;
>  };
>  
> -int mapcache_domain_init(struct domain *);
>  int mapcache_vcpu_init(struct vcpu *);
>  void mapcache_override_current(struct vcpu *);
>  
> @@ -473,7 +455,7 @@ struct arch_domain
>  struct pv_vcpu
>  {
>      /* map_domain_page() mapping cache. */
> -    struct mapcache_vcpu mapcache;
> +    struct mapcache_vcpu *mapcache;

And why do you need to change this to a pointer? Is it now very large
and causing issue(s)?

Wei.

>  
>      unsigned int vgc_flags;
>  
> -- 
> 2.17.1
> 


* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-03 18:36 [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure Hongyan Xia
  2020-02-04 12:08 ` Wei Liu
  2020-02-04 14:00 ` Wei Liu
@ 2020-02-04 14:17 ` Andrew Cooper
  2020-02-04 15:15   ` Xia, Hongyan
  2 siblings, 1 reply; 8+ messages in thread
From: Andrew Cooper @ 2020-02-04 14:17 UTC (permalink / raw)
  To: Hongyan Xia, xen-devel; +Cc: jgrall, Wei Liu, Jan Beulich, Roger Pau Monné

On 03/02/2020 18:36, Hongyan Xia wrote:
> Rewrite the mapcache to be purely per-vCPU instead of partly per-vCPU
> and partly per-domain.
>
> This patch is needed to address performance issues when we start relying
> on the mapcache, e.g., when we do not have a direct map. Currently, the
> per-domain lock on the mapcache is a bottleneck for multicore, causing
> performance degradation and even functional regressions.

Do you mean that this patch causes a regression, or that removing the
directmap causes a regression?

The rest of the commit message is very confusing to follow.

>  This patch
> makes the mapping structure per-vCPU and completely lockless.
>
> Functional regression:
>
> When a domain is run on more than 64 cores, FreeBSD 10 panicks frequently
> due to occasional simultaneous set_singleshot_timer hypercalls from too
> many cores. Some cores will be blocked waiting on map_domain_page,
> eventually failing to set a timer in the future. FreeBSD cannot handle
> this and panicks. This was fixed in later FreeBSD releases by handling
> -ETIME, but still the degradation in timer performance is a big issue.
>
> Performance regression:
>
> Many benchmarks see a performance drop when having a large core count.
> I have done a Geekbench on a 32-vCPU guest.
>
> perf drop     old        new
> single       0.04%      0.18%
> multi        2.43%      0.08%
>
> Removing the per-domain lock in the mapcache brings the multi-core
> performance almost identical to using the direct map for mappings.
>
> There should be room for futher optimisations, but this already
> improves over the old mapcache by a lot.
>
> Note that entries in the maphash will occupy inuse slots. With 16 slots
> per vCPU and a maphash capacity of 8, we only have another 8 available,
> which is not enough for nested page table walks. We need to increase the
> number of slots in config.h.

I'm afraid that I don't follow what you're trying to say here.  The
number of slots should either be fine, or we've got a pre-existing bug.

~Andrew


* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-04 14:00 ` Wei Liu
@ 2020-02-04 14:47   ` Xia, Hongyan
  0 siblings, 0 replies; 8+ messages in thread
From: Xia, Hongyan @ 2020-02-04 14:47 UTC (permalink / raw)
  To: wl; +Cc: andrew.cooper3, Grall, Julien, xen-devel, jbeulich, roger.pau

On Tue, 2020-02-04 at 14:00 +0000, Wei Liu wrote:
> On Mon, Feb 03, 2020 at 06:36:53PM +0000, Hongyan Xia wrote:
> [...]
> > diff --git a/xen/arch/x86/domain_page.c
> > b/xen/arch/x86/domain_page.c
> > index dd32712d2f..52971d2ecc 100644
> > --- a/xen/arch/x86/domain_page.c
> > +++ b/xen/arch/x86/domain_page.c
> > @@ -69,12 +69,11 @@ void __init mapcache_override_current(struct
> > vcpu *v)
> >  
> >  void *map_domain_page(mfn_t mfn)
> >  {
> > -    unsigned long flags;
> > -    unsigned int idx, i;
> > +    unsigned long flags, *phashmfn;
> > +    unsigned int idx, glb_idx, *phashidx, ohashidx;
> >      struct vcpu *v;
> > -    struct mapcache_domain *dcache;
> >      struct mapcache_vcpu *vcache;
> > -    struct vcpu_maphash_entry *hashent;
> > +    void *ret;
> >  
> >  #ifdef NDEBUG
> >      if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> > @@ -82,104 +81,59 @@ void *map_domain_page(mfn_t mfn)
> >  #endif
> >  
> >      v = mapcache_current_vcpu();
> > -    if ( !v || !is_pv_vcpu(v) )
> > +    if ( !v || !is_pv_vcpu(v) || !v->arch.pv.mapcache )
> >          return mfn_to_virt(mfn_x(mfn));
> >  
> > -    dcache = &v->domain->arch.pv.mapcache;
> > -    vcache = &v->arch.pv.mapcache;
> > -    if ( !dcache->inuse )
> > -        return mfn_to_virt(mfn_x(mfn));
> > +    vcache = v->arch.pv.mapcache;
> > +    phashmfn = &vcache->hash_mfn[MAPHASH_HASHFN(mfn_x(mfn))];
> > +    phashidx = &vcache->hash_idx[MAPHASH_HASHFN(mfn_x(mfn))];
> >  
> >      perfc_incr(map_domain_page_count);
> >  
> >      local_irq_save(flags);
> >  
> > -    hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
> > -    if ( hashent->mfn == mfn_x(mfn) )
> > +    ohashidx = *phashidx;
> 
> This is a bit redundant because you never rewrite *phashidx until the
> last minute. I think ohashidx can be removed, but it's up to you.

I actually just want to tell the compiler that ohashidx does not change
and it does not have to re-read phashidx every time, in case it
attempts to do so. Functionally speaking, I agree that removing it does
not change anything.

> 
> > +    if ( *phashmfn != mfn_x(mfn) )
> >      {
> > -        idx = hashent->idx;
> > -        ASSERT(idx < dcache->entries);
> > -        hashent->refcnt++;
> > -        ASSERT(hashent->refcnt);
> > -        ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
> > -        goto out;
> > -    }
> > +        idx = find_first_zero_bit(vcache->inuse,
> > MAPCACHE_VCPU_ENTRIES);
> > +        BUG_ON(idx >= MAPCACHE_VCPU_ENTRIES);
> >  
> > -    spin_lock(&dcache->lock);
> > +        ASSERT(vcache->refcnt[idx] == 0);
> > +        __set_bit(idx, vcache->inuse);
> >  
> > -    /* Has some other CPU caused a wrap? We must flush if so. */
> 
> [...]
> >  void unmap_domain_page(const void *ptr)
> >  {
> > -    unsigned int idx;
> > +    unsigned int idx, glb_idx;
> >      struct vcpu *v;
> > -    struct mapcache_domain *dcache;
> > -    unsigned long va = (unsigned long)ptr, mfn, flags;
> > -    struct vcpu_maphash_entry *hashent;
> > +    struct mapcache_vcpu *vcache;
> > +    unsigned long va = (unsigned long)ptr, mfn, hashmfn, flags;
> >  
> >      if ( va >= DIRECTMAP_VIRT_START )
> >          return;
> > @@ -189,73 +143,21 @@ void unmap_domain_page(const void *ptr)
> >      v = mapcache_current_vcpu();
> >      ASSERT(v && is_pv_vcpu(v));
> >  
> > -    dcache = &v->domain->arch.pv.mapcache;
> > -    ASSERT(dcache->inuse);
> > -
> > -    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
> > -    mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
> > -    hashent = &v->arch.pv.mapcache.hash[MAPHASH_HASHFN(mfn)];
> > +    vcache = v->arch.pv.mapcache;
> > +    ASSERT(vcache);
> >  
> > +    glb_idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
> > +    idx = glb_idx - v->vcpu_id * MAPCACHE_VCPU_ENTRIES;
> 
> Add a blank line here please.
> 
> > ...
> > -
> > -int mapcache_domain_init(struct domain *d)
> > -{
> > -    struct mapcache_domain *dcache = &d->arch.pv.mapcache;
> > -    unsigned int bitmap_pages;
> > +    mfn = l1e_get_pfn(MAPCACHE_L1ENT(glb_idx));
> > +    hashmfn = vcache->hash_mfn[MAPHASH_HASHFN(mfn)];
> >  
> > -    ASSERT(is_pv_domain(d));
> > +    vcache->refcnt[idx]--;
> > +    if ( hashmfn != mfn && !vcache->refcnt[idx] )
> > +        __clear_bit(idx, vcache->inuse);
> >  
> > -#ifdef NDEBUG
> > -    if ( !mem_hotplug && max_page <=
> > PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> > -        return 0;
> > -#endif
> 
> Would be great if some form or shape of this check can be added to
> mapcache_vcpu_init such that you don't unnecessarily initialise data
> structures that will never be used.

Good idea.

> > ...
> > diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-
> > x86/config.h
> > index d0cfbb70a8..4b2217151b 100644
> > --- a/xen/include/asm-x86/config.h
> > +++ b/xen/include/asm-x86/config.h
> > @@ -296,7 +296,7 @@ extern unsigned long xen_phys_start;
> >      (GDT_VIRT_START(v) + (64*1024))
> >  
> >  /* map_domain_page() map cache. The second per-domain-mapping sub-
> > area. */
> > -#define MAPCACHE_VCPU_ENTRIES    (CONFIG_PAGING_LEVELS *
> > CONFIG_PAGING_LEVELS)
> > +#define MAPCACHE_VCPU_ENTRIES    32
> >  #define MAPCACHE_ENTRIES         (MAX_VIRT_CPUS *
> > MAPCACHE_VCPU_ENTRIES)
> >  #define MAPCACHE_VIRT_START      PERDOMAIN_VIRT_SLOT(1)
> >  #define MAPCACHE_VIRT_END        (MAPCACHE_VIRT_START + \
> > diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-
> > x86/domain.h
> > index a3ae5d9a20..367bba7110 100644
> > --- a/xen/include/asm-x86/domain.h
> > +++ b/xen/include/asm-x86/domain.h
> > @@ -40,35 +40,17 @@ struct trap_bounce {
> >  #define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
> >  #define MAPHASHENT_NOTINUSE ((u32)~0U)
> >  struct mapcache_vcpu {
> > -    /* Shadow of mapcache_domain.epoch. */
> > -    unsigned int shadow_epoch;
> > -
> > -    /* Lock-free per-VCPU hash of recently-used mappings. */
> > -    struct vcpu_maphash_entry {
> > -        unsigned long mfn;
> > -        uint32_t      idx;
> > -        uint32_t      refcnt;
> > -    } hash[MAPHASH_ENTRIES];
> > +    unsigned long hash_mfn[MAPHASH_ENTRIES];
> > +    uint32_t hash_idx[MAPHASH_ENTRIES];
> > +
> > +    uint8_t refcnt[MAPCACHE_VCPU_ENTRIES];
> 
> What's the reason for breaking vcpu_maphash_entry into three distinct
> arrays?
> 
> Would it make the code shorter/cleaner if you make everything
> MAPCACHE_VCPU_ENTRIES? A vcpu now only needs to manage its own
> subregion
> after all.

In this design I cannot. MAPCACHE_VCPU_ENTRIES tells how many slots a
vcpu can use, and MAPHASH_ENTRIES tells how many entries there are in
the per-vcpu hash table (the true cache). Maphash will not immediately
clear the inuse bit when the refcnt drops to 0, so the worst case is
that even if you unmap everything, you still only have
MAPCACHE_VCPU_ENTRIES - MAPHASH_ENTRIES free slots, so
MAPCACHE_VCPU_ENTRIES has to be larger than MAPHASH_ENTRIES. Also I
refcnt all slots, not just entries in the maphash.

hash_mfn and hash_idx should be able to combine into a single
structure, but I don't like the 25% wasted memory padding when you have
unsigned long followed by an int in a struct...
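
To illustrate the padding concern (a sketch, assuming 64-bit longs):

    /* Combined: 12 bytes of data rounded up to 16 for alignment,
     * i.e. 4/16 = 25% wasted per entry. */
    struct combined_entry {
        unsigned long mfn;   /* 8 bytes */
        uint32_t      idx;   /* 4 bytes, plus 4 bytes tail padding */
    };

    /* Separate arrays keep the same data in 12 bytes per entry. */
    unsigned long hash_mfn[MAPHASH_ENTRIES];
    uint32_t      hash_idx[MAPHASH_ENTRIES];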

> > +    unsigned long inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)];
> >  };
> >  
> >  struct mapcache_domain {
> > -    /* The number of array entries, and a cursor into the array.
> > */
> >      unsigned int entries;
> > -    unsigned int cursor;
> > -
> > -    /* Protects map_domain_page(). */
> > -    spinlock_t lock;
> > -
> > -    /* Garbage mappings are flushed from TLBs in batches called
> > 'epochs'. */
> > -    unsigned int epoch;
> > -    u32 tlbflush_timestamp;
> > -
> > -    /* Which mappings are in use, and which are garbage to reap
> > next epoch? */
> > -    unsigned long *inuse;
> > -    unsigned long *garbage;
> >  };
> >  
> > -int mapcache_domain_init(struct domain *);
> >  int mapcache_vcpu_init(struct vcpu *);
> >  void mapcache_override_current(struct vcpu *);
> >  
> > @@ -473,7 +455,7 @@ struct arch_domain
> >  struct pv_vcpu
> >  {
> >      /* map_domain_page() mapping cache. */
> > -    struct mapcache_vcpu mapcache;
> > +    struct mapcache_vcpu *mapcache;
> 
> And why do you need to change this to a pointer? Is it now very large
> and causing issue(s)?

Yes. It is now pushing struct vcpu close to the size of a page, which
we cannot exceed. Also, changing it into a pointer allows us to
experiment with larger caches later.
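
Rough numbers (assuming 64-bit longs, MAPHASH_ENTRIES of 8 and the new
MAPCACHE_VCPU_ENTRIES of 32):

    sizeof(struct mapcache_vcpu)
        = 8 * 8    /* hash_mfn[MAPHASH_ENTRIES]                   =  64 */
        + 8 * 4    /* hash_idx[MAPHASH_ENTRIES]                   =  32 */
        + 32 * 1   /* refcnt[MAPCACHE_VCPU_ENTRIES]               =  32 */
        + 1 * 8    /* inuse[BITS_TO_LONGS(MAPCACHE_VCPU_ENTRIES)] =   8 */
        = 136 bytes

so the structure itself is modest, but struct vcpu has little slack left,
and the pointer also decouples the cache size from struct vcpu.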

Hongyan

* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-04 14:17 ` Andrew Cooper
@ 2020-02-04 15:15   ` Xia, Hongyan
  0 siblings, 0 replies; 8+ messages in thread
From: Xia, Hongyan @ 2020-02-04 15:15 UTC (permalink / raw)
  To: andrew.cooper3, xen-devel; +Cc: Grall, Julien, wl, jbeulich, roger.pau

On Tue, 2020-02-04 at 14:17 +0000, Andrew Cooper wrote:
> On 03/02/2020 18:36, Hongyan Xia wrote:
> > Rewrite the mapcache to be purely per-vCPU instead of partly per-
> > vCPU
> > and partly per-domain.
> > 
> > This patch is needed to address performance issues when we start
> > relying
> > on the mapcache, e.g., when we do not have a direct map. Currently,
> > the
> > per-domain lock on the mapcache is a bottleneck for multicore,
> > causing
> > performance degradation and even functional regressions.
> 
> Do you mean that this patch causes a regression, or that removing the
> directmap causes a regression?
> 
> The rest of the commit message is very confusing to follow.

Once the direct map is gone, using the existing mapcache implementation
in map_domain_page causes these problems. Even if the direct map is
still there, some guests on debug builds currently rely on the mapcache
and will see similar problems when the vCPU count is high.

I can reword the commit message to make it clearer.

> >  This patch
> > makes the mapping structure per-vCPU and completely lockless.
> > 
> > Functional regression:
> > 
> > When a domain is run on more than 64 cores, FreeBSD 10 panicks
> > frequently
> > due to occasional simultaneous set_singleshot_timer hypercalls from
> > too
> > many cores. Some cores will be blocked waiting on map_domain_page,
> > eventually failing to set a timer in the future. FreeBSD cannot
> > handle
> > this and panicks. This was fixed in later FreeBSD releases by
> > handling
> > -ETIME, but still the degradation in timer performance is a big
> > issue.
> > 
> > Performance regression:
> > 
> > Many benchmarks see a performance drop when having a large core
> > count.
> > I have done a Geekbench on a 32-vCPU guest.
> > 
> > perf drop     old        new
> > single       0.04%      0.18%
> > multi        2.43%      0.08%
> > 
> > Removing the per-domain lock in the mapcache brings the multi-core
> > performance almost identical to using the direct map for mappings.
> > 
> > There should be room for futher optimisations, but this already
> > improves over the old mapcache by a lot.
> > 
> > Note that entries in the maphash will occupy inuse slots. With 16
> > slots
> > per vCPU and a maphash capacity of 8, we only have another 8
> > available,
> > which is not enough for nested page table walks. We need to
> > increase the
> > number of slots in config.h.
> 
> I'm afraid that I don't follow what you're trying to say here.  The
> number of slots should either be fine, or we've got a pre-exiting
> bug.

The mapcache design is now different. The slots now have to include
spaces for the maphash, which was not the case before.

Hongyan

* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-04 12:08 ` Wei Liu
@ 2020-02-04 16:02   ` Xia, Hongyan
  2020-02-05 12:42     ` Wei Liu
  0 siblings, 1 reply; 8+ messages in thread
From: Xia, Hongyan @ 2020-02-04 16:02 UTC (permalink / raw)
  To: wl; +Cc: andrew.cooper3, Grall, Julien, xen-devel, jbeulich, roger.pau

On Tue, 2020-02-04 at 12:08 +0000, Wei Liu wrote:
> Thanks, I welcome effort to make Xen more scalable. :-) 
> 
> On Mon, Feb 03, 2020 at 06:36:53PM +0000, Hongyan Xia wrote:
> > Rewrite the mapcache to be purely per-vCPU instead of partly per-
> > vCPU
> > and partly per-domain.
> > 
> > This patch is needed to address performance issues when we start
> > relying
> > on the mapcache, e.g., when we do not have a direct map. Currently,
> > the
> > per-domain lock on the mapcache is a bottleneck for multicore,
> > causing
> > performance degradation and even functional regressions. This patch
> > makes the mapping structure per-vCPU and completely lockless.
> > 
> 
> When I see "regression", I think of something that was working before
> but not anymore. I don't think that's the case for the following two
> things. 
> 
> I would just classify them as bug and/or improvement.

We probably have different definitions for "regression"... but I can
reword.

> > Functional regression:
> > 
> > When a domain is run on more than 64 cores, FreeBSD 10 panicks
> > frequently
> > due to occasional simultaneous set_singleshot_timer hypercalls from
> > too
> > many cores. Some cores will be blocked waiting on map_domain_page,
> > eventually failing to set a timer in the future. FreeBSD cannot
> > handle
> > this and panicks. This was fixed in later FreeBSD releases by
> > handling
> > -ETIME, but still the degradation in timer performance is a big
> > issue.
> > 
> > Performance regression:
> > 
> > Many benchmarks see a performance drop when having a large core
> > count.
> > I have done a Geekbench on a 32-vCPU guest.
> > 
> > perf drop     old        new
> > single       0.04%      0.18%
> > multi        2.43%      0.08%
> > 
> > Removing the per-domain lock in the mapcache brings the multi-core
> > performance almost identical to using the direct map for mappings.
> > 
> > There should be room for futher optimisations, but this already
> > improves over the old mapcache by a lot.
> > 
> > Note that entries in the maphash will occupy inuse slots. With 16
> > slots
> > per vCPU and a maphash capacity of 8, we only have another 8
> > available,
> > which is not enough for nested page table walks. We need to
> > increase the
> > number of slots in config.h.
> > 
> > Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> 
> 
> As far as I can tell all vcpus still share the same per-domain
> mapcache
> region. The difference is now the region is divided into subregion
> for
> each vcpu to use.

You are right. We have a per-domain VA range and we have
create_perdomain_mapping, which can be reused nicely. The patch divides
the region into per-vCPU subregions.

> 
> > ---
> >  xen/arch/x86/domain.c        |   5 +-
> >  xen/arch/x86/domain_page.c   | 229 +++++++++++------------------
> > ------
> >  xen/include/asm-x86/config.h |   2 +-
> >  xen/include/asm-x86/domain.h |  30 +----
> >  4 files changed, 80 insertions(+), 186 deletions(-)
> > 
> > diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> > index f53ae5ff86..a278aa4678 100644
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -445,6 +445,9 @@ void arch_vcpu_destroy(struct vcpu *v)
> >      xfree(v->arch.msrs);
> >      v->arch.msrs = NULL;
> >  
> > +    xfree(v->arch.pv.mapcache);
> > +    v->arch.pv.mapcache = NULL;
> > +
> 
> Keep in mind that accessing the union {pv, hvm} before knowing the
> exact variant is dangerous.
> 
> Because mapcache is initialised for PV only, so it should be freed
> for
> PV only. I think you need to put it to pv_vcpu_destroy.
> 
> If you don't want to do that because of asymmetry with
> mapcache_vcpu_init, you may want to invent mapcache_vcpu_destroy for
> your purpose.

Ah, right, this is a problem. I was working on a tree where every vcpu
has a mapcache, which is not true for the current upstream onto which I
cherry-picked. Will fix.

Hongyan

* Re: [Xen-devel] [PATCH] x86/domain_page: implement pure per-vCPU mapping infrastructure
  2020-02-04 16:02   ` Xia, Hongyan
@ 2020-02-05 12:42     ` Wei Liu
  0 siblings, 0 replies; 8+ messages in thread
From: Wei Liu @ 2020-02-05 12:42 UTC (permalink / raw)
  To: Xia, Hongyan
  Cc: wl, andrew.cooper3, Grall, Julien, jbeulich, xen-devel, roger.pau

On Tue, Feb 04, 2020 at 04:02:38PM +0000, Xia, Hongyan wrote:
[...]
> > Keep in mind that accessing the union {pv, hvm} before knowing the
> > exact variant is dangerous.
> > 
> > Because mapcache is initialised for PV only, so it should be freed
> > for
> > PV only. I think you need to put it to pv_vcpu_destroy.
> > 
> > If you don't want to do that because of asymmetry with
> > mapcache_vcpu_init, you may want to invent mapcache_vcpu_destroy for
> > your purpose.
> 
> Ah right this is a problem. I was working on a tree where everyone has
> a mapcache, which is not true for current upstream when I cherry-
> picked. Will fix.
> 

If the future direction is for both hvm and pv to have a mapcache (?)
then I would certainly recommend inventing a mapcache_vcpu_destroy.

Wei.

> Hongyan

