* [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag
@ 2014-06-17 11:49 Bob Liu
  2014-06-17 11:49 ` [PATCH 2/4] xen: introduce an "scrub" free page list Bob Liu
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Bob Liu @ 2014-06-17 11:49 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, jbeulich

To indicate that a page is free but needs to be scrubbed before use.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/include/asm-x86/mm.h |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7059adc..660fc33 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -223,9 +223,12 @@ struct page_info
 #define PGC_state_offlined PG_mask(2, 9)
 #define PGC_state_free    PG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+/* Page needs to be scrubbed */
+#define _PGC_need_scrub   PG_shift(10)
+#define PGC_need_scrub    PG_mask(1, 10)
 
  /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(9)
+#define PGC_count_width   PG_shift(10)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 struct spage_info
-- 
1.7.10.4


* [PATCH 2/4] xen: introduce an "scrub" free page list
  2014-06-17 11:49 [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Bob Liu
@ 2014-06-17 11:49 ` Bob Liu
  2014-06-17 12:36   ` Jan Beulich
  2014-06-17 11:49 ` [PATCH 3/4] xen: separate a function merge_free_trunks from Bob Liu
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Bob Liu @ 2014-06-17 11:49 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, jbeulich

Because of page scrubbing, it's very slow to destroy a domain with large memory.
It takes around 10 minutes to destroy a guest with nearly 1 TB of memory.

This patch introduces a "scrub" free page list; pages on this list need to be
scrubbed before use. During domain destroy, just mark pages PGC_need_scrub
and then add them to this list, so that xl can return quickly.

In alloc_domheap_pages():
  - If there are pages on the normal "heap" freelist, allocate them.
  - Try to allocate from the "scrub" free list with the same order and do
    scrubbing synchronously.

In order not to fail high-order allocation requests, merge page trunks of
PGC_need_scrub pages.
Note: PGC_need_scrub pages and normal pages are not mergeable.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/common/page_alloc.c |  108 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 91 insertions(+), 17 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 601319c..723d273 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -267,6 +267,9 @@ typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
 #define heap(node, zone, order) ((*_heap[node])[zone][order])
 
+static heap_by_zone_and_order_t _scrub[MAX_NUMNODES];
+#define scrub(node, zone, order) (_scrub[node][zone][order])
+
 static unsigned long *avail[MAX_NUMNODES];
 static long total_avail_pages;
 
@@ -629,8 +632,24 @@ static struct page_info *alloc_heap_pages(
 
             /* Find smallest order which can satisfy the request. */
             for ( j = order; j <= MAX_ORDER; j++ )
+            {
                 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                     goto found;
+                
+                /* Try to scrub a page directly */
+                if ( (pg = page_list_remove_head(&scrub(node, zone, j))) )
+                {
+                    for ( i = 0; i < (1 << j); i++ )
+                    {
+                        if ( test_bit(_PGC_need_scrub, &(pg[i].count_info)) )
+                        {
+                            scrub_one_page(&pg[i]);
+                            pg[i].count_info &= ~(PGC_need_scrub);
+			}
+                    }
+                    goto found;
+                }
+            }
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
         if ( memflags & MEMF_exact_node )
@@ -810,7 +829,7 @@ static int reserve_offlined_page(struct page_info *head)
 
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order)
+    struct page_info *pg, unsigned int order, bool_t need_scrub)
 {
     unsigned long mask, mfn = page_to_mfn(pg);
     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
@@ -859,6 +878,22 @@ static void free_heap_pages(
         midsize_alloc_zone_pages = max(
             midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
 
+    if ( need_scrub )
+    {
+        /* Special case for tainted pages */
+        if ( tainted )
+        {
+            for ( i = 0; i < (1 << order); i++ )
+                scrub_one_page(&pg[i]);
+            need_scrub = 0;
+        }
+        else
+        {
+            for ( i = 0; i < (1 << order); i++ )
+                pg[i].count_info |= PGC_need_scrub;
+        }
+    }
+
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
     {
@@ -872,8 +907,22 @@ static void free_heap_pages(
                  (PFN_ORDER(pg-mask) != order) ||
                  (phys_to_nid(page_to_maddr(pg-mask)) != node) )
                 break;
-            pg -= mask;
-            page_list_del(pg, &heap(node, zone, order));
+
+            /* If we need to scrub, only merge with PGC_need_scrub pages */
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &((pg-mask)->count_info)) )
+                    break;
+                pg -= mask;
+                page_list_del(pg, &scrub(node, zone, order));
+            }
+	    else
+            {
+                if ( test_bit(_PGC_need_scrub, &((pg-mask)->count_info)) )
+                    break;
+                pg -= mask;
+                page_list_del(pg, &heap(node, zone, order));
+            }
         }
         else
         {
@@ -883,14 +932,35 @@ static void free_heap_pages(
                  (PFN_ORDER(pg+mask) != order) ||
                  (phys_to_nid(page_to_maddr(pg+mask)) != node) )
                 break;
-            page_list_del(pg + mask, &heap(node, zone, order));
+
+            /* If we need to scrub, only merge with PGC_need_scrub pages */
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &((pg+mask)->count_info)) )
+                    break;
+                page_list_del(pg + mask, &scrub(node, zone, order));
+            }
+            else
+            {
+                if ( test_bit(_PGC_need_scrub, &((pg+mask)->count_info)) )
+                    break;
+                page_list_del(pg + mask, &heap(node, zone, order));
+            }
         }
 
         order++;
     }
 
     PFN_ORDER(pg) = order;
-    page_list_add_tail(pg, &heap(node, zone, order));
+    if ( need_scrub )
+    {
+        ASSERT( test_bit(_PGC_need_scrub, &(pg->count_info)) );
+        page_list_add_tail(pg, &scrub(node, zone, order));
+    }
+    else
+    {
+        page_list_add_tail(pg, &heap(node, zone, order));
+    }
 
     if ( tainted )
         reserve_offlined_page(pg);
@@ -1115,7 +1185,7 @@ unsigned int online_page(unsigned long mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, 0);
 
     return ret;
 }
@@ -1184,7 +1254,7 @@ static void init_heap_pages(
             nr_pages -= n;
         }
 
-        free_heap_pages(pg+i, 0);
+        free_heap_pages(pg+i, 0, 0);
     }
 }
 
@@ -1216,7 +1286,7 @@ unsigned long total_free_pages(void)
 
 void __init end_boot_allocator(void)
 {
-    unsigned int i;
+    unsigned int i, j, order;
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < nr_bootmem_regions; i++ )
@@ -1250,6 +1320,11 @@ void __init end_boot_allocator(void)
 #endif
     }
 
+    for ( i = 0; i < MAX_NUMNODES; i++ )
+        for ( j = 0; j < NR_ZONES; j++ )
+            for ( order = 0; order <= MAX_ORDER; order++ )
+                INIT_PAGE_LIST_HEAD(&scrub(i, j, order));
+
     printk("Domain heap initialised");
     if ( dma_bitsize )
         printk(" DMA width %u bits", dma_bitsize);
@@ -1357,7 +1432,7 @@ void free_xenheap_pages(void *v, unsigned int order)
 
     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
 
-    free_heap_pages(virt_to_page(v), order);
+    free_heap_pages(virt_to_page(v), order, 0);
 }
 
 #else
@@ -1411,7 +1486,7 @@ void free_xenheap_pages(void *v, unsigned int order)
     for ( i = 0; i < (1u << order); i++ )
         pg[i].count_info &= ~PGC_xen_heap;
 
-    free_heap_pages(pg, order);
+    free_heap_pages(pg, order, 0);
 }
 
 #endif
@@ -1515,7 +1590,7 @@ struct page_info *alloc_domheap_pages(
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
     {
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, 0);
         return NULL;
     }
     
@@ -1564,22 +1639,21 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
          * domain has died we assume responsibility for erasure.
          */
         if ( unlikely(d->is_dying) )
-            for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
-
-        free_heap_pages(pg, order);
+            free_heap_pages(pg, order, 1);
+        else
+            free_heap_pages(pg, order, 0);
     }
     else if ( unlikely(d == dom_cow) )
     {
         ASSERT(order == 0); 
         scrub_one_page(pg);
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, 0);
         drop_dom_ref = 0;
     }
     else
     {
         /* Freeing anonymous domain-heap pages. */
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, 0);
         drop_dom_ref = 0;
     }
 
-- 
1.7.10.4


* [PATCH 3/4] xen: separate a function merge_free_trunks from
  2014-06-17 11:49 [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Bob Liu
  2014-06-17 11:49 ` [PATCH 2/4] xen: introduce an "scrub" free page list Bob Liu
@ 2014-06-17 11:49 ` Bob Liu
  2014-06-17 12:39   ` Jan Beulich
  2014-06-17 11:49 ` [PATCH 4/4] xen: use idle vcpus to scrub pages Bob Liu
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Bob Liu @ 2014-06-17 11:49 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, jbeulich

This function will be used when scrubbing pages from idle vcpus.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/common/page_alloc.c |  137 +++++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 63 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 723d273..5698596 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -827,73 +827,14 @@ static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
-/* Free 2^@order set of pages. */
-static void free_heap_pages(
+/* Caller must hold heap lock */
+static void merge_free_trunks(
     struct page_info *pg, unsigned int order, bool_t need_scrub)
 {
-    unsigned long mask, mfn = page_to_mfn(pg);
-    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
+    unsigned long mask;
+    unsigned int node = phys_to_nid(page_to_maddr(pg));
     unsigned int zone = page_to_zone(pg);
 
-    ASSERT(order <= MAX_ORDER);
-    ASSERT(node >= 0);
-
-    spin_lock(&heap_lock);
-
-    for ( i = 0; i < (1 << order); i++ )
-    {
-        /*
-         * Cannot assume that count_info == 0, as there are some corner cases
-         * where it isn't the case and yet it isn't a bug:
-         *  1. page_get_owner() is NULL
-         *  2. page_get_owner() is a domain that was never accessible by
-         *     its domid (e.g., failed to fully construct the domain).
-         *  3. page was never addressable by the guest (e.g., it's an
-         *     auto-translate-physmap guest and the page was never included
-         *     in its pseudophysical address space).
-         * In all the above cases there can be no guest mappings of this page.
-         */
-        ASSERT(!page_state_is(&pg[i], offlined));
-        pg[i].count_info =
-            ((pg[i].count_info & PGC_broken) |
-             (page_state_is(&pg[i], offlining)
-              ? PGC_state_offlined : PGC_state_free));
-        if ( page_state_is(&pg[i], offlined) )
-            tainted = 1;
-
-        /* If a page has no owner it will need no safety TLB flush. */
-        pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
-        if ( pg[i].u.free.need_tlbflush )
-            pg[i].tlbflush_timestamp = tlbflush_current_time();
-
-        /* This page is not a guest frame any more. */
-        page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
-        set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
-    }
-
-    avail[node][zone] += 1 << order;
-    total_avail_pages += 1 << order;
-
-    if ( opt_tmem )
-        midsize_alloc_zone_pages = max(
-            midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
-
-    if ( need_scrub )
-    {
-        /* Special case for tainted pages */
-        if ( tainted )
-        {
-            for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
-            need_scrub = 0;
-        }
-        else
-        {
-            for ( i = 0; i < (1 << order); i++ )
-                pg[i].count_info |= PGC_need_scrub;
-        }
-    }
-
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
     {
@@ -961,6 +902,76 @@ static void free_heap_pages(
     {
         page_list_add_tail(pg, &heap(node, zone, order));
     }
+}
+
+/* Free 2^@order set of pages. */
+static void free_heap_pages(
+    struct page_info *pg, unsigned int order, bool_t need_scrub)
+{
+    unsigned long mfn = page_to_mfn(pg);
+    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
+    unsigned int zone = page_to_zone(pg);
+
+    ASSERT(order <= MAX_ORDER);
+    ASSERT(node >= 0);
+
+    spin_lock(&heap_lock);
+
+    for ( i = 0; i < (1 << order); i++ )
+    {
+        /*
+         * Cannot assume that count_info == 0, as there are some corner cases
+         * where it isn't the case and yet it isn't a bug:
+         *  1. page_get_owner() is NULL
+         *  2. page_get_owner() is a domain that was never accessible by
+         *     its domid (e.g., failed to fully construct the domain).
+         *  3. page was never addressable by the guest (e.g., it's an
+         *     auto-translate-physmap guest and the page was never included
+         *     in its pseudophysical address space).
+         * In all the above cases there can be no guest mappings of this page.
+         */
+        ASSERT(!page_state_is(&pg[i], offlined));
+        pg[i].count_info =
+            ((pg[i].count_info & PGC_broken) |
+             (page_state_is(&pg[i], offlining)
+              ? PGC_state_offlined : PGC_state_free));
+        if ( page_state_is(&pg[i], offlined) )
+            tainted = 1;
+
+        /* If a page has no owner it will need no safety TLB flush. */
+        pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
+        if ( pg[i].u.free.need_tlbflush )
+            pg[i].tlbflush_timestamp = tlbflush_current_time();
+
+        /* This page is not a guest frame any more. */
+        page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
+        set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
+    }
+
+    avail[node][zone] += 1 << order;
+    total_avail_pages += 1 << order;
+
+    if ( opt_tmem )
+        midsize_alloc_zone_pages = max(
+            midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
+
+    if ( need_scrub )
+    {
+        /* Special case for tainted pages */
+        if ( tainted )
+        {
+            for ( i = 0; i < (1 << order); i++ )
+                scrub_one_page(&pg[i]);
+            need_scrub = 0;
+        }
+        else
+        {
+            for ( i = 0; i < (1 << order); i++ )
+                pg[i].count_info |= PGC_need_scrub;
+        }
+    }
+
+    merge_free_trunks(pg, order, need_scrub);
 
     if ( tainted )
         reserve_offlined_page(pg);
-- 
1.7.10.4


* [PATCH 4/4] xen: use idle vcpus to scrub pages
  2014-06-17 11:49 [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Bob Liu
  2014-06-17 11:49 ` [PATCH 2/4] xen: introduce an "scrub" free page list Bob Liu
  2014-06-17 11:49 ` [PATCH 3/4] xen: separate a function merge_free_trunks from Bob Liu
@ 2014-06-17 11:49 ` Bob Liu
  2014-06-17 13:01   ` Jan Beulich
  2014-06-17 12:35 ` [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Jan Beulich
  2014-06-17 12:46 ` Julien Grall
  4 siblings, 1 reply; 14+ messages in thread
From: Bob Liu @ 2014-06-17 11:49 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, jbeulich

In case of heavy lock contention, use a percpu list:
 - Delist a batch of pages from the "scrub" free page list onto a percpu list.
 - Scrub the pages on this percpu list.
 - Add those clean pages to the normal "head" free page list, merging with
   other chunks if needed.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/arch/x86/domain.c   |    1 +
 xen/common/domain.c     |    2 ++
 xen/common/page_alloc.c |   82 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/mm.h    |    2 ++
 4 files changed, 87 insertions(+)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 6fddd4c..a46a2ba 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -116,6 +116,7 @@ static void idle_loop(void)
     {
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
+        scrub_free_pages();
         (*pm_idle)();
         do_tasklet();
         do_softirq();
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 4291e29..bd386e6 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -587,12 +587,14 @@ int domain_kill(struct domain *d)
         d->tmem_client = NULL;
         /* fallthrough */
     case DOMDYING_dying:
+        enable_idle_scrub = 0;
         rc = domain_relinquish_resources(d);
         if ( rc != 0 )
         {
             BUG_ON(rc != -EAGAIN);
             break;
         }
+        enable_idle_scrub = 1;
         for_each_vcpu ( d, v )
             unmap_vcpu_info(v);
         d->is_dying = DOMDYING_dead;
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 5698596..2b2fd04 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -79,6 +79,9 @@ PAGE_LIST_HEAD(page_offlined_list);
 /* Broken page list, protected by heap_lock. */
 PAGE_LIST_HEAD(page_broken_list);
 
+volatile bool_t enable_idle_scrub;
+DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
+
 /*************************
  * BOOT-TIME ALLOCATOR
  */
@@ -1387,7 +1390,86 @@ void __init scrub_heap_pages(void)
     setup_low_mem_virq();
 }
 
+#define SCRUB_BATCH 1024
+void scrub_free_pages(void)
+{
+    struct page_info *pg;
+    unsigned int i, j, node_empty = 0, nr_delisted = 0;
+    int order;
+    unsigned int cpu = smp_processor_id();
+    unsigned int node = cpu_to_node(cpu);
+    struct page_list_head *temp_list = &this_cpu(scrub_list_cpu);
+
+    if ( !enable_idle_scrub )
+        return;
+
+    do
+    {
+        if ( page_list_empty(temp_list) )
+        {
+            /* Delist a batch of pages from global scrub list */
+            spin_lock(&heap_lock);
+            for ( j = 0; j < NR_ZONES; j++ )
+            {
+                for ( order = MAX_ORDER; order >= 0; order-- )
+                {
+                    if ( (pg = page_list_remove_head(&scrub(node, j, order))) )
+                    {
+                        for ( i = 0; i < (1 << order); i++)
+                            mark_page_offline(&pg[i], 0);
+
+                        page_list_add_tail(pg, temp_list);
+                        nr_delisted += (1 << order);
+                        if ( nr_delisted > SCRUB_BATCH )
+                        {
+                            nr_delisted = 0;
+                            spin_unlock(&heap_lock);
+                            goto start_scrub;
+                        }
+                    }
+                }
+            }
+
+            node_empty = 1;
+            spin_unlock(&heap_lock);
+        }
+        else
+        {
+start_scrub:
+            /* Scrub percpu list */
+            while ( !page_list_empty(temp_list) )
+            {
+                pg = page_list_remove_head(temp_list);
+                ASSERT(pg);
+                order = PFN_ORDER(pg);
+                for ( i = 0; i < (1 << order); i++ )
+                {
+                    ASSERT( test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
+                    scrub_one_page(&pg[i]);
+                    pg[i].count_info &= ~(PGC_need_scrub);
+                }
 
+                /* Add pages to free heap list */
+                spin_lock(&heap_lock);
+                for ( i = 0; i < (1 << order); i++ )
+                {
+                    ASSERT ( !test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
+                    pg[i].count_info |= PGC_state_free;
+                }
+                ASSERT (node == phys_to_nid(page_to_maddr(pg)));
+                merge_free_trunks(pg, order, 0);
+                spin_unlock(&heap_lock);
+
+                if ( softirq_pending(cpu) )
+                    return;
+            }
+        }
+
+        /* Scrub list of this node is empty */
+        if ( node_empty )
+            return;
+    } while ( !softirq_pending(cpu) );
+}
 
 /*************************
  * XEN-HEAP SUB-ALLOCATOR
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b183189..c3f481d 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -34,6 +34,7 @@
 
 struct domain;
 struct page_info;
+extern volatile bool_t enable_idle_scrub;
 
 /* Boot-time allocator. Turns into generic allocator after bootstrap. */
 void init_boot_pages(paddr_t ps, paddr_t pe);
@@ -78,6 +79,7 @@ int query_page_offline(unsigned long mfn, uint32_t *status);
 unsigned long total_free_pages(void);
 
 void scrub_heap_pages(void);
+void scrub_free_pages(void);
 
 int assign_pages(
     struct domain *d,
-- 
1.7.10.4


* Re: [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag
  2014-06-17 11:49 [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Bob Liu
                   ` (2 preceding siblings ...)
  2014-06-17 11:49 ` [PATCH 4/4] xen: use idle vcpus to scrub pages Bob Liu
@ 2014-06-17 12:35 ` Jan Beulich
  2014-06-17 12:46 ` Julien Grall
  4 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2014-06-17 12:35 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
> To indicate a page is free but need to be scrubbed before in use.

I don't think this is worth a separate patch.

Jan


* Re: [PATCH 2/4] xen: introduce an "scrub" free page list
  2014-06-17 11:49 ` [PATCH 2/4] xen: introduce an "scrub" free page list Bob Liu
@ 2014-06-17 12:36   ` Jan Beulich
  2014-06-18  0:54     ` Bob Liu
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2014-06-17 12:36 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
> Because of page scrubbing, it's very slow to destroy a domain with large 
> memory.
> It takes around 10 minutes when destroy a guest of nearly 1 TB of memory.
> 
> This patch introduced a "scrub" free page list, pages on this list need to be
> scrubbed before use. During domain destory just mark pages "PGC_need_scrub"
> and then add them to this list, so that xl can return quickly.
> 
> In alloc_domheap_pages():
>   - If there are pages on the normal "heap" freelist, allocate them.
>   - Try to allocate from the "scrub" free list with the same order and do
>     scrubbing synchronously.
> 
> In order to not fail high order allocate request, merge page trunks for
> PGC_need_scrub pages.
> Note: PCG_need_scrub pages and normal pages are not mergeable

The whole series needs sync-ing logically both with Konrad's recent
boot time scrubbing changes (this change here really makes his
change unnecessary - the affected pages could quite well be simply
marked as needing scrubbing, getting dealt with by the logic added
here) and with XSA-100's change.
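
A sketch of that idea (which boot pages would actually still need scrubbing
is glossed over here and is an assumption, not something taken from either
series):

    /* In init_heap_pages(), rather than scrubbing later in
     * scrub_heap_pages(), hand the page over as dirty and let the new
     * scrub-list logic deal with it: */
    free_heap_pages(pg + i, 0, 1 /* need_scrub */);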

> @@ -629,8 +632,24 @@ static struct page_info *alloc_heap_pages(
>  
>              /* Find smallest order which can satisfy the request. */
>              for ( j = order; j <= MAX_ORDER; j++ )
> +            {
>                  if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
>                      goto found;
> +                
> +                /* Try to scrub a page directly */
> +                if ( (pg = page_list_remove_head(&scrub(node, zone, j))) )
> +                {
> +                    for ( i = 0; i < (1 << j); i++ )

Definitely not: You may have found a 4Tb chunk for a request of a
single page. You don't want to scrub all 4Tb in that case. I think this
needs to be put after the "found" label.
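
A minimal sketch of that rearrangement (the exact placement relative to the
existing chunk-splitting loop after "found" is an assumption; the names are
the ones used in the patch above):

 found:
    /* ... existing code splits the chunk down to the requested order ... */
    for ( i = 0; i < (1 << order); i++ )
    {
        /* Only the 2^order pages actually handed out get scrubbed. */
        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
        {
            scrub_one_page(&pg[i]);
            pg[i].count_info &= ~PGC_need_scrub;
        }
    }
    /* (Split-off remainders still carrying PGC_need_scrub would need to go
     * back onto the scrub list rather than the heap list - not shown.) */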

Which raises the question on the need for separate _heap[] and
_scrub[] - if chunks with different PGC_need_scrub flags aren't
mergeable, why do you need separate arrays?

> +                    {
> +                        if ( test_bit(_PGC_need_scrub, &(pg[i].count_info)) )

Please no pointless and inconsistent (with surrounding code)
parentheses.

> +                        {
> +                            scrub_one_page(&pg[i]);
> +                            pg[i].count_info &= ~(PGC_need_scrub);

Again.

> +			}

Hard tabs (also further down).

> @@ -859,6 +878,22 @@ static void free_heap_pages(
>          midsize_alloc_zone_pages = max(
>              midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
>  
> +    if ( need_scrub )
> +    {
> +        /* Specail for tainted case */
> +        if ( tainted )
> +        {
> +            for ( i = 0; i < (1 << order); i++ )
> +                scrub_one_page(&pg[i]);

Are you trying to provoke another #MC for pages that got offlined?
Just avoid setting PGC_need_scrub in this case and you're done.
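
A sketch of that simplification, on the assumption that offlined (tainted)
pages get reserved anyway and never re-enter circulation, so they need no
scrubbing at all:

    if ( need_scrub && !tainted )
    {
        for ( i = 0; i < (1 << order); i++ )
            pg[i].count_info |= PGC_need_scrub;
    }

i.e. the scrub_one_page() loop for the tainted case simply goes away.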

> @@ -1250,6 +1320,11 @@ void __init end_boot_allocator(void)
>  #endif
>      }
>  
> +    for ( i = 0; i < MAX_NUMNODES; i++ )
> +        for ( j = 0; j < NR_ZONES; j++ )
> +            for ( order = 0; order <= MAX_ORDER; order++ )
> +                INIT_PAGE_LIST_HEAD(&scrub(i, j, order));
> +

Why is this not being done alongside the setting up of _heap[]?

> @@ -1564,22 +1639,21 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>           * domain has died we assume responsibility for erasure.
>           */
>          if ( unlikely(d->is_dying) )
> -            for ( i = 0; i < (1 << order); i++ )
> -                scrub_one_page(&pg[i]);
> -
> -        free_heap_pages(pg, order);
> +            free_heap_pages(pg, order, 1);
> +        else
> +            free_heap_pages(pg, order, 0);
>      }
>      else if ( unlikely(d == dom_cow) )
>      {
>          ASSERT(order == 0); 
>          scrub_one_page(pg);
> -        free_heap_pages(pg, 0);
> +        free_heap_pages(pg, 0, 0);

Is there a particular reason why you don't defer the scrubbing here?

Jan


* Re: [PATCH 3/4] xen: separate a function merge_free_trunks from
  2014-06-17 11:49 ` [PATCH 3/4] xen: separate a function merge_free_trunks from Bob Liu
@ 2014-06-17 12:39   ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2014-06-17 12:39 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:

Truncated subject?

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -827,73 +827,14 @@ static int reserve_offlined_page(struct page_info *head)
>      return count;
>  }
>  
> -/* Free 2^@order set of pages. */
> -static void free_heap_pages(
> +/* Caller must hold heap lock */
> +static void merge_free_trunks(
>      struct page_info *pg, unsigned int order, bool_t need_scrub)

From a purely abstract pov this function shouldn't need a "need_scrub"
parameter.

>  {
> -    unsigned long mask, mfn = page_to_mfn(pg);
> -    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
> +    unsigned long mask;
> +    unsigned int node = phys_to_nid(page_to_maddr(pg));

Instead, the caller already calculated "node", which hence could be
passed in.

Jan


* Re: [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag
  2014-06-17 11:49 [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Bob Liu
                   ` (3 preceding siblings ...)
  2014-06-17 12:35 ` [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag Jan Beulich
@ 2014-06-17 12:46 ` Julien Grall
  2014-06-18  0:55   ` Bob Liu
  4 siblings, 1 reply; 14+ messages in thread
From: Julien Grall @ 2014-06-17 12:46 UTC (permalink / raw)
  To: Bob Liu, xen-devel
  Cc: George.Dunlap, andrew.cooper3, keir, ian.campbell, jbeulich

Hi Bob,

On 06/17/2014 12:49 PM, Bob Liu wrote:
> To indicate a page is free but need to be scrubbed before in use.
> 
> Signed-off-by: Bob Liu <bob.liu@oracle.com>

AFAIU, you plan to use this new flag in the common code. Please add a
similar one for ARM (in include/asm-arm/mm.h) otherwise you will break
compilation on this architecture.
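
Something along these lines, mirroring the x86 hunk in patch 1 (the bit
position and the matching PGC_count_width bump are assumptions - they have
to fit ARM's actual PGC_ layout):

    /* xen/include/asm-arm/mm.h (sketch) */
    /* Page needs to be scrubbed */
    #define _PGC_need_scrub   PG_shift(10)
    #define PGC_need_scrub    PG_mask(1, 10)
    /* ...with PGC_count_width moved from PG_shift(9) to PG_shift(10),
       assuming the ARM layout matches x86 here. */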

Thanks,

-- 
Julien Grall


* Re: [PATCH 4/4] xen: use idle vcpus to scrub pages
  2014-06-17 11:49 ` [PATCH 4/4] xen: use idle vcpus to scrub pages Bob Liu
@ 2014-06-17 13:01   ` Jan Beulich
  2014-06-18  1:18     ` Bob Liu
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2014-06-17 13:01 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
> In case of heavy lock contention, use a percpu list.
>  - Delist a batch of pages to a percpu list from "scrub" free page list.
>  - Scrub pages on this percpu list.
>  - Add those clean pages to normal "head" free page list, merge with other

Did you mean "heap" here?

> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -587,12 +587,14 @@ int domain_kill(struct domain *d)
>          d->tmem_client = NULL;
>          /* fallthrough */
>      case DOMDYING_dying:
> +        enable_idle_scrub = 0;
>          rc = domain_relinquish_resources(d);
>          if ( rc != 0 )
>          {
>              BUG_ON(rc != -EAGAIN);
>              break;
>          }
> +        enable_idle_scrub = 1;
>          for_each_vcpu ( d, v )
>              unmap_vcpu_info(v);
>          d->is_dying = DOMDYING_dead;

???

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -79,6 +79,9 @@ PAGE_LIST_HEAD(page_offlined_list);
>  /* Broken page list, protected by heap_lock. */
>  PAGE_LIST_HEAD(page_broken_list);
>  
> +volatile bool_t enable_idle_scrub;
> +DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);

I'm pretty sure I pointed out to you before that variables used only in
a single file should be static.

> @@ -1387,7 +1390,86 @@ void __init scrub_heap_pages(void)
>      setup_low_mem_virq();
>  }
>  
> +#define SCRUB_BATCH 1024
> +void scrub_free_pages(void)
> +{
> +    struct page_info *pg;
> +    unsigned int i, j, node_empty = 0, nr_delisted = 0;
> +    int order;
> +    unsigned int cpu = smp_processor_id();
> +    unsigned int node = cpu_to_node(cpu);
> +    struct page_list_head *temp_list = &this_cpu(scrub_list_cpu);
> +
> +    if ( !enable_idle_scrub )
> +        return;
> +
> +    do
> +    {
> +        if ( page_list_empty(temp_list) )
> +        {
> +            /* Delist a batch of pages from global scrub list */
> +            spin_lock(&heap_lock);
> +            for ( j = 0; j < NR_ZONES; j++ )
> +            {
> +                for ( order = MAX_ORDER; order >= 0; order-- )
> +                {
> +                    if ( (pg = page_list_remove_head(&scrub(node, j, order))) )
> +                    {
> +                        for ( i = 0; i < (1 << order); i++)
> +                            mark_page_offline(&pg[i], 0);

A page previously having caused #MC now suddenly became healthy
again?

> +
> +                        page_list_add_tail(pg, temp_list);
> +                        nr_delisted += (1 << order);
> +                        if ( nr_delisted > SCRUB_BATCH )
> +                        {
> +                            nr_delisted = 0;
> +                            spin_unlock(&heap_lock);
> +                            goto start_scrub;
> +                        }
> +                    }
> +                }
> +            }
> +
> +            node_empty = 1;
> +            spin_unlock(&heap_lock);
> +        }
> +        else
> +        {
> +start_scrub:

Labels need to be indented by at least one space.

> +            /* Scrub percpu list */
> +            while ( !page_list_empty(temp_list) )
> +            {
> +                pg = page_list_remove_head(temp_list);
> +                ASSERT(pg);
> +                order = PFN_ORDER(pg);
> +                for ( i = 0; i < (1 << order); i++ )
> +                {

Order here can again be almost arbitrarily high. You aren't allowed
to scrub e.g. 4Tb in one go.

> +                    ASSERT( test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
> +                    scrub_one_page(&pg[i]);
> +                    pg[i].count_info &= ~(PGC_need_scrub);
> +                }
>  
> +                /* Add pages to free heap list */
> +                spin_lock(&heap_lock);

Between the dropping of the lock above and the re-acquiring of it
here the pages "stolen" can lead to allocation failures despite there
being available memory. This needs to be avoided if at all possible.

> +                for ( i = 0; i < (1 << order); i++ )
> +                {
> +                    ASSERT ( !test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
> +                    pg[i].count_info |= PGC_state_free;
> +                }
> +                ASSERT (node == phys_to_nid(page_to_maddr(pg)));
> +                merge_free_trunks(pg, order, 0);
> +                spin_unlock(&heap_lock);
> +
> +                if ( softirq_pending(cpu) )
> +                    return;
> +            }
> +        }
> +
> +        /* Scrub list of this node is empty */
> +        if ( node_empty )
> +            return;
> +    } while ( !softirq_pending(cpu) );
> +}

And all the logic needs to be made NUMA-aware, i.e. a CPU should
prefer to scrub local memory, and only resort to scrubbing distant
memory if no CPU on the correct node is idle. Plus (as we have seen
with Konrad's boot time scrubbing changes) you should avoid having
two hyperthreads within the same core doing scrubbing at the same
time.
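
One possible shape for the hyperthread part, as a sketch only (restricting
scrubbing to the first thread of each core is an assumption about policy,
not necessarily the best answer):

    /* At the top of scrub_free_pages(), before touching any list: */
    if ( cpu != cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
        return; /* let only one thread per core burn cycles on scrubbing */

The NUMA side is partly there already via node = cpu_to_node(cpu); falling
back to remote nodes would additionally need a check that no CPU on the
owning node is idle.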

In the end I'm getting the impression that this all wasn't properly
thought through yet. Convoys on the lock inside the idle processing
should e.g. also be avoided (or reduced to a minimum) - there's no
point preventing the entering of C states if all you're otherwise going
to do is spin on a lock.

Jan


* Re: [PATCH 2/4] xen: introduce an "scrub" free page list
  2014-06-17 12:36   ` Jan Beulich
@ 2014-06-18  0:54     ` Bob Liu
  2014-06-18 13:18       ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 14+ messages in thread
From: Bob Liu @ 2014-06-18  0:54 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 06/17/2014 08:36 PM, Jan Beulich wrote:
>>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
>> Because of page scrubbing, it's very slow to destroy a domain with large 
>> memory.
>> It takes around 10 minutes when destroy a guest of nearly 1 TB of memory.
>>
>> This patch introduced a "scrub" free page list, pages on this list need to be
>> scrubbed before use. During domain destory just mark pages "PGC_need_scrub"
>> and then add them to this list, so that xl can return quickly.
>>
>> In alloc_domheap_pages():
>>   - If there are pages on the normal "heap" freelist, allocate them.
>>   - Try to allocate from the "scrub" free list with the same order and do
>>     scrubbing synchronously.
>>
>> In order to not fail high order allocate request, merge page trunks for
>> PGC_need_scrub pages.
>> Note: PCG_need_scrub pages and normal pages are not mergeable
> 
> The whole series needs sync-ing logically both with Konrad's recent
> boot time scrubbing changes (this change here really makes his
> change unnecessary - the affected pages could quite well be simply
> marked as needing scrubbing, getting dealt with by the logic added
> here) and with XSA-100's change.
> 

I agree. Konrad?

>> @@ -629,8 +632,24 @@ static struct page_info *alloc_heap_pages(
>>  
>>              /* Find smallest order which can satisfy the request. */
>>              for ( j = order; j <= MAX_ORDER; j++ )
>> +            {
>>                  if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
>>                      goto found;
>> +                
>> +                /* Try to scrub a page directly */
>> +                if ( (pg = page_list_remove_head(&scrub(node, zone, j))) )
>> +                {
>> +                    for ( i = 0; i < (1 << j); i++ )
> 
> Definitely not: You may have found a 4Tb chunk for a request of a
> single page. You don't want to scrub all 4Tb in that case. I think this
> needs to be put after the "found" label.
> 

Thank you for pointing that out.


> Which raises the question on the need for separate _heap[] and
> _scrub[] - if chunks with different PGC_need_scrub flags aren't
> mergeable, why do you need separate arrays?
> 

I'd also prefer to use _heap[] only; it would be much easier.

The main reason for a new _scrub[] list is that we need a way for the idle
vcpus to find all need_scrub pages.

Adding a new page_list_head to struct page_info is too costly, and I didn't
find a member in struct page_info that it could be unioned with.

If we use _heap[] only, the idle vcpus have to iterate over all free pages
to check whether each one needs scrubbing, which I don't think is efficient.

Better ideas are welcome.

>> +                    {
>> +                        if ( test_bit(_PGC_need_scrub, &(pg[i].count_info)) )
> 
> Please no pointless and inconsistent (with surrounding code)
> parentheses.
> 

I'm sorry for all of those issues; I wasn't sure whether the direction of
this series was right, so I didn't check it carefully.

>> +                        {
>> +                            scrub_one_page(&pg[i]);
>> +                            pg[i].count_info &= ~(PGC_need_scrub);
> 
> Again.
> 
>> +			}
> 
> Hard tabs (also further down).
> 
>> @@ -859,6 +878,22 @@ static void free_heap_pages(
>>          midsize_alloc_zone_pages = max(
>>              midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
>>  
>> +    if ( need_scrub )
>> +    {
>> +        /* Specail for tainted case */
>> +        if ( tainted )
>> +        {
>> +            for ( i = 0; i < (1 << order); i++ )
>> +                scrub_one_page(&pg[i]);
> 
> Are you trying to provoke another #MC for pages that got offlined?
> Just avoid setting PGC_need_scrub in this case and you're done.
> 

You are right, will be fixed.

>> @@ -1250,6 +1320,11 @@ void __init end_boot_allocator(void)
>>  #endif
>>      }
>>  
>> +    for ( i = 0; i < MAX_NUMNODES; i++ )
>> +        for ( j = 0; j < NR_ZONES; j++ )
>> +            for ( order = 0; order <= MAX_ORDER; order++ )
>> +                INIT_PAGE_LIST_HEAD(&scrub(i, j, order));
>> +
> 
> Why is this not being done alongside the setting up of _heap[]?
> 
>> @@ -1564,22 +1639,21 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>>           * domain has died we assume responsibility for erasure.
>>           */
>>          if ( unlikely(d->is_dying) )
>> -            for ( i = 0; i < (1 << order); i++ )
>> -                scrub_one_page(&pg[i]);
>> -
>> -        free_heap_pages(pg, order);
>> +            free_heap_pages(pg, order, 1);
>> +        else
>> +            free_heap_pages(pg, order, 0);
>>      }
>>      else if ( unlikely(d == dom_cow) )
>>      {
>>          ASSERT(order == 0); 
>>          scrub_one_page(pg);
>> -        free_heap_pages(pg, 0);
>> +        free_heap_pages(pg, 0, 0);
> 
> Is there a particular reason why you don't defer the scrubbing here?
> 

Will be fixed.

Thanks,
-Bob


* Re: [PATCH 1/4] xen: asm-x86: introduce PGC_need_scrub page flag
  2014-06-17 12:46 ` Julien Grall
@ 2014-06-18  0:55   ` Bob Liu
  0 siblings, 0 replies; 14+ messages in thread
From: Bob Liu @ 2014-06-18  0:55 UTC (permalink / raw)
  To: Julien Grall
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3,
	jbeulich, xen-devel

Hi Julien,

On 06/17/2014 08:46 PM, Julien Grall wrote:
> Hi Bob,
> 
> On 06/17/2014 12:49 PM, Bob Liu wrote:
>> To indicate a page is free but need to be scrubbed before in use.
>>
>> Signed-off-by: Bob Liu <bob.liu@oracle.com>
> 
> AFAIU, you plan to use this new flag in the common code. Please add a
> similar one for ARM (in include/asm-arm/mm.h) otherwise you will break
> compilation on this architecture.
> 

Thank you very much for your reminding.

-- 
Regards,
-Bob


* Re: [PATCH 4/4] xen: use idle vcpus to scrub pages
  2014-06-17 13:01   ` Jan Beulich
@ 2014-06-18  1:18     ` Bob Liu
  2014-06-18 10:42       ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Bob Liu @ 2014-06-18  1:18 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 06/17/2014 09:01 PM, Jan Beulich wrote:
>>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
>> In case of heavy lock contention, use a percpu list.
>>  - Delist a batch of pages to a percpu list from "scrub" free page list.
>>  - Scrub pages on this percpu list.
>>  - Add those clean pages to normal "head" free page list, merge with other
> 
> Did you mean "heap" here?
> 

Yes, sorry!

>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -587,12 +587,14 @@ int domain_kill(struct domain *d)
>>          d->tmem_client = NULL;
>>          /* fallthrough */
>>      case DOMDYING_dying:
>> +        enable_idle_scrub = 0;
>>          rc = domain_relinquish_resources(d);
>>          if ( rc != 0 )
>>          {
>>              BUG_ON(rc != -EAGAIN);
>>              break;
>>          }
>> +        enable_idle_scrub = 1;
>>          for_each_vcpu ( d, v )
>>              unmap_vcpu_info(v);
>>          d->is_dying = DOMDYING_dead;
> 
> ???

enable_idle_scrub is a rough way to disable idle vcpu scrubbing, so that
domain_relinquish_resources() can acquire the heap_lock faster and xl
destroy can return quickly.

> 
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -79,6 +79,9 @@ PAGE_LIST_HEAD(page_offlined_list);
>>  /* Broken page list, protected by heap_lock. */
>>  PAGE_LIST_HEAD(page_broken_list);
>>  
>> +volatile bool_t enable_idle_scrub;
>> +DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
> 
> I'm pretty sure I pointed out to you before that variables used only in
> a single file should be static.
> 

Sorry again.

>> @@ -1387,7 +1390,86 @@ void __init scrub_heap_pages(void)
>>      setup_low_mem_virq();
>>  }
>>  
>> +#define SCRUB_BATCH 1024
>> +void scrub_free_pages(void)
>> +{
>> +    struct page_info *pg;
>> +    unsigned int i, j, node_empty = 0, nr_delisted = 0;
>> +    int order;
>> +    unsigned int cpu = smp_processor_id();
>> +    unsigned int node = cpu_to_node(cpu);
>> +    struct page_list_head *temp_list = &this_cpu(scrub_list_cpu);
>> +
>> +    if ( !enable_idle_scrub )
>> +        return;
>> +
>> +    do
>> +    {
>> +        if ( page_list_empty(temp_list) )
>> +        {
>> +            /* Delist a batch of pages from global scrub list */
>> +            spin_lock(&heap_lock);
>> +            for ( j = 0; j < NR_ZONES; j++ )
>> +            {
>> +                for ( order = MAX_ORDER; order >= 0; order-- )
>> +                {
>> +                    if ( (pg = page_list_remove_head(&scrub(node, j, order))) )
>> +                    {
>> +                        for ( i = 0; i < (1 << order); i++)
>> +                            mark_page_offline(&pg[i], 0);
> 
> A page previously having caused #MC now suddenly became healthy
> again?
> 

Will be fixed.

>> +
>> +                        page_list_add_tail(pg, temp_list);
>> +                        nr_delisted += (1 << order);
>> +                        if ( nr_delisted > SCRUB_BATCH )
>> +                        {
>> +                            nr_delisted = 0;
>> +                            spin_unlock(&heap_lock);
>> +                            goto start_scrub;
>> +                        }
>> +                    }
>> +                }
>> +            }
>> +
>> +            node_empty = 1;
>> +            spin_unlock(&heap_lock);
>> +        }
>> +        else
>> +        {
>> +start_scrub:
> 
> Labels need to be indented by at least one space.
> 
>> +            /* Scrub percpu list */
>> +            while ( !page_list_empty(temp_list) )
>> +            {
>> +                pg = page_list_remove_head(temp_list);
>> +                ASSERT(pg);
>> +                order = PFN_ORDER(pg);
>> +                for ( i = 0; i < (1 << order); i++ )
>> +                {
> 
> Order here can again be almost arbitrarily high. You aren't allowed
> to scrub e.g. 4Tb in one go.
> 

I can add a softirq_pending(cpu) check after each scrub_one_page.
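
Roughly like this (a sketch; requeueing a partially scrubbed chunk on the
percpu list and skipping the already-clean pages on the next pass is an
assumption about how the preemption would be handled):

    for ( i = 0; i < (1 << order); i++ )
    {
        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
        {
            scrub_one_page(&pg[i]);
            pg[i].count_info &= ~PGC_need_scrub;
        }
        if ( softirq_pending(cpu) && i + 1 < (1 << order) )
        {
            page_list_add(pg, temp_list); /* finish this chunk later */
            return;
        }
    }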

>> +                    ASSERT( test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
>> +                    scrub_one_page(&pg[i]);
>> +                    pg[i].count_info &= ~(PGC_need_scrub);
>> +                }
>>  
>> +                /* Add pages to free heap list */
>> +                spin_lock(&heap_lock);
> 
> Between the dropping of the lock above and the re-acquiring of it
> here the pages "stolen" can lead to allocation failures despite there
> being available memory. This needs to be avoided if at all possible.
> 

#define SCRUB_BATCH 1024 is used to limit the number of pages on the percpu
list.
But yes, in the worst case (order = 20) 4Tb will be invisible. I don't have
a good solution right now.

>> +                for ( i = 0; i < (1 << order); i++ )
>> +                {
>> +                    ASSERT ( !test_bit(_PGC_need_scrub, &(pg[i].count_info)) );
>> +                    pg[i].count_info |= PGC_state_free;
>> +                }
>> +                ASSERT (node == phys_to_nid(page_to_maddr(pg)));
>> +                merge_free_trunks(pg, order, 0);
>> +                spin_unlock(&heap_lock);
>> +
>> +                if ( softirq_pending(cpu) )
>> +                    return;
>> +            }
>> +        }
>> +
>> +        /* Scrub list of this node is empty */
>> +        if ( node_empty )
>> +            return;
>> +    } while ( !softirq_pending(cpu) );
>> +}
> 
> And all the logic needs to be made NUMA-aware, i.e. a CPU should
> prefer to scrub local memory, and only resort to scrubbing distant
> memory if no CPU on the correct node is idle. Plus (as we have seen
> with Konrad's boot time scrubbing changes) you should avoid having
> two hyperthreads within the same core doing scrubbing at the same
> time.
> 

Okay, will be taken into account.

> In the end I'm getting the impression that this all wasn't properly
> thought through yet. Convoys on the lock inside the idle processing
> should e.g. also be avoided (or reduced to a minimum) - there's no
> point preventing the entering of C states if all you're otherwise going
> to do is spin on a lock.
> 

I already use a percpu list to reduce spinlock contention. node_empty can be
extended to avoid the case you mentioned.
Thank you very much for all your review comments.

-Bob


* Re: [PATCH 4/4] xen: use idle vcpus to scrub pages
  2014-06-18  1:18     ` Bob Liu
@ 2014-06-18 10:42       ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2014-06-18 10:42 UTC (permalink / raw)
  To: bob.liu
  Cc: lliubbo, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> Bob Liu <bob.liu@oracle.com> 06/18/14 3:18 AM >>>
>On 06/17/2014 09:01 PM, Jan Beulich wrote:
>>>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
>>> --- a/xen/common/domain.c
>>> +++ b/xen/common/domain.c
>>> @@ -587,12 +587,14 @@ int domain_kill(struct domain *d)
>>>          d->tmem_client = NULL;
>>>          /* fallthrough */
>>>      case DOMDYING_dying:
>>> +        enable_idle_scrub = 0;
>>>          rc = domain_relinquish_resources(d);
>>>          if ( rc != 0 )
>>>          {
>>>              BUG_ON(rc != -EAGAIN);
>>>              break;
>>>          }
>>> +        enable_idle_scrub = 1;
>>>          for_each_vcpu ( d, v )
>>>              unmap_vcpu_info(v);
>>>          d->is_dying = DOMDYING_dead;
>> 
>> ???
>
>enable_idle_scrub is a rough way to disable idle vcpu scrubbing, so that
>domain_relinquish_resources() can get "&heap_lock" faster and make xl
>destroy return quickly.

If such a hack is really necessary, it requires a decent code comment. But I
don't think it's needed, as long as you keep down the lock holding time on the
scrubber side.

Jan


* Re: [PATCH 2/4] xen: introduce an "scrub" free page list
  2014-06-18  0:54     ` Bob Liu
@ 2014-06-18 13:18       ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-06-18 13:18 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3,
	Jan Beulich, xen-devel

On Wed, Jun 18, 2014 at 08:54:00AM +0800, Bob Liu wrote:
> 
> On 06/17/2014 08:36 PM, Jan Beulich wrote:
> >>>> On 17.06.14 at 13:49, <lliubbo@gmail.com> wrote:
> >> Because of page scrubbing, it's very slow to destroy a domain with large 
> >> memory.
> >> It takes around 10 minutes when destroy a guest of nearly 1 TB of memory.
> >>
> >> This patch introduced a "scrub" free page list, pages on this list need to be
> >> scrubbed before use. During domain destory just mark pages "PGC_need_scrub"
> >> and then add them to this list, so that xl can return quickly.
> >>
> >> In alloc_domheap_pages():
> >>   - If there are pages on the normal "heap" freelist, allocate them.
> >>   - Try to allocate from the "scrub" free list with the same order and do
> >>     scrubbing synchronously.
> >>
> >> In order to not fail high order allocate request, merge page trunks for
> >> PGC_need_scrub pages.
> >> Note: PCG_need_scrub pages and normal pages are not mergeable
> > 
> > The whole series needs sync-ing logically both with Konrad's recent
> > boot time scrubbing changes (this change here really makes his
> > change unnecessary - the affected pages could quite well be simply
> > marked as needing scrubbing, getting dealt with by the logic added
> > here) and with XSA-100's change.
> > 
> 
> I agree. Konrad?

The patch is already in staging. See 7430a86de0c9bd126b441570e459f6e06413cbf6
and 4bd78937ec324bcef4e29ef951e0ff9815770de1.

Thanks!

