* [PATCH v2 1/3] xen: delay page scrubbing to allocation path
@ 2014-06-30 13:39 Bob Liu
  2014-06-30 13:39 ` [PATCH v2 2/3] xen: introduce function merge_free_trunks Bob Liu
                   ` (2 more replies)
  0 siblings, 3 replies; 25+ messages in thread
From: Bob Liu @ 2014-06-30 13:39 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, JBeulich

Because of page scrubbing, it's very slow to destroy a domain with a large
amount of memory. It takes around 10 minutes to destroy a guest with nearly
1TB of memory.

This patch introduces a "PGC_need_scrub" flag; pages with this flag set need
to be scrubbed before use. During domain destruction, just mark pages as
"PGC_need_scrub" and then add them to the free list, so that xl can return
quickly.

Note: PGC_need_scrub pages and normal pages are not mergeable

v2:
 * Fix issue: avoid scrubbing a whole 4TB chunk when it is found for a
   request of a single page.
 * Replace more scrub_one_page() call sites by setting "need_scrub"
 * No longer use an extra _scrub[] array

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/common/page_alloc.c  |   63 ++++++++++++++++++++++++++++++++++------------
 xen/include/asm-arm/mm.h |    5 +++-
 xen/include/asm-x86/mm.h |    5 +++-
 3 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 58677d0..c184f86 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -711,6 +711,12 @@ static struct page_info *alloc_heap_pages(
 
     for ( i = 0; i < (1 << order); i++ )
     {
+        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
+        {
+            scrub_one_page(&pg[i]);
+            pg[i].count_info &= ~PGC_need_scrub;
+        }
+
         /* Reference count must continuously be zero for free pages. */
         BUG_ON(pg[i].count_info != PGC_state_free);
         pg[i].count_info = PGC_state_inuse;
@@ -827,7 +833,7 @@ static int reserve_offlined_page(struct page_info *head)
 
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order)
+    struct page_info *pg, unsigned int order, bool_t need_scrub)
 {
     unsigned long mask, mfn = page_to_mfn(pg);
     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
@@ -876,6 +882,15 @@ static void free_heap_pages(
         midsize_alloc_zone_pages = max(
             midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
 
+    if ( need_scrub )
+    {
+        if ( !tainted )
+        {
+            for ( i = 0; i < (1 << order); i++ )
+                pg[i].count_info |= PGC_need_scrub;
+        }
+    }
+
     /* Merge chunks as far as possible. */
     while ( order < MAX_ORDER )
     {
@@ -889,6 +904,17 @@ static void free_heap_pages(
                  (PFN_ORDER(pg-mask) != order) ||
                  (phys_to_nid(page_to_maddr(pg-mask)) != node) )
                 break;
+            /* If we need scrub, only merge with PGC_need_scrub pages */
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
+                    break;
+            }
+            else
+            {
+                if ( test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
+                    break;
+            }
             pg -= mask;
             page_list_del(pg, &heap(node, zone, order));
         }
@@ -900,6 +926,16 @@ static void free_heap_pages(
                  (PFN_ORDER(pg+mask) != order) ||
                  (phys_to_nid(page_to_maddr(pg+mask)) != node) )
                 break;
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
+                    break;
+            }
+            else
+            {
+                if ( test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
+                    break;
+            }
             page_list_del(pg + mask, &heap(node, zone, order));
         }
 
@@ -1132,7 +1168,7 @@ unsigned int online_page(unsigned long mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, 0);
 
     return ret;
 }
@@ -1201,7 +1237,7 @@ static void init_heap_pages(
             nr_pages -= n;
         }
 
-        free_heap_pages(pg+i, 0);
+        free_heap_pages(pg+i, 0, 0);
     }
 }
 
@@ -1535,7 +1571,7 @@ void free_xenheap_pages(void *v, unsigned int order)
 
     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
 
-    free_heap_pages(virt_to_page(v), order);
+    free_heap_pages(virt_to_page(v), order, 1);
 }
 
 #else
@@ -1588,11 +1624,10 @@ void free_xenheap_pages(void *v, unsigned int order)
 
     for ( i = 0; i < (1u << order); i++ )
     {
-        scrub_one_page(&pg[i]);
         pg[i].count_info &= ~PGC_xen_heap;
     }
 
-    free_heap_pages(pg, order);
+    free_heap_pages(pg, order, 1);
 }
 
 #endif
@@ -1696,7 +1731,7 @@ struct page_info *alloc_domheap_pages(
 
     if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
     {
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, 0);
         return NULL;
     }
     
@@ -1745,24 +1780,20 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
          * domain has died we assume responsibility for erasure.
          */
         if ( unlikely(d->is_dying) )
-            for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
-
-        free_heap_pages(pg, order);
+            free_heap_pages(pg, order, 1);
+        else
+            free_heap_pages(pg, order, 0);
     }
     else if ( unlikely(d == dom_cow) )
     {
         ASSERT(order == 0); 
-        scrub_one_page(pg);
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, 1);
         drop_dom_ref = 0;
     }
     else
     {
         /* Freeing anonymous domain-heap pages. */
-        for ( i = 0; i < (1 << order); i++ )
-            scrub_one_page(&pg[i]);
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, 1);
         drop_dom_ref = 0;
     }
 
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 2552d34..e8913a8 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -103,9 +103,12 @@ struct page_info
 #define PGC_state_offlined PG_mask(2, 9)
 #define PGC_state_free    PG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+/* Page need to be scrubbed */
+#define _PGC_need_scrub   PG_shift(10)
+#define PGC_need_scrub    PG_mask(1, 10)
 
 /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(9)
+#define PGC_count_width   PG_shift(10)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 extern unsigned long xenheap_mfn_start, xenheap_mfn_end;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index d253117..35746ab 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -223,9 +223,12 @@ struct page_info
 #define PGC_state_offlined PG_mask(2, 9)
 #define PGC_state_free    PG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
+/* Page need to be scrubbed */
+#define _PGC_need_scrub   PG_shift(10)
+#define PGC_need_scrub    PG_mask(1, 10)
 
  /* Count of references to this frame. */
-#define PGC_count_width   PG_shift(9)
+#define PGC_count_width   PG_shift(10)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
 struct spage_info
-- 
1.7.10.4


* [PATCH v2 2/3] xen: introduce function merge_free_trunks
  2014-06-30 13:39 [PATCH v2 1/3] xen: delay page scrubbing to allocation path Bob Liu
@ 2014-06-30 13:39 ` Bob Liu
  2014-06-30 15:58   ` Jan Beulich
  2014-06-30 13:39 ` [PATCH v2 3/3] xen: use idle vcpus to scrub pages Bob Liu
  2014-06-30 15:56 ` [PATCH v2 1/3] xen: delay page scrubbing to allocation path Jan Beulich
  2 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-06-30 13:39 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, JBeulich

Separate the chunk-merging code out of free_heap_pages(); it will be used by
idle vcpu scrubbing.

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/common/page_alloc.c |  118 +++++++++++++++++++++++++----------------------
 1 file changed, 64 insertions(+), 54 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index c184f86..ab293c8 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -831,11 +831,73 @@ static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
+static void merge_free_trunks(struct page_info *pg, unsigned int order,
+    unsigned int node, unsigned int zone, bool_t need_scrub)
+{
+    unsigned long mask;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    /* Merge chunks as far as possible. */
+    while ( order < MAX_ORDER )
+    {
+        mask = 1UL << order;
+
+        if ( (page_to_mfn(pg) & mask) )
+        {
+            /* Merge with predecessor block? */
+            if ( !mfn_valid(page_to_mfn(pg-mask)) ||
+                 !page_state_is(pg-mask, free) ||
+                 (PFN_ORDER(pg-mask) != order) ||
+                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
+                break;
+            /* If we need scrub, only merge with PGC_need_scrub pages */
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
+                    break;
+            }
+            else
+            {
+                if ( test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
+                    break;
+            }
+            pg -= mask;
+            page_list_del(pg, &heap(node, zone, order));
+        }
+        else
+        {
+            /* Merge with successor block? */
+            if ( !mfn_valid(page_to_mfn(pg+mask)) ||
+                 !page_state_is(pg+mask, free) ||
+                 (PFN_ORDER(pg+mask) != order) ||
+                 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
+                break;
+            if ( need_scrub )
+            {
+                if ( !test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
+                    break;
+            }
+            else
+            {
+                if ( test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
+                    break;
+            }
+            page_list_del(pg + mask, &heap(node, zone, order));
+        }
+
+        order++;
+    }
+
+    PFN_ORDER(pg) = order;
+    page_list_add_tail(pg, &heap(node, zone, order));
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
     struct page_info *pg, unsigned int order, bool_t need_scrub)
 {
-    unsigned long mask, mfn = page_to_mfn(pg);
+    unsigned long mfn = page_to_mfn(pg);
     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
     unsigned int zone = page_to_zone(pg);
 
@@ -891,59 +953,7 @@ static void free_heap_pages(
         }
     }
 
-    /* Merge chunks as far as possible. */
-    while ( order < MAX_ORDER )
-    {
-        mask = 1UL << order;
-
-        if ( (page_to_mfn(pg) & mask) )
-        {
-            /* Merge with predecessor block? */
-            if ( !mfn_valid(page_to_mfn(pg-mask)) ||
-                 !page_state_is(pg-mask, free) ||
-                 (PFN_ORDER(pg-mask) != order) ||
-                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
-                break;
-            /* If we need scrub, only merge with PGC_need_scrub pages */
-            if ( need_scrub )
-            {
-                if ( !test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
-                    break;
-            }
-            else
-            {
-                if ( test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
-                    break;
-            }
-            pg -= mask;
-            page_list_del(pg, &heap(node, zone, order));
-        }
-        else
-        {
-            /* Merge with successor block? */
-            if ( !mfn_valid(page_to_mfn(pg+mask)) ||
-                 !page_state_is(pg+mask, free) ||
-                 (PFN_ORDER(pg+mask) != order) ||
-                 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
-                break;
-            if ( need_scrub )
-            {
-                if ( !test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
-                    break;
-            }
-            else
-            {
-                if ( test_bit(_PGC_need_scrub, &(pg+mask)->count_info) )
-                    break;
-            }
-            page_list_del(pg + mask, &heap(node, zone, order));
-        }
-
-        order++;
-    }
-
-    PFN_ORDER(pg) = order;
-    page_list_add_tail(pg, &heap(node, zone, order));
+    merge_free_trunks(pg, order, node, zone, need_scrub);
 
     if ( tainted )
         reserve_offlined_page(pg);
-- 
1.7.10.4


* [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-06-30 13:39 [PATCH v2 1/3] xen: delay page scrubbing to allocation path Bob Liu
  2014-06-30 13:39 ` [PATCH v2 2/3] xen: introduce function merge_free_trunks Bob Liu
@ 2014-06-30 13:39 ` Bob Liu
  2014-07-01  9:12   ` Jan Beulich
  2014-06-30 15:56 ` [PATCH v2 1/3] xen: delay page scrubbing to allocation path Jan Beulich
  2 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-06-30 13:39 UTC (permalink / raw)
  To: xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, JBeulich

To reduce heavy lock contention, use two percpu lists:
 - Delist a batch of pages from the _heap[] free page list to a percpu list.
 - Scrub the pages on this percpu list and add them to another percpu free list.
 - Free those clean pages back to _heap[], merging with other chunks if needed.

v2:
* Avoid having two hyperthreads within the same core both doing scrubbing
* Limit delisting to (1<<SCRUB_BATCH_ORDER) pages per percpu list in one go
* Don't spin on the heap lock when there is nothing to scrub
* Partially NUMA-aware

Signed-off-by: Bob Liu <bob.liu@oracle.com>
---
 xen/arch/arm/domain.c   |    1 +
 xen/arch/x86/domain.c   |    1 +
 xen/common/page_alloc.c |  130 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/mm.h    |    1 +
 4 files changed, 133 insertions(+)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 04d0cd0..b6bc3ac 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -44,6 +44,7 @@ void idle_loop(void)
         if ( cpu_is_offline(smp_processor_id()) )
             stop_cpu();
 
+        scrub_free_pages();
         local_irq_disable();
         if ( cpu_is_haltable(smp_processor_id()) )
         {
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..e8d4fe7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -116,6 +116,7 @@ static void idle_loop(void)
     {
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
+        scrub_free_pages();
         (*pm_idle)();
         do_tasklet();
         do_softirq();
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ab293c8..6ab1d1d 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -86,6 +86,12 @@ PAGE_LIST_HEAD(page_offlined_list);
 /* Broken page list, protected by heap_lock. */
 PAGE_LIST_HEAD(page_broken_list);
 
+/* A rough flag to indicate whether a node have need_scrub pages */
+static bool_t node_need_scrub[MAX_NUMNODES];
+static DEFINE_PER_CPU(bool_t, is_scrubbing);
+static DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
+static DEFINE_PER_CPU(struct page_list_head, free_list_cpu);
+
 /*************************
  * BOOT-TIME ALLOCATOR
  */
@@ -948,6 +954,7 @@ static void free_heap_pages(
     {
         if ( !tainted )
         {
+            node_need_scrub[node] = 1;
             for ( i = 0; i < (1 << order); i++ )
                 pg[i].count_info |= PGC_need_scrub;
         }
@@ -1525,7 +1532,130 @@ void __init scrub_heap_pages(void)
     setup_low_mem_virq();
 }
 
+#define SCRUB_BATCH_ORDER 12
+static void __scrub_free_pages(unsigned int node, unsigned int cpu)
+{
+    struct page_info *pg, *tmp;
+    unsigned int i;
+    int order;
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+    struct page_list_head *local_free_list = &this_cpu(free_list_cpu);
+
+    /* Scrub percpu list */
+    while ( !page_list_empty(local_scrub_list) )
+    {
+        pg = page_list_remove_head(local_scrub_list);
+        order = PFN_ORDER(pg);
+        ASSERT( pg && order <= SCRUB_BATCH_ORDER );
+        for ( i = 0; i < (1 << order); i++ )
+        {
+            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+            scrub_one_page(&pg[i]);
+        }
+        page_list_add_tail(pg, local_free_list);
+        if ( softirq_pending(cpu) )
+		return;
+    }
+
+    /* free percpu free list */
+    if ( !page_list_empty(local_free_list) )
+    {
+        spin_lock(&heap_lock);
+        page_list_for_each_safe( pg, tmp, local_free_list )
+        {
+            order = PFN_ORDER(pg);
+            page_list_del(pg, local_free_list);
+            for ( i = 0; i < (1 << order); i++ )
+	    {
+                pg[i].count_info |= PGC_state_free;
+                pg[i].count_info &= ~PGC_need_scrub;
+            }
+            merge_free_trunks(pg, order, node, page_to_zone(pg), 0);
+        }
+        spin_unlock(&heap_lock);
+    }
+}
+
+void scrub_free_pages(void)
+{
+    int order;
+    struct page_info *pg, *tmp;
+    unsigned int i, zone, nr_delisted = 0;
+    unsigned int cpu = smp_processor_id();
+    unsigned int node = cpu_to_node(cpu);
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+
+    /* Return if our sibling already started scrubbing */
+    for_each_cpu( i, per_cpu(cpu_sibling_mask,cpu) )
+        if ( per_cpu(is_scrubbing, i) )
+            return;
+    this_cpu(is_scrubbing) = 1;
+
+    while ( !softirq_pending(cpu) )
+    {
+        if ( !node_need_scrub[node] )
+        {
+            /* Free local per cpu list before we exit */
+            __scrub_free_pages(node, cpu);
+            goto out;
+        }
+
+        /* Delist a batch of pages from global scrub list */
+        if ( page_list_empty(local_scrub_list) )
+        {
+            spin_lock(&heap_lock);
+            for ( zone = 0; zone < NR_ZONES; zone++ )
+            {
+                for ( order = MAX_ORDER; order >= 0; order-- )
+                {
+                    page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
+                    {
+                        if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )
+                            continue;
+
+                        page_list_del( pg, &heap(node, zone, order) );
+                        if ( order > SCRUB_BATCH_ORDER)
+                        {
+                            /* putback extra pages */
+                            i = order;
+                            while ( i != SCRUB_BATCH_ORDER )
+                            {
+                                PFN_ORDER(pg) = --i;
+                                page_list_add_tail(pg, &heap(node, zone, i));
+                                pg += 1 << i;
+                            }
+                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
+                        }
+
+                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )
+                        {
+                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
+                            mark_page_offline(&pg[i], 0);
+                        }
+                        page_list_add_tail(pg, local_scrub_list);
+                        nr_delisted += ( 1 << PFN_ORDER(pg) );
+                        if ( nr_delisted >= (1 << SCRUB_BATCH_ORDER) )
+                        {
+                            nr_delisted = 0;
+                            spin_unlock(&heap_lock);
+                            goto start_scrub;
+                        }
+                    }
+                }
+            }
+
+            node_need_scrub[node] = 0;
+            spin_unlock(&heap_lock);
+        }
 
+ start_scrub:
+        __scrub_free_pages(node, cpu);
+    }
+
+ out:
+    this_cpu(is_scrubbing) = 0;
+}
 
 /*************************
  * XEN-HEAP SUB-ALLOCATOR
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b183189..1fa8c3d 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -78,6 +78,7 @@ int query_page_offline(unsigned long mfn, uint32_t *status);
 unsigned long total_free_pages(void);
 
 void scrub_heap_pages(void);
+void scrub_free_pages(void);
 
 int assign_pages(
     struct domain *d,
-- 
1.7.10.4


* Re: [PATCH v2 1/3] xen: delay page scrubbing to allocation path
  2014-06-30 13:39 [PATCH v2 1/3] xen: delay page scrubbing to allocation path Bob Liu
  2014-06-30 13:39 ` [PATCH v2 2/3] xen: introduce function merge_free_trunks Bob Liu
  2014-06-30 13:39 ` [PATCH v2 3/3] xen: use idle vcpus to scrub pages Bob Liu
@ 2014-06-30 15:56 ` Jan Beulich
  2014-07-01  8:12   ` Bob Liu
  2 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-06-30 15:56 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -711,6 +711,12 @@ static struct page_info *alloc_heap_pages(
>  
>      for ( i = 0; i < (1 << order); i++ )
>      {
> +        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
> +        {
> +            scrub_one_page(&pg[i]);
> +            pg[i].count_info &= ~PGC_need_scrub;
> +        }
> +

heap_lock is still being held here - scrubbing should be done after it
was dropped (or else you re-introduce the same latency problem to
other paths now needing to wait for the scrubbing to complete).
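
(Illustratively only - a minimal sketch, not meant as the exact code; the
scrub loop would simply move to after the lock is dropped:)

    spin_unlock(&heap_lock);

    for ( i = 0; i < (1 << order); i++ )
    {
        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
        {
            scrub_one_page(&pg[i]);
            pg[i].count_info &= ~PGC_need_scrub;
        }
    }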

> @@ -876,6 +882,15 @@ static void free_heap_pages(
>          midsize_alloc_zone_pages = max(
>              midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
>  
> +    if ( need_scrub )
> +    {
> +        if ( !tainted )
> +        {
> +            for ( i = 0; i < (1 << order); i++ )
> +                pg[i].count_info |= PGC_need_scrub;
> +        }
> +    }

Two if()s like these should be folded into one.
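
(I.e., roughly:)

    if ( need_scrub && !tainted )
    {
        for ( i = 0; i < (1 << order); i++ )
            pg[i].count_info |= PGC_need_scrub;
    }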

> @@ -889,6 +904,17 @@ static void free_heap_pages(
>                   (PFN_ORDER(pg-mask) != order) ||
>                   (phys_to_nid(page_to_maddr(pg-mask)) != node) )
>                  break;
> +            /* If we need scrub, only merge with PGC_need_scrub pages */
> +            if ( need_scrub )
> +            {
> +                if ( !test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
> +                    break;
> +            }
> +            else
> +            {
> +                if ( test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
> +                    break;
> +            }

You're setting PGC_need_scrub on each 4k page anyway (which
is debatable), hence there's no need to look at the passed in
need_scrub flag here: Just check whether both chunks have the
flag set the same. Same below.
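
(Illustratively, something along these lines - only merge when both chunks
agree on the flag:)

    if ( !!test_bit(_PGC_need_scrub, &pg->count_info) !=
         !!test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
        break;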

> @@ -1535,7 +1571,7 @@ void free_xenheap_pages(void *v, unsigned int order)
>  
>      memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
>  
> -    free_heap_pages(virt_to_page(v), order);
> +    free_heap_pages(virt_to_page(v), order, 1);

Why?

> @@ -1588,11 +1624,10 @@ void free_xenheap_pages(void *v, unsigned int order)
>  
>      for ( i = 0; i < (1u << order); i++ )
>      {
> -        scrub_one_page(&pg[i]);
>          pg[i].count_info &= ~PGC_xen_heap;
>      }
>  
> -    free_heap_pages(pg, order);
> +    free_heap_pages(pg, order, 1);

The flags needs to be 1 here, but I don't see why you also pass 1 in
the other free_xenheap_pages() incarnation above.

> @@ -1745,24 +1780,20 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>           * domain has died we assume responsibility for erasure.
>           */
>          if ( unlikely(d->is_dying) )
> -            for ( i = 0; i < (1 << order); i++ )
> -                scrub_one_page(&pg[i]);
> -
> -        free_heap_pages(pg, order);
> +            free_heap_pages(pg, order, 1);
> +        else
> +            free_heap_pages(pg, order, 0);
>      }
>      else if ( unlikely(d == dom_cow) )
>      {
>          ASSERT(order == 0); 
> -        scrub_one_page(pg);
> -        free_heap_pages(pg, 0);
> +        free_heap_pages(pg, 0, 1);
>          drop_dom_ref = 0;
>      }
>      else
>      {
>          /* Freeing anonymous domain-heap pages. */
> -        for ( i = 0; i < (1 << order); i++ )
> -            scrub_one_page(&pg[i]);
> -        free_heap_pages(pg, order);
> +        free_heap_pages(pg, order, 1);
>          drop_dom_ref = 0;
>      }
>  

This hunk is patching no longer existing code (see commit daa4b800
"slightly consolidate code in free_domheap_pages()").

Jan


* Re: [PATCH v2 2/3] xen: introduce function merge_free_trunks
  2014-06-30 13:39 ` [PATCH v2 2/3] xen: introduce function merge_free_trunks Bob Liu
@ 2014-06-30 15:58   ` Jan Beulich
  2014-07-01  8:14     ` Bob Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-06-30 15:58 UTC (permalink / raw)
  To: Bob Liu, xen-devel; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3

>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -831,11 +831,73 @@ static int reserve_offlined_page(struct page_info *head)
>      return count;
>  }
>  
> +static void merge_free_trunks(struct page_info *pg, unsigned int order,

Did you perhaps mean merge_free_chunks(), ...

> @@ -891,59 +953,7 @@ static void free_heap_pages(
>          }
>      }
>  
> -    /* Merge chunks as far as possible. */

... taking into consideration this comment the patch removes?

Jan


* Re: [PATCH v2 1/3] xen: delay page scrubbing to allocation path
  2014-06-30 15:56 ` [PATCH v2 1/3] xen: delay page scrubbing to allocation path Jan Beulich
@ 2014-07-01  8:12   ` Bob Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Bob Liu @ 2014-07-01  8:12 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 06/30/2014 11:56 PM, Jan Beulich wrote:
>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -711,6 +711,12 @@ static struct page_info *alloc_heap_pages(
>>  
>>      for ( i = 0; i < (1 << order); i++ )
>>      {
>> +        if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
>> +        {
>> +            scrub_one_page(&pg[i]);
>> +            pg[i].count_info &= ~PGC_need_scrub;
>> +        }
>> +
> 
> heap_lock is still being held here - scrubbing should be done after it
> was dropped (or else you re-introduce the same latency problem to
> other paths now needing to wait for the scrubbing to complete).
> 

I see; right now it only avoids the case of e.g. scrubbing a whole 4TB chunk
found for a request of a single page.

Anyway, I will move the scrubbing out of the spinlock in the next version.

>> @@ -876,6 +882,15 @@ static void free_heap_pages(
>>          midsize_alloc_zone_pages = max(
>>              midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
>>  
>> +    if ( need_scrub )
>> +    {
>> +        if ( !tainted )
>> +        {
>> +            for ( i = 0; i < (1 << order); i++ )
>> +                pg[i].count_info |= PGC_need_scrub;
>> +        }
>> +    }
> 
> Two if()s like these should be folded into one.
> 

Will be fixed.

>> @@ -889,6 +904,17 @@ static void free_heap_pages(
>>                   (PFN_ORDER(pg-mask) != order) ||
>>                   (phys_to_nid(page_to_maddr(pg-mask)) != node) )
>>                  break;
>> +            /* If we need scrub, only merge with PGC_need_scrub pages */
>> +            if ( need_scrub )
>> +            {
>> +                if ( !test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
>> +                    break;
>> +            }
>> +            else
>> +            {
>> +                if ( test_bit(_PGC_need_scrub, &(pg-mask)->count_info) )
>> +                    break;
>> +            }
> 
> You're setting PGC_need_scrub on each 4k page anyway (which
> is debatable), hence there's no need to look at the passed in
> need_scrub flag here: Just check whether both chunks have the
> flag set the same. Same below.
> 

Right, thanks for your suggestion.

>> @@ -1535,7 +1571,7 @@ void free_xenheap_pages(void *v, unsigned int order)
>>  
>>      memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
>>  
>> -    free_heap_pages(virt_to_page(v), order);
>> +    free_heap_pages(virt_to_page(v), order, 1);
> 
> Why?
> 

Sorry, that's a mistake and it will be fixed.

>> @@ -1588,11 +1624,10 @@ void free_xenheap_pages(void *v, unsigned int order)
>>  
>>      for ( i = 0; i < (1u << order); i++ )
>>      {
>> -        scrub_one_page(&pg[i]);
>>          pg[i].count_info &= ~PGC_xen_heap;
>>      }
>>  
>> -    free_heap_pages(pg, order);
>> +    free_heap_pages(pg, order, 1);
> 
> The flags needs to be 1 here, but I don't see why you also pass 1 in
> the other free_xenheap_pages() incarnation above.
> 
>> @@ -1745,24 +1780,20 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
>>           * domain has died we assume responsibility for erasure.
>>           */
>>          if ( unlikely(d->is_dying) )
>> -            for ( i = 0; i < (1 << order); i++ )
>> -                scrub_one_page(&pg[i]);
>> -
>> -        free_heap_pages(pg, order);
>> +            free_heap_pages(pg, order, 1);
>> +        else
>> +            free_heap_pages(pg, order, 0);
>>      }
>>      else if ( unlikely(d == dom_cow) )
>>      {
>>          ASSERT(order == 0); 
>> -        scrub_one_page(pg);
>> -        free_heap_pages(pg, 0);
>> +        free_heap_pages(pg, 0, 1);
>>          drop_dom_ref = 0;
>>      }
>>      else
>>      {
>>          /* Freeing anonymous domain-heap pages. */
>> -        for ( i = 0; i < (1 << order); i++ )
>> -            scrub_one_page(&pg[i]);
>> -        free_heap_pages(pg, order);
>> +        free_heap_pages(pg, order, 1);
>>          drop_dom_ref = 0;
>>      }
>>  
> 
> This hunk is patching no longer existing code (see commit daa4b800
> "slightly consolidate code in free_domheap_pages()").
> 

I'll rebase this patch onto a newer version that includes that commit.
Thanks again.

-- 
Regards,
-Bob


* Re: [PATCH v2 2/3] xen: introduce function merge_free_trunks
  2014-06-30 15:58   ` Jan Beulich
@ 2014-07-01  8:14     ` Bob Liu
  2014-07-01  8:27       ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-01  8:14 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 06/30/2014 11:58 PM, Jan Beulich wrote:
>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -831,11 +831,73 @@ static int reserve_offlined_page(struct page_info *head)
>>      return count;
>>  }
>>  
>> +static void merge_free_trunks(struct page_info *pg, unsigned int order,
> 
> Did you perhaps mean merge_free_chunks(), ...

Oh, yes.

> 
>> @@ -891,59 +953,7 @@ static void free_heap_pages(
>>          }
>>      }
>>  
>> -    /* Merge chunks as far as possible. */
> 
> ... taking into consideration this comment the patch removes?
> 

I will fix these two issues.

Would you please also take a look at [PATCH v2 3/3]?

-- 
Regards,
-Bob


* Re: [PATCH v2 2/3] xen: introduce function merge_free_trunks
  2014-07-01  8:14     ` Bob Liu
@ 2014-07-01  8:27       ` Jan Beulich
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Beulich @ 2014-07-01  8:27 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 01.07.14 at 10:14, <bob.liu@oracle.com> wrote:

> On 06/30/2014 11:58 PM, Jan Beulich wrote:
>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>> --- a/xen/common/page_alloc.c
>>> +++ b/xen/common/page_alloc.c
>>> @@ -831,11 +831,73 @@ static int reserve_offlined_page(struct page_info 
> *head)
>>>      return count;
>>>  }
>>>  
>>> +static void merge_free_trunks(struct page_info *pg, unsigned int order,
>> 
>> Did you perhaps mean merge_free_chunks(), ...
> 
> Oh, yes.
> 
>> 
>>> @@ -891,59 +953,7 @@ static void free_heap_pages(
>>>          }
>>>      }
>>>  
>>> -    /* Merge chunks as far as possible. */
>> 
>> ... taking into consideration this comment the patch removes?
>> 
> 
> I will fix these two issues.

The removal of the comment is not an issue - I was just using it
to help point out what the intended naming here is.

> Would you please also take a look at [PATCH v2 3/3]?

Of course - all in due order (this isn't the only thing I have to
take care of, and it's not the highest priority either).

Jan


* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-06-30 13:39 ` [PATCH v2 3/3] xen: use idle vcpus to scrub pages Bob Liu
@ 2014-07-01  9:12   ` Jan Beulich
  2014-07-01 12:25     ` Bob Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-07-01  9:12 UTC (permalink / raw)
  To: Bob Liu; +Cc: keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -116,6 +116,7 @@ static void idle_loop(void)
>      {
>          if ( cpu_is_offline(smp_processor_id()) )
>              play_dead();
> +        scrub_free_pages();
>          (*pm_idle)();

So is it really appropriate to call pm_idle() if scrub_free_pages() didn't
return immediately? I.e. I would suppose the function ought to return
a boolean indicator whether any meaningful amount of processing
was done, in which case we may want to skip entering any kind of C
state (even if it would end up being just C1), or doing any of the
processing associated with this possible entering.
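
(As a sketch only, assuming such a return value gets added:)

        if ( cpu_is_offline(smp_processor_id()) )
            play_dead();
        if ( !scrub_free_pages() )
            (*pm_idle)();
        do_tasklet();
        do_softirq();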

> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index ab293c8..6ab1d1d 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -86,6 +86,12 @@ PAGE_LIST_HEAD(page_offlined_list);
>  /* Broken page list, protected by heap_lock. */
>  PAGE_LIST_HEAD(page_broken_list);
>  
> +/* A rough flag to indicate whether a node have need_scrub pages */
> +static bool_t node_need_scrub[MAX_NUMNODES];
> +static DEFINE_PER_CPU(bool_t, is_scrubbing);
> +static DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
> +static DEFINE_PER_CPU(struct page_list_head, free_list_cpu);
> +
>  /*************************
>   * BOOT-TIME ALLOCATOR
>   */
> @@ -948,6 +954,7 @@ static void free_heap_pages(
>      {
>          if ( !tainted )
>          {
> +            node_need_scrub[node] = 1;
>              for ( i = 0; i < (1 << order); i++ )
>                  pg[i].count_info |= PGC_need_scrub;
>          }

Iirc it was more than this single place where you set
PGC_need_scrub, and hence where you'd now need to set the
other flag too.

> @@ -1525,7 +1532,130 @@ void __init scrub_heap_pages(void)
>      setup_low_mem_virq();
>  }
>  
> +#define SCRUB_BATCH_ORDER 12

Please make this adjustable via command line, so that eventual
latency issues can be worked around.
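
(For instance - the parameter name below is only a suggestion:)

    static unsigned int __read_mostly scrub_batch_order = 12;
    integer_param("scrub-batch-order", scrub_batch_order);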

> +static void __scrub_free_pages(unsigned int node, unsigned int cpu)
> +{
> +    struct page_info *pg, *tmp;
> +    unsigned int i;
> +    int order;
> +    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
> +    struct page_list_head *local_free_list = &this_cpu(free_list_cpu);
> +
> +    /* Scrub percpu list */
> +    while ( !page_list_empty(local_scrub_list) )
> +    {
> +        pg = page_list_remove_head(local_scrub_list);
> +        order = PFN_ORDER(pg);
> +        ASSERT( pg && order <= SCRUB_BATCH_ORDER );
> +        for ( i = 0; i < (1 << order); i++ )
> +        {
> +            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
> +            scrub_one_page(&pg[i]);
> +        }
> +        page_list_add_tail(pg, local_free_list);
> +        if ( softirq_pending(cpu) )
> +		return;

Hard tabs. Please, also with further violations below in mind, check
your code before submitting.

> +    }
> +
> +    /* free percpu free list */
> +    if ( !page_list_empty(local_free_list) )
> +    {
> +        spin_lock(&heap_lock);
> +        page_list_for_each_safe( pg, tmp, local_free_list )
> +        {
> +            order = PFN_ORDER(pg);
> +            page_list_del(pg, local_free_list);
> +            for ( i = 0; i < (1 << order); i++ )
> +	    {
> +                pg[i].count_info |= PGC_state_free;
> +                pg[i].count_info &= ~PGC_need_scrub;

This needs to happen earlier - the scrub flag should be cleared right
after scrubbing, and the free flag should imo be set when the page
gets freed. That's for two reasons:
1) Hypervisor allocations don't need scrubbed pages, i.e. they can
allocate memory regardless of the scrub flag's state.
2) You still detain the memory on the local lists from allocation. On a
many-node system, the 16Mb per node can certainly sum up (which
is not to say that I don't view the 16Mb on a single node as already
problematic).
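
(I.e., for instance, clear the flag right where the page gets scrubbed:)

    scrub_one_page(&pg[i]);
    pg[i].count_info &= ~PGC_need_scrub;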

> +            }
> +            merge_free_trunks(pg, order, node, page_to_zone(pg), 0);
> +        }
> +        spin_unlock(&heap_lock);
> +    }
> +}
> +
> +void scrub_free_pages(void)
> +{
> +    int order;
> +    struct page_info *pg, *tmp;
> +    unsigned int i, zone, nr_delisted = 0;
> +    unsigned int cpu = smp_processor_id();
> +    unsigned int node = cpu_to_node(cpu);
> +    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
> +
> +    /* Return if our sibling already started scrubbing */
> +    for_each_cpu( i, per_cpu(cpu_sibling_mask,cpu) )
> +        if ( per_cpu(is_scrubbing, i) )
> +            return;
> +    this_cpu(is_scrubbing) = 1;

If you really want to enforce exclusiveness, you need a single
canonical flag per core (could e.g. be
per_cpu(is_scrubbing, cpumask_first(per_cpu(cpu_sibling_mask, cpu))),
set via test_and_set_bool()).
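
Roughly (sketch only):

    unsigned int first = cpumask_first(per_cpu(cpu_sibling_mask, cpu));

    if ( test_and_set_bool(per_cpu(is_scrubbing, first)) )
        return;
    /* ... do the scrubbing ... */
    per_cpu(is_scrubbing, first) = 0;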

> +
> +    while ( !softirq_pending(cpu) )
> +    {
> +        if ( !node_need_scrub[node] )
> +        {
> +            /* Free local per cpu list before we exit */
> +            __scrub_free_pages(node, cpu);
> +            goto out;
> +        }

This seems unnecessary: Just have the if() below be

        if ( node_need_scrub[node] && page_list_empty(local_scrub_list) )

along with __scrub_free_pages() returning whether it exited because
of softirq_pending(cpu) (which at once eliminates the need for the
check at the loop header above, allowing this to become a nice
do { ... } while ( !__scrub_free_pages() ).
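
I.e. something like (sketch only):

    do {
        if ( node_need_scrub[node] && page_list_empty(local_scrub_list) )
        {
            /* delist a batch from the heap, under heap_lock */
        }
    } while ( !__scrub_free_pages(node, cpu) );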

> +
> +        /* Delist a batch of pages from global scrub list */
> +        if ( page_list_empty(local_scrub_list) )
> +        {
> +            spin_lock(&heap_lock);
> +            for ( zone = 0; zone < NR_ZONES; zone++ )
> +            {
> +                for ( order = MAX_ORDER; order >= 0; order-- )
> +                {
> +                    page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
> +                    {
> +                        if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )

Please avoid the inner parentheses here.

> +                            continue;
> +
> +                        page_list_del( pg, &heap(node, zone, order) );
> +                        if ( order > SCRUB_BATCH_ORDER)

Coding style.

> +                        {
> +                            /* putback extra pages */
> +                            i = order;
> +                            while ( i != SCRUB_BATCH_ORDER )

This would better be a do/while construct - on the first iteration you
already know the condition is true.

> +                            {
> +                                PFN_ORDER(pg) = --i;
> +                                page_list_add_tail(pg, &heap(node, zone, i));
> +                                pg += 1 << i;

I realize there are cases where this is also not being done correctly in
existing code, but please use 1UL here and in all similar cases.

> +                            }
> +                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
> +                        }
> +
> +                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )

Can't you just use "order" here (and a few lines down)?

> +                        {
> +                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
> +                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
> +                            mark_page_offline(&pg[i], 0);

mark_page_offline() ???

Jan

> +                        }
> +                        page_list_add_tail(pg, local_scrub_list);
> +                        nr_delisted += ( 1 << PFN_ORDER(pg) );
> +                        if ( nr_delisted >= (1 << SCRUB_BATCH_ORDER) )
> +                        {
> +                            nr_delisted = 0;
> +                            spin_unlock(&heap_lock);
> +                            goto start_scrub;
> +                        }
> +                    }
> +                }
> +            }
> +
> +            node_need_scrub[node] = 0;
> +            spin_unlock(&heap_lock);
> +        }
>  
> + start_scrub:
> +        __scrub_free_pages(node, cpu);
> +    }
> +
> + out:
> +    this_cpu(is_scrubbing) = 0;
> +}


* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-01  9:12   ` Jan Beulich
@ 2014-07-01 12:25     ` Bob Liu
  2014-07-01 12:59       ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-01 12:25 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -116,6 +116,7 @@ static void idle_loop(void)
>>      {
>>          if ( cpu_is_offline(smp_processor_id()) )
>>              play_dead();
>> +        scrub_free_pages();
>>          (*pm_idle)();
> 
> So is it really appropriate to call pm_idle() if scrub_free_pages() didn't
> return immediately? I.e. I would suppose the function ought to return
> a boolean indicator whether any meaningful amount of processing
> was done, in which case we may want to skip entering any kind of C
> state (even if it would end up being just C1), or doing any of the
> processing associated with this possible entering.
>

Thanks, I'll add a return value for scrub_free_pages() and skip pm_idle
if necessary.

>> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
>> index ab293c8..6ab1d1d 100644
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -86,6 +86,12 @@ PAGE_LIST_HEAD(page_offlined_list);
>>  /* Broken page list, protected by heap_lock. */
>>  PAGE_LIST_HEAD(page_broken_list);
>>  
>> +/* A rough flag to indicate whether a node have need_scrub pages */
>> +static bool_t node_need_scrub[MAX_NUMNODES];
>> +static DEFINE_PER_CPU(bool_t, is_scrubbing);
>> +static DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
>> +static DEFINE_PER_CPU(struct page_list_head, free_list_cpu);
>> +
>>  /*************************
>>   * BOOT-TIME ALLOCATOR
>>   */
>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>      {
>>          if ( !tainted )
>>          {
>> +            node_need_scrub[node] = 1;
>>              for ( i = 0; i < (1 << order); i++ )
>>                  pg[i].count_info |= PGC_need_scrub;
>>          }
> 
> Iirc it was more than this single place where you set
> PGC_need_scrub, and hence where you'd now need to set the
> other flag too.
> 

I'm afraid this is the only place where PGC_need_scrub was set.

>> @@ -1525,7 +1532,130 @@ void __init scrub_heap_pages(void)
>>      setup_low_mem_virq();
>>  }
>>  
>> +#define SCRUB_BATCH_ORDER 12
> 
> Please make this adjustable via command line, so that eventual
> latency issues can be worked around.
> 

Okay.

>> +static void __scrub_free_pages(unsigned int node, unsigned int cpu)
>> +{
>> +    struct page_info *pg, *tmp;
>> +    unsigned int i;
>> +    int order;
>> +    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
>> +    struct page_list_head *local_free_list = &this_cpu(free_list_cpu);
>> +
>> +    /* Scrub percpu list */
>> +    while ( !page_list_empty(local_scrub_list) )
>> +    {
>> +        pg = page_list_remove_head(local_scrub_list);
>> +        order = PFN_ORDER(pg);
>> +        ASSERT( pg && order <= SCRUB_BATCH_ORDER );
>> +        for ( i = 0; i < (1 << order); i++ )
>> +        {
>> +            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
>> +            scrub_one_page(&pg[i]);
>> +        }
>> +        page_list_add_tail(pg, local_free_list);
>> +        if ( softirq_pending(cpu) )
>> +		return;
> 
> Hard tabs. Please, also with further violations below in mind, check
> your code before submitting.
>

I'm sorry for all of the coding style problems.

By the way, is there any script which can be used to check the code
before submitting? Something like ./scripts/checkpatch.pl under Linux.

>> +    }
>> +
>> +    /* free percpu free list */
>> +    if ( !page_list_empty(local_free_list) )
>> +    {
>> +        spin_lock(&heap_lock);
>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>> +        {
>> +            order = PFN_ORDER(pg);
>> +            page_list_del(pg, local_free_list);
>> +            for ( i = 0; i < (1 << order); i++ )
>> +	    {
>> +                pg[i].count_info |= PGC_state_free;
>> +                pg[i].count_info &= ~PGC_need_scrub;
> 
> This needs to happen earlier - the scrub flag should be cleared right
> after scrubbing, and the free flag should imo be set when the page
> gets freed. That's for two reasons:
> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
> allocate memory regardless of the scrub flag's state.

AFAIR, the reason I set those flags here is to avoid a panic.

> 2) You still detain the memory on the local lists from allocation. On a
> many-node system, the 16Mb per node can certainly sum up (which
> is not to say that I don't view the 16Mb on a single node as already
> problematic).
> 

Right, but we can adjust SCRUB_BATCH_ORDER.
Anyway, I'll give it another try as you suggested.

>> +            }
>> +            merge_free_trunks(pg, order, node, page_to_zone(pg), 0);
>> +        }
>> +        spin_unlock(&heap_lock);
>> +    }
>> +}
>> +
>> +void scrub_free_pages(void)
>> +{
>> +    int order;
>> +    struct page_info *pg, *tmp;
>> +    unsigned int i, zone, nr_delisted = 0;
>> +    unsigned int cpu = smp_processor_id();
>> +    unsigned int node = cpu_to_node(cpu);
>> +    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
>> +
>> +    /* Return if our sibling already started scrubbing */
>> +    for_each_cpu( i, per_cpu(cpu_sibling_mask,cpu) )
>> +        if ( per_cpu(is_scrubbing, i) )
>> +            return;
>> +    this_cpu(is_scrubbing) = 1;
> 
> If you really want to enforce exclusiveness, you need a single
> canonical flag per core (could e.g. be
> per_cpu(is_scrubbing, cpumask_first(per_cpu(cpu_sibling_mask, cpu))),
> set via test_and_set_bool()).

Will be updated.

> 
>> +
>> +    while ( !softirq_pending(cpu) )
>> +    {
>> +        if ( !node_need_scrub[node] )
>> +        {
>> +            /* Free local per cpu list before we exit */
>> +            __scrub_free_pages(node, cpu);
>> +            goto out;
>> +        }
> 
> This seems unnecessary: Just have the if() below be
> 
>         if ( node_need_scrub[node] && page_list_empty(local_scrub_list) )
> 
> along with __scrub_free_pages() returning whether it exited because
> of softirq_pending(cpu) (which at once eliminates the need for the
> check at the loop header above, allowing this to become a nice
> do { ... } while ( !__scrub_free_pages() ).
>

Same here.

>> +
>> +        /* Delist a batch of pages from global scrub list */
>> +        if ( page_list_empty(local_scrub_list) )
>> +        {
>> +            spin_lock(&heap_lock);
>> +            for ( zone = 0; zone < NR_ZONES; zone++ )
>> +            {
>> +                for ( order = MAX_ORDER; order >= 0; order-- )
>> +                {
>> +                    page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
>> +                    {
>> +                        if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )
> 
> Please avoid the inner parentheses here.
> 
>> +                            continue;
>> +
>> +                        page_list_del( pg, &heap(node, zone, order) );
>> +                        if ( order > SCRUB_BATCH_ORDER)
> 
> Coding style.
> 
>> +                        {
>> +                            /* putback extra pages */
>> +                            i = order;
>> +                            while ( i != SCRUB_BATCH_ORDER )
> 
> This would better be a do/while construct - on the first iteration you
> already know the condition is true.
> 
>> +                            {
>> +                                PFN_ORDER(pg) = --i;
>> +                                page_list_add_tail(pg, &heap(node, zone, i));
>> +                                pg += 1 << i;
> 
> I realize there are cases where this is also not being done correctly in
> existing code, but please use 1UL here and in all similar cases.
> 

Sure.

>> +                            }
>> +                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
>> +                        }
>> +
>> +                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )
> 
> Can't you just use "order" here (and a few lines down)?
> 

Then I have to use another temporary variable. Anyway, I'll make an update.

>> +                        {
>> +                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
>> +                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
>> +                            mark_page_offline(&pg[i], 0);
> 
> mark_page_offline() ???
>

Yes, it's a bit tricky here and I don't have a good idea right now.
The problem is that when freeing a page frame we have to avoid merging with
pages on the percpu scrub/free lists, so I marked pages offline temporarily
while adding them to the percpu lists.

Another thing I'm still not clear about is how to handle the situation
if a #MC happens for pages on the percpu scrub/free lists.

Thank you very much for your review.

-Bob


* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-01 12:25     ` Bob Liu
@ 2014-07-01 12:59       ` Jan Beulich
  2014-07-02  6:27         ` Bob Liu
  2014-07-15  9:16         ` Bob Liu
  0 siblings, 2 replies; 25+ messages in thread
From: Jan Beulich @ 2014-07-01 12:59 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
> On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>>      {
>>>          if ( !tainted )
>>>          {
>>> +            node_need_scrub[node] = 1;
>>>              for ( i = 0; i < (1 << order); i++ )
>>>                  pg[i].count_info |= PGC_need_scrub;
>>>          }
>> 
>> Iirc it was more than this single place where you set
>> PGC_need_scrub, and hence where you'd now need to set the
>> other flag too.
>> 
> 
> I'm afraid this is the only place where PGC_need_scrub was set.

Ah, indeed - I misremembered others, they are all tests for the flag.

> I'm sorry for all of the coding style problems.
> 
> By the way is there any script which can be used to check the code
> before submitting? Something like ./scripts/checkpatch.pl under linux.

No, there isn't. But avoiding (or spotting) hard tabs should be easy
enough, and other things you ought to simply inspect your patch for
- after all that's no different from what reviewers do.

>>> +    }
>>> +
>>> +    /* free percpu free list */
>>> +    if ( !page_list_empty(local_free_list) )
>>> +    {
>>> +        spin_lock(&heap_lock);
>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>>> +        {
>>> +            order = PFN_ORDER(pg);
>>> +            page_list_del(pg, local_free_list);
>>> +            for ( i = 0; i < (1 << order); i++ )
>>> +	    {
>>> +                pg[i].count_info |= PGC_state_free;
>>> +                pg[i].count_info &= ~PGC_need_scrub;
>> 
>> This needs to happen earlier - the scrub flag should be cleared right
>> after scrubbing, and the free flag should imo be set when the page
>> gets freed. That's for two reasons:
>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
>> allocate memory regardless of the scrub flag's state.
> 
> AFAIR, the reason I set those flags here is to avoid a panic happen.

That's a pretty vague statement.

>> 2) You still detain the memory on the local lists from allocation. On a
>> many-node system, the 16Mb per node can certainly sum up (which
>> is not to say that I don't view the 16Mb on a single node as already
>> problematic).
> 
> Right, but we can adjust SCRUB_BATCH_ORDER.
> Anyway I'll take a retry as you suggested.

You should really drop the idea of removing pages temporarily.
All you need to do is make sure a page being allocated and getting
simultaneously scrubbed by another CPU won't get passed to the
caller until the scrubbing finished. In particular it's no problem if
the allocating CPU occasionally ends up scrubbing a page already
being scrubbed elsewhere.
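
(E.g. - purely as an illustration, with a hypothetical per-page "scrub in
progress" bit the allocator would wait on before handing the page out:)

    while ( test_bit(_PGC_scrubbing, &pg[i].count_info) )
        cpu_relax();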

>>> +                            }
>>> +                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
>>> +                        }
>>> +
>>> +                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )
>> 
>> Can't you just use "order" here (and a few lines down)?
>> 
> 
> Then I have to use another temporary variable. Anyway, I'll make a update.

Why? You don't modify the value? Oh, did you misread this as a
request to replace "i"? That wasn't my point, I was asking to replace
PFN_ORDER(pg).

>>> +                        {
>>> +                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
>>> +                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
>>> +                            mark_page_offline(&pg[i], 0);
>> 
>> mark_page_offline() ???
>>
> 
> Yes, it's a bit tricky here and I don't have a good idea right now.
> The problem is when free a page frame we have to avoid merging with
> pages on percpu scrub/free list, so I marked pages offline temporarily
> while adding to percpu lists.

Which will be solved implicitly by no longer removing pages from
what the allocator can see.

> Another thing I'm still not clear about is how to handle the situation
> if #mc happened for pages on percpu scrub/free list.

Ditto.

Jan


* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-01 12:59       ` Jan Beulich
@ 2014-07-02  6:27         ` Bob Liu
  2014-07-07 12:20           ` Bob Liu
  2014-07-15  9:16         ` Bob Liu
  1 sibling, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-02  6:27 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/01/2014 08:59 PM, Jan Beulich wrote:
>>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
>> On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>>>      {
>>>>          if ( !tainted )
>>>>          {
>>>> +            node_need_scrub[node] = 1;
>>>>              for ( i = 0; i < (1 << order); i++ )
>>>>                  pg[i].count_info |= PGC_need_scrub;
>>>>          }
>>>
>>> Iirc it was more than this single place where you set
>>> PGC_need_scrub, and hence where you'd now need to set the
>>> other flag too.
>>>
>>
>> I'm afraid this is the only place where PGC_need_scrub was set.
> 
> Ah, indeed - I misremembered others, they are all tests for the flag.
> 
>> I'm sorry for all of the coding style problems.
>>
>> By the way is there any script which can be used to check the code
>> before submitting? Something like ./scripts/checkpatch.pl under linux.
> 
> No, there isn't. But avoiding (or spotting) hard tabs should be easy
> enough, and other things you ought to simply inspect your patch for
> - after all that's no different from what reviewers do.
> 
>>>> +    }
>>>> +
>>>> +    /* free percpu free list */
>>>> +    if ( !page_list_empty(local_free_list) )
>>>> +    {
>>>> +        spin_lock(&heap_lock);
>>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>>>> +        {
>>>> +            order = PFN_ORDER(pg);
>>>> +            page_list_del(pg, local_free_list);
>>>> +            for ( i = 0; i < (1 << order); i++ )
>>>> +	    {
>>>> +                pg[i].count_info |= PGC_state_free;
>>>> +                pg[i].count_info &= ~PGC_need_scrub;
>>>
>>> This needs to happen earlier - the scrub flag should be cleared right
>>> after scrubbing, and the free flag should imo be set when the page
>>> gets freed. That's for two reasons:
>>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
>>> allocate memory regardless of the scrub flag's state.
>>
>> AFAIR, the reason I set those flags here is to avoid a panic happen.
> 
> That's pretty vague a statement.
> 
>>> 2) You still detain the memory on the local lists from allocation. On a
>>> many-node system, the 16Mb per node can certainly sum up (which
>>> is not to say that I don't view the 16Mb on a single node as already
>>> problematic).
>>
>> Right, but we can adjust SCRUB_BATCH_ORDER.
>> Anyway I'll take a retry as you suggested.
> 
> You should really drop the idea of removing pages temporarily.
> All you need to do is make sure a page being allocated and getting
> simultaneously scrubbed by another CPU won't get passed to the
> caller until the scrubbing finished. In particular it's no problem if
> the allocating CPU occasionally ends up scrubbing a page already
> being scrubbed elsewhere.
> 

Yes, I would also like to drop the percpu lists, which would make things
simpler. But I'm afraid that also means I can't take spin_lock(&heap_lock)
any more because of potential heavy lock contention. I'm not sure whether
things can work fine without heap_lock.

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-02  6:27         ` Bob Liu
@ 2014-07-07 12:20           ` Bob Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Bob Liu @ 2014-07-07 12:20 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/02/2014 02:27 PM, Bob Liu wrote:
> 
> On 07/01/2014 08:59 PM, Jan Beulich wrote:
>>>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
>>> On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>>>>      {
>>>>>          if ( !tainted )
>>>>>          {
>>>>> +            node_need_scrub[node] = 1;
>>>>>              for ( i = 0; i < (1 << order); i++ )
>>>>>                  pg[i].count_info |= PGC_need_scrub;
>>>>>          }
>>>>
>>>> Iirc it was more than this single place where you set
>>>> PGC_need_scrub, and hence where you'd now need to set the
>>>> other flag too.
>>>>
>>>
>>> I'm afraid this is the only place where PGC_need_scrub was set.
>>
>> Ah, indeed - I misremembered others, they are all tests for the flag.
>>
>>> I'm sorry for all of the coding style problems.
>>>
>>> By the way is there any script which can be used to check the code
>>> before submitting? Something like ./scripts/checkpatch.pl under linux.
>>
>> No, there isn't. But avoiding (or spotting) hard tabs should be easy
>> enough, and other things you ought to simply inspect your patch for
>> - after all that's no different from what reviewers do.
>>
>>>>> +    }
>>>>> +
>>>>> +    /* free percpu free list */
>>>>> +    if ( !page_list_empty(local_free_list) )
>>>>> +    {
>>>>> +        spin_lock(&heap_lock);
>>>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>>>>> +        {
>>>>> +            order = PFN_ORDER(pg);
>>>>> +            page_list_del(pg, local_free_list);
>>>>> +            for ( i = 0; i < (1 << order); i++ )
>>>>> +	    {
>>>>> +                pg[i].count_info |= PGC_state_free;
>>>>> +                pg[i].count_info &= ~PGC_need_scrub;
>>>>
>>>> This needs to happen earlier - the scrub flag should be cleared right
>>>> after scrubbing, and the free flag should imo be set when the page
>>>> gets freed. That's for two reasons:
>>>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
>>>> allocate memory regardless of the scrub flag's state.
>>>
>>> AFAIR, the reason I set those flags here is to avoid a panic happen.
>>
>> That's pretty vague a statement.
>>
>>>> 2) You still detain the memory on the local lists from allocation. On a
>>>> many-node system, the 16Mb per node can certainly sum up (which
>>>> is not to say that I don't view the 16Mb on a single node as already
>>>> problematic).
>>>
>>> Right, but we can adjust SCRUB_BATCH_ORDER.
>>> Anyway I'll take a retry as you suggested.
>>
>> You should really drop the idea of removing pages temporarily.
>> All you need to do is make sure a page being allocated and getting
>> simultaneously scrubbed by another CPU won't get passed to the
>> caller until the scrubbing finished. In particular it's no problem if
>> the allocating CPU occasionally ends up scrubbing a page already
>> being scrubbed elsewhere.
>>
> 
> Yes, I also like to drop percpu lists which can make things simper. But
> I'm afraid which also means I can't use any spinlock(&heap_lock) any
> more because of potential heavy lock contentions. I'm not sure whether
> things can work fine without heap_lock.
> 

In my attempt to get rid of heap_lock, there was a panic when iterating
the heap free list. My implementation was like this:
scrub_free_pages()
{
    for ( zone = 0; zone < NR_ZONES; zone++ )
    {
        for ( order = MAX_ORDER; order >= 0; order-- )
        {
            page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
            {
                /* Skip chunks with no scrub work pending. */
                if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )
                    continue;
                /* Scrub each page of the chunk that still needs it. */
                for ( i = 0; i < (1 << order); i++ )
                {
                    if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
                    {
                        scrub_one_page(&pg[i]);
                        clear_bit(_PGC_need_scrub, &pg[i].count_info);
                    }
                }
            }
            }
        }
    }
}

The panic was in page_list_next().

I didn't find a good way to iterate the free list without holding
heap_lock, and if I hold the lock there may be heavy lock contention,
which is why I had to remove pages temporarily from the heap free list
to a percpu list.

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-01 12:59       ` Jan Beulich
  2014-07-02  6:27         ` Bob Liu
@ 2014-07-15  9:16         ` Bob Liu
  2014-07-23  0:38           ` Konrad Rzeszutek Wilk
  2014-07-23  7:28           ` Jan Beulich
  1 sibling, 2 replies; 25+ messages in thread
From: Bob Liu @ 2014-07-15  9:16 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

Hi Jan & Konrad,

On 07/01/2014 08:59 PM, Jan Beulich wrote:
>>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
>> On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>>>      {
>>>>          if ( !tainted )
>>>>          {
>>>> +            node_need_scrub[node] = 1;
>>>>              for ( i = 0; i < (1 << order); i++ )
>>>>                  pg[i].count_info |= PGC_need_scrub;
>>>>          }
>>>
>>> Iirc it was more than this single place where you set
>>> PGC_need_scrub, and hence where you'd now need to set the
>>> other flag too.
>>>
>>
>> I'm afraid this is the only place where PGC_need_scrub was set.
> 
> Ah, indeed - I misremembered others, they are all tests for the flag.
> 
>> I'm sorry for all of the coding style problems.
>>
>> By the way is there any script which can be used to check the code
>> before submitting? Something like ./scripts/checkpatch.pl under linux.
> 
> No, there isn't. But avoiding (or spotting) hard tabs should be easy
> enough, and other things you ought to simply inspect your patch for
> - after all that's no different from what reviewers do.
> 
>>>> +    }
>>>> +
>>>> +    /* free percpu free list */
>>>> +    if ( !page_list_empty(local_free_list) )
>>>> +    {
>>>> +        spin_lock(&heap_lock);
>>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>>>> +        {
>>>> +            order = PFN_ORDER(pg);
>>>> +            page_list_del(pg, local_free_list);
>>>> +            for ( i = 0; i < (1 << order); i++ )
>>>> +	    {
>>>> +                pg[i].count_info |= PGC_state_free;
>>>> +                pg[i].count_info &= ~PGC_need_scrub;
>>>
>>> This needs to happen earlier - the scrub flag should be cleared right
>>> after scrubbing, and the free flag should imo be set when the page
>>> gets freed. That's for two reasons:
>>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
>>> allocate memory regardless of the scrub flag's state.
>>
>> AFAIR, the reason I set those flags here is to avoid a panic happen.
> 
> That's pretty vague a statement.
> 
>>> 2) You still detain the memory on the local lists from allocation. On a
>>> many-node system, the 16Mb per node can certainly sum up (which
>>> is not to say that I don't view the 16Mb on a single node as already
>>> problematic).
>>
>> Right, but we can adjust SCRUB_BATCH_ORDER.
>> Anyway I'll take a retry as you suggested.
> 
> You should really drop the idea of removing pages temporarily.
> All you need to do is make sure a page being allocated and getting
> simultaneously scrubbed by another CPU won't get passed to the
> caller until the scrubbing finished. In particular it's no problem if
> the allocating CPU occasionally ends up scrubbing a page already
> being scrubbed elsewhere.
> 

After so many days I haven't come up with a workable solution that doesn't
remove pages temporarily. The hardest part is iterating the heap free list
without holding heap_lock, because holding the lock may cause heavy lock
contention.
So do you think it's acceptable if all the other concerns about this patch
are fixed?

And there is another idea, based on the assumption that only guest
allocations need scrubbed pages (it's safe for the xen hypervisor to use
an un-scrubbed page):
1. Set the need_scrub flag in free_heap_pages().
2. Allocate pages from the heap free list without scrubbing them and
without clearing the need_scrub flag.
3. When the guest accesses the real memory (machine pages) for the first
time, a page fault should happen. We track this event and, before building
the PFN to MFN page table mapping (correct me if it doesn't work like
this), scrub the page there if the need_scrub flag is set.

This way only the pages the guest really uses need scrubbing, and the time
is spread out.
What do you think of this solution?

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-15  9:16         ` Bob Liu
@ 2014-07-23  0:38           ` Konrad Rzeszutek Wilk
  2014-07-23  1:30             ` Bob Liu
  2014-07-23  7:28           ` Jan Beulich
  1 sibling, 1 reply; 25+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-07-23  0:38 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3,
	Jan Beulich, xen-devel

On Tue, Jul 15, 2014 at 05:16:33PM +0800, Bob Liu wrote:
> Hi Jan & Konrad,
> 
> On 07/01/2014 08:59 PM, Jan Beulich wrote:
> >>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
> >> On 07/01/2014 05:12 PM, Jan Beulich wrote:
> >>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
> >>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
> >>>>      {
> >>>>          if ( !tainted )
> >>>>          {
> >>>> +            node_need_scrub[node] = 1;
> >>>>              for ( i = 0; i < (1 << order); i++ )
> >>>>                  pg[i].count_info |= PGC_need_scrub;
> >>>>          }
> >>>
> >>> Iirc it was more than this single place where you set
> >>> PGC_need_scrub, and hence where you'd now need to set the
> >>> other flag too.
> >>>
> >>
> >> I'm afraid this is the only place where PGC_need_scrub was set.
> > 
> > Ah, indeed - I misremembered others, they are all tests for the flag.
> > 
> >> I'm sorry for all of the coding style problems.
> >>
> >> By the way is there any script which can be used to check the code
> >> before submitting? Something like ./scripts/checkpatch.pl under linux.
> > 
> > No, there isn't. But avoiding (or spotting) hard tabs should be easy
> > enough, and other things you ought to simply inspect your patch for
> > - after all that's no different from what reviewers do.
> > 
> >>>> +    }
> >>>> +
> >>>> +    /* free percpu free list */
> >>>> +    if ( !page_list_empty(local_free_list) )
> >>>> +    {
> >>>> +        spin_lock(&heap_lock);
> >>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
> >>>> +        {
> >>>> +            order = PFN_ORDER(pg);
> >>>> +            page_list_del(pg, local_free_list);
> >>>> +            for ( i = 0; i < (1 << order); i++ )
> >>>> +	    {
> >>>> +                pg[i].count_info |= PGC_state_free;
> >>>> +                pg[i].count_info &= ~PGC_need_scrub;
> >>>
> >>> This needs to happen earlier - the scrub flag should be cleared right
> >>> after scrubbing, and the free flag should imo be set when the page
> >>> gets freed. That's for two reasons:
> >>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
> >>> allocate memory regardless of the scrub flag's state.
> >>
> >> AFAIR, the reason I set those flags here is to avoid a panic happen.
> > 
> > That's pretty vague a statement.
> > 
> >>> 2) You still detain the memory on the local lists from allocation. On a
> >>> many-node system, the 16Mb per node can certainly sum up (which
> >>> is not to say that I don't view the 16Mb on a single node as already
> >>> problematic).
> >>
> >> Right, but we can adjust SCRUB_BATCH_ORDER.
> >> Anyway I'll take a retry as you suggested.
> > 
> > You should really drop the idea of removing pages temporarily.
> > All you need to do is make sure a page being allocated and getting
> > simultaneously scrubbed by another CPU won't get passed to the
> > caller until the scrubbing finished. In particular it's no problem if
> > the allocating CPU occasionally ends up scrubbing a page already
> > being scrubbed elsewhere.
> > 
> 
> After so many days I haven't make a workable solution if don't remove
> pages temporarily. The hardest part is iterating the heap free list
> without holding heap_lock because if holding the lock it might be heavy
> lock contention.

Why do you need to hold on to the heap_lock?

I presume that the flag changes need a cmpxchg operation. In which case
could you add another flag, say 'PG_scrubbing_now' - and the allocator
could bypass all of the pages that have said flag (and also
PG_need_scrub).

If the allocator can't find enough pages to satisfy the demand - it goes
back in and looks at PG_need_scrub pages and scrubs those.

In parallel, another thread (threads?) picks the ones that have PG_need_scrub,
changes them to PG_scrubbing_now and scrubs them. When it has
completed it removes both flags.
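
For illustration only, the idle-vcpu side of that scheme might look like
the sketch below; PGC_scrubbing_now is a made-up flag, and the unlocked
list walk is exactly the part that turns out to be problematic further
down the thread:

/* Sketch of the idle-vcpu scrubber for the scheme above (illustrative). */
static void scrub_pending_pages(struct page_list_head *list)
{
    struct page_info *pg;

    page_list_for_each ( pg, list )
    {
        if ( !test_bit(_PGC_need_scrub, &pg->count_info) )
            continue;

        /* Claim the page so the allocator can bypass it. */
        if ( test_and_set_bit(_PGC_scrubbing_now, &pg->count_info) )
            continue;               /* someone else is already scrubbing it */

        scrub_one_page(pg);

        /* Done: drop both flags so the page becomes allocatable again. */
        clear_bit(_PGC_need_scrub, &pg->count_info);
        clear_bit(_PGC_scrubbing_now, &pg->count_info);
    }
}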

> So do you think it's acceptable if fixed all other concerns about this
> patch?
> 
> And there is another idea based on the assumption that only guest
> allocation need all pages to be scrubbed(It's safe for xen hypervisor to
> use an un-scrubbed page):
> 1. set the need_scrub flag in free_heap_pages().
> 2. alloc pages from heap free list, don't scrub and neither clear the
> need_scrub flag.
> 3. When the guest accessing the real memory(machine pages) the first
> time, page fault should happen. We track this event and before build the

I think you mean that we can set the EPT (or shadow) entries of all of those
PG_need_scrub pages to be r/o. And then when the guest accesses them we would
trap into the hypervisor and scrub them. However, that means the guest can
still read those pages without trapping - and that means it might be
reading sensitive data from pages that came from another guest!


> right PFN to MFN page table mappings(correct me if it doesn't work as
> this), at this place scrub the page if the need_scrub flag was set.
> 
> By this way only guest real used pages are needed scrubbing and the time
> is scattered.
> How do you think of this solution?
> 
> -- 
> Regards,
> -Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-23  0:38           ` Konrad Rzeszutek Wilk
@ 2014-07-23  1:30             ` Bob Liu
  0 siblings, 0 replies; 25+ messages in thread
From: Bob Liu @ 2014-07-23  1:30 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3,
	Jan Beulich, xen-devel


On 07/23/2014 08:38 AM, Konrad Rzeszutek Wilk wrote:
> On Tue, Jul 15, 2014 at 05:16:33PM +0800, Bob Liu wrote:
>> Hi Jan & Konrad,
>>
>> On 07/01/2014 08:59 PM, Jan Beulich wrote:
>>>>>> On 01.07.14 at 14:25, <bob.liu@oracle.com> wrote:
>>>> On 07/01/2014 05:12 PM, Jan Beulich wrote:
>>>>>>>> On 30.06.14 at 15:39, <lliubbo@gmail.com> wrote:
>>>>>> @@ -948,6 +954,7 @@ static void free_heap_pages(
>>>>>>      {
>>>>>>          if ( !tainted )
>>>>>>          {
>>>>>> +            node_need_scrub[node] = 1;
>>>>>>              for ( i = 0; i < (1 << order); i++ )
>>>>>>                  pg[i].count_info |= PGC_need_scrub;
>>>>>>          }
>>>>>
>>>>> Iirc it was more than this single place where you set
>>>>> PGC_need_scrub, and hence where you'd now need to set the
>>>>> other flag too.
>>>>>
>>>>
>>>> I'm afraid this is the only place where PGC_need_scrub was set.
>>>
>>> Ah, indeed - I misremembered others, they are all tests for the flag.
>>>
>>>> I'm sorry for all of the coding style problems.
>>>>
>>>> By the way is there any script which can be used to check the code
>>>> before submitting? Something like ./scripts/checkpatch.pl under linux.
>>>
>>> No, there isn't. But avoiding (or spotting) hard tabs should be easy
>>> enough, and other things you ought to simply inspect your patch for
>>> - after all that's no different from what reviewers do.
>>>
>>>>>> +    }
>>>>>> +
>>>>>> +    /* free percpu free list */
>>>>>> +    if ( !page_list_empty(local_free_list) )
>>>>>> +    {
>>>>>> +        spin_lock(&heap_lock);
>>>>>> +        page_list_for_each_safe( pg, tmp, local_free_list )
>>>>>> +        {
>>>>>> +            order = PFN_ORDER(pg);
>>>>>> +            page_list_del(pg, local_free_list);
>>>>>> +            for ( i = 0; i < (1 << order); i++ )
>>>>>> +	    {
>>>>>> +                pg[i].count_info |= PGC_state_free;
>>>>>> +                pg[i].count_info &= ~PGC_need_scrub;
>>>>>
>>>>> This needs to happen earlier - the scrub flag should be cleared right
>>>>> after scrubbing, and the free flag should imo be set when the page
>>>>> gets freed. That's for two reasons:
>>>>> 1) Hypervisor allocations don't need scrubbed pages, i.e. they can
>>>>> allocate memory regardless of the scrub flag's state.
>>>>
>>>> AFAIR, the reason I set those flags here is to avoid a panic happen.
>>>
>>> That's pretty vague a statement.
>>>
>>>>> 2) You still detain the memory on the local lists from allocation. On a
>>>>> many-node system, the 16Mb per node can certainly sum up (which
>>>>> is not to say that I don't view the 16Mb on a single node as already
>>>>> problematic).
>>>>
>>>> Right, but we can adjust SCRUB_BATCH_ORDER.
>>>> Anyway I'll take a retry as you suggested.
>>>
>>> You should really drop the idea of removing pages temporarily.
>>> All you need to do is make sure a page being allocated and getting
>>> simultaneously scrubbed by another CPU won't get passed to the
>>> caller until the scrubbing finished. In particular it's no problem if
>>> the allocating CPU occasionally ends up scrubbing a page already
>>> being scrubbed elsewhere.
>>>
>>
>> After so many days I haven't make a workable solution if don't remove
>> pages temporarily. The hardest part is iterating the heap free list
>> without holding heap_lock because if holding the lock it might be heavy
>> lock contention.
> 
> Why do you need to hold on the heap_lock?
> 
> I presume that the flag changes need an cmpxchg operation. In which case
> could you add another flag that would be 'PG_scrubbing_now` - and the
> allocator could bypass all of the pages that have said flag (and also
> PG_need_scrub).

That's what I'm struggling with these days.

> 
> If the allocator can't find enough pages to satisfy the demands - it goes
> back in and looks at PG_need_scrub pages and scrubs those.
> 
> In parallel, another thread (threads?) pick the ones that have PG_need_scrub
> and change them to PG_scrubbing_now and scrubs them. When it has
> completed it removes both flags.
> 

But there is still a race condition:

A:alloc path                        B:idle scrub path

                                    page = list_entry(&heap_list);

^^^
Delist the page from heap_list
(Can happen any time)

                                    if (PG_need_scrub)
                                        set PG_scrubbing_now
                                    scrub and clear both flags
                                    page = page_list_next(page);
                                           ^^^ Get panic


Another problem Jan pointed out before is that if an order=20 chunk is
being scrubbed by the idle vcpus, the alloc path will fail because of the
PG_scrubbing_now flag. This introduces a long window in which we have
plenty of memory but can't allocate any page.

>> So do you think it's acceptable if fixed all other concerns about this
>> patch?
>>
>> And there is another idea based on the assumption that only guest
>> allocation need all pages to be scrubbed(It's safe for xen hypervisor to
>> use an un-scrubbed page):
>> 1. set the need_scrub flag in free_heap_pages().
>> 2. alloc pages from heap free list, don't scrub and neither clear the
>> need_scrub flag.
>> 3. When the guest accessing the real memory(machine pages) the first
>> time, page fault should happen. We track this event and before build the
> 
> I think you mean that we can set the EPT (or shadow) all of those PG_need_scrub

No, I mean that at the point where the EPT or shadow page table entry is
set up, we scrub the related machine page if it has the PG_need_scrub flag.

I think in PoD mode the place is p2m_set_entry(): scrub the page behind
the mfn if PG_need_scrub is set, before we set up the entry.
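
Roughly something like the sketch below, called just before the p2m entry
is installed; the helper, its arguments and the exact hook point are
assumptions, not the real p2m_set_entry() interface:

/* Hypothetical hook: scrub the backing pages of a gfn->mfn mapping
 * right before the p2m entry is written. */
static void scrub_before_map(unsigned long mfn, unsigned int order)
{
    unsigned long i;

    for ( i = 0; i < (1UL << order); i++ )
    {
        struct page_info *pg = mfn_to_page(mfn + i);

        if ( test_bit(_PGC_need_scrub, &pg->count_info) )
        {
            scrub_one_page(pg);
            clear_bit(_PGC_need_scrub, &pg->count_info);
        }
    }
}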

> pages to be r/o. And then when the guest access them we would trap in
> the hypervisor and scrub them. However, that means the guest can
> still read those pages without trapping - and that means we might
> be reading sensitive data from pages that came from another guest!
> 
> 
>> right PFN to MFN page table mappings(correct me if it doesn't work as
>> this), at this place scrub the page if the need_scrub flag was set.
>>
>> By this way only guest real used pages are needed scrubbing and the time
>> is scattered.
>> How do you think of this solution?

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-15  9:16         ` Bob Liu
  2014-07-23  0:38           ` Konrad Rzeszutek Wilk
@ 2014-07-23  7:28           ` Jan Beulich
  2014-07-24  2:08             ` Bob Liu
  1 sibling, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-07-23  7:28 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
> After so many days I haven't make a workable solution if don't remove
> pages temporarily. The hardest part is iterating the heap free list
> without holding heap_lock because if holding the lock it might be heavy
> lock contention.
> So do you think it's acceptable if fixed all other concerns about this
> patch?

No, I don't think so. Instead I'm of the opinion that you may have
worked in the wrong direction: Rather than not taking the heap lock
at all, it may also be sufficient to shrink the lock holding time (i.e.
avoid long loops with the lock held).
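
As an illustration of what shrinking the lock holding time could mean in
practice (the batch size, the restart-from-the-head behaviour and the
helper name below are all arbitrary; it still scrubs under the lock, just
a bounded amount per acquisition):

#define SCRUB_CHUNK 16   /* arbitrary bound on work done per lock hold */

/* Sketch: scrub at most SCRUB_CHUNK pages per heap_lock acquisition. */
static void scrub_one_batch(struct page_list_head *list)
{
    struct page_info *pg, *tmp;
    unsigned int done = 0;

    spin_lock(&heap_lock);
    page_list_for_each_safe ( pg, tmp, list )
    {
        if ( test_bit(_PGC_need_scrub, &pg->count_info) )
        {
            scrub_one_page(pg);
            clear_bit(_PGC_need_scrub, &pg->count_info);
            done++;
        }
        if ( done >= SCRUB_CHUNK )
            break;                  /* drop the lock, let allocators in */
    }
    spin_unlock(&heap_lock);
}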

> And there is another idea based on the assumption that only guest
> allocation need all pages to be scrubbed(It's safe for xen hypervisor to
> use an un-scrubbed page):
> 1. set the need_scrub flag in free_heap_pages().
> 2. alloc pages from heap free list, don't scrub and neither clear the
> need_scrub flag.
> 3. When the guest accessing the real memory(machine pages) the first
> time, page fault should happen. We track this event and before build the
> right PFN to MFN page table mappings(correct me if it doesn't work as
> this), at this place scrub the page if the need_scrub flag was set.
> 
> By this way only guest real used pages are needed scrubbing and the time
> is scattered.
> How do you think of this solution?

If you can get this to work cleanly for all guest types, that would
certainly be an option.

Jan

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-23  7:28           ` Jan Beulich
@ 2014-07-24  2:08             ` Bob Liu
  2014-07-24  6:24               ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-24  2:08 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>> After so many days I haven't make a workable solution if don't remove
>> pages temporarily. The hardest part is iterating the heap free list
>> without holding heap_lock because if holding the lock it might be heavy
>> lock contention.
>> So do you think it's acceptable if fixed all other concerns about this
>> patch?
> 
> No, I don't think so. Instead I'm of the opinion that you may have
> worked in the wrong direction: Rather than not taking the heap lock
> at all, it may also be sufficient to shrink the lock holding time (i.e.
> avoid long loops with the lock held).
> 

But I still think we have to drop pages from the heap list temporarily;
otherwise the heap lock must be taken for a long time to get rid of, e.g.,
the race condition below.

A: alloc path            B: idle loop

                        spin_lock(&heap_lock)
                      page_list_for_each( pg, &heap(node, zone, order) )
                        if _PGC_need_scrub is set, break;
                        spin_unlock(&heap_lock)

                        if ( test_bit(_PGC_need_scrub, pg)

^^^^
spin_lock(&heap_lock)
delist page
spin_unlock(&heap_lock)

write data to this page

                         scrub_one_page(pg)
                         ^^^ will clean useful data


-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-24  2:08             ` Bob Liu
@ 2014-07-24  6:24               ` Jan Beulich
  2014-07-25  0:42                 ` Bob Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-07-24  6:24 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:

> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>> After so many days I haven't make a workable solution if don't remove
>>> pages temporarily. The hardest part is iterating the heap free list
>>> without holding heap_lock because if holding the lock it might be heavy
>>> lock contention.
>>> So do you think it's acceptable if fixed all other concerns about this
>>> patch?
>> 
>> No, I don't think so. Instead I'm of the opinion that you may have
>> worked in the wrong direction: Rather than not taking the heap lock
>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>> avoid long loops with the lock held).
>> 
> 
> But I still think have to drop pages from heap list temporarily else
> heap lock must be taken for a long time to get rid of E.g. below race
> condition.
> 
> A: alloc path            B: idle loop
> 
>                         spin_lock(&heap_lock)
>                       page_list_for_each( pg, &heap(node, zone, order) )
>                         if _PGC_need_scrub is set, break;
>                         spin_unlock(&heap_lock)
> 
>                         if ( test_bit(_PGC_need_scrub, pg)
> 
> ^^^^
> spin_lock(&heap_lock)
> delist page
> spin_unlock(&heap_lock)
> 
> write data to this page
> 
>                          scrub_one_page(pg)
>                          ^^^ will clean useful data

No (and I'm sure I said so before): The only problem is with the
linked list itself; the page contents are not a problem - the
allocation path can simply wait for the already suggested
_PGC_scrubbing flag to clear before returning. And as already
said (see above), by avoiding page_list_for_each() within the
locked region you already significantly reduce lock contention.
I.e. you need another means to find pages awaiting to be
scrubbed. You may want to leverage that the allocation path
does the scrubbing if needed (e.g. by not scrubbing the first
of any set of contiguous free pages on the idle path, linking up
all other ones recognizing that their link fields are unused while
on an order-greater-than-zero free list).
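
Very roughly, the direction suggested here might look like the sketch
below - "scrub_list" and the helper are made up; the point is only that
the non-head pages of a free chunk have spare list-link fields which can
thread a second, scrub-only list without taking anything off the heap
lists:

/* Hypothetical per-node list of free pages still awaiting a scrub. */
static struct page_list_head scrub_list[MAX_NUMNODES];

/*
 * On free: leave the chunk on its heap free list, but also remember its
 * tail pages (whose link fields are unused there) on the scrub list.
 * The head page is deliberately not queued - the allocation path will
 * scrub it if it is handed out before the idle path gets to the chunk.
 */
static void queue_for_scrub(struct page_info *pg, unsigned int order,
                            unsigned int node)
{
    unsigned long i;

    for ( i = 1; i < (1UL << order); i++ )
        page_list_add_tail(&pg[i], &scrub_list[node]);
}

The idle path would then drain scrub_list under a much shorter (or
separate) lock, while the allocator keeps scrubbing whatever it actually
hands out.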

Jan

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-24  6:24               ` Jan Beulich
@ 2014-07-25  0:42                 ` Bob Liu
  2014-07-25  6:51                   ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-25  0:42 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
> 
>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>> After so many days I haven't make a workable solution if don't remove
>>>> pages temporarily. The hardest part is iterating the heap free list
>>>> without holding heap_lock because if holding the lock it might be heavy
>>>> lock contention.
>>>> So do you think it's acceptable if fixed all other concerns about this
>>>> patch?
>>>
>>> No, I don't think so. Instead I'm of the opinion that you may have
>>> worked in the wrong direction: Rather than not taking the heap lock
>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>> avoid long loops with the lock held).
>>>
>>
>> But I still think have to drop pages from heap list temporarily else
>> heap lock must be taken for a long time to get rid of E.g. below race
>> condition.
>>
>> A: alloc path            B: idle loop
>>
>>                         spin_lock(&heap_lock)
>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>                         if _PGC_need_scrub is set, break;
>>                         spin_unlock(&heap_lock)
>>
>>                         if ( test_bit(_PGC_need_scrub, pg)
>>
>> ^^^^
>> spin_lock(&heap_lock)
>> delist page
>> spin_unlock(&heap_lock)
>>
>> write data to this page
>>
>>                          scrub_one_page(pg)
>>                          ^^^ will clean useful data
> 
> No (and I'm sure I said so before): The only problem is with the
> linked list itself; the page contents are not a problem - the
> allocation path can simply wait for the already suggested
> _PGC_scrubbing flag to clear before returning. And as already

The page contents are a problem if the race condition I mentioned in the
previous email happens.

Because there is a time window between checking the PGC_need_scrub flag
and doing the real scrub in the idle thread, the idle thread may still
scrub a page after that page has been allocated by the allocation path
and used (and had some useful data written to it).

> said (see above), by avoiding page_list_for_each() within the
> locked region you already significantly reduce lock contention.
> I.e. you need another means to find pages awaiting to be

Right, but I don't have a better idea besides using page_list_for_each()
or delisting pages temporarily from the heap list.

> scrubbed. You may want to leverage that the allocation path
> does the scrubbing if needed (e.g. by not scrubbing the first
> of any set of contiguous free pages on the idle path, linking up
> all other ones recognizing that their link fields are unused while
> on an order-greater-than-zero free list).

It sounds like another list has to be introduced, but I don't think this
helps to get rid of the lock contention or of a similar race condition.

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-25  0:42                 ` Bob Liu
@ 2014-07-25  6:51                   ` Jan Beulich
  2014-07-25  7:28                     ` Bob Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-07-25  6:51 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 25.07.14 at 02:42, <bob.liu@oracle.com> wrote:

> On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
>> 
>>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>>> After so many days I haven't make a workable solution if don't remove
>>>>> pages temporarily. The hardest part is iterating the heap free list
>>>>> without holding heap_lock because if holding the lock it might be heavy
>>>>> lock contention.
>>>>> So do you think it's acceptable if fixed all other concerns about this
>>>>> patch?
>>>>
>>>> No, I don't think so. Instead I'm of the opinion that you may have
>>>> worked in the wrong direction: Rather than not taking the heap lock
>>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>>> avoid long loops with the lock held).
>>>>
>>>
>>> But I still think have to drop pages from heap list temporarily else
>>> heap lock must be taken for a long time to get rid of E.g. below race
>>> condition.
>>>
>>> A: alloc path            B: idle loop
>>>
>>>                         spin_lock(&heap_lock)
>>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>>                         if _PGC_need_scrub is set, break;
>>>                         spin_unlock(&heap_lock)
>>>
>>>                         if ( test_bit(_PGC_need_scrub, pg)
>>>
>>> ^^^^
>>> spin_lock(&heap_lock)
>>> delist page
>>> spin_unlock(&heap_lock)
>>>
>>> write data to this page
>>>
>>>                          scrub_one_page(pg)
>>>                          ^^^ will clean useful data
>> 
>> No (and I'm sure I said so before): The only problem is with the
>> linked list itself; the page contents are not a problem - the
>> allocation path can simply wait for the already suggested
>> _PGC_scrubbing flag to clear before returning. And as already
> 
> The page contents are a problem if the race condition I mentioned in
> previous email happen.
> 
> Because there is a time window between checking the PGC_need_scrub flag
> and doing the real scrub in idle thread, the idle thread will still
> scrub a page after that page have been allocated by allocation path and
> been used(and have been written some useful data).

Did you really read all of my previous reply?

>> said (see above), by avoiding page_list_for_each() within the
>> locked region you already significantly reduce lock contention.
>> I.e. you need another means to find pages awaiting to be
> 
> Right, but I don't have better ideas beside using page_list_for_each()
> or delist pages temporarily from heap list.
> 
>> scrubbed. You may want to leverage that the allocation path
>> does the scrubbing if needed (e.g. by not scrubbing the first
>> of any set of contiguous free pages on the idle path, linking up
>> all other ones recognizing that their link fields are unused while
>> on an order-greater-than-zero free list).
> 
> It sounds like another list have to be introduced, but I don't think
> this can help to get rid of lock contention or the similar race condition.

Yeah, you need to get a little creative here...

Jan

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-25  6:51                   ` Jan Beulich
@ 2014-07-25  7:28                     ` Bob Liu
  2014-07-25  7:36                       ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-25  7:28 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/25/2014 02:51 PM, Jan Beulich wrote:
>>>> On 25.07.14 at 02:42, <bob.liu@oracle.com> wrote:
> 
>> On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
>>>
>>>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>>>> After so many days I haven't make a workable solution if don't remove
>>>>>> pages temporarily. The hardest part is iterating the heap free list
>>>>>> without holding heap_lock because if holding the lock it might be heavy
>>>>>> lock contention.
>>>>>> So do you think it's acceptable if fixed all other concerns about this
>>>>>> patch?
>>>>>
>>>>> No, I don't think so. Instead I'm of the opinion that you may have
>>>>> worked in the wrong direction: Rather than not taking the heap lock
>>>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>>>> avoid long loops with the lock held).
>>>>>
>>>>
>>>> But I still think have to drop pages from heap list temporarily else
>>>> heap lock must be taken for a long time to get rid of E.g. below race
>>>> condition.
>>>>
>>>> A: alloc path            B: idle loop
>>>>
>>>>                         spin_lock(&heap_lock)
>>>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>>>                         if _PGC_need_scrub is set, break;
>>>>                         spin_unlock(&heap_lock)
>>>>
>>>>                         if ( test_bit(_PGC_need_scrub, pg)
>>>>
>>>> ^^^^
>>>> spin_lock(&heap_lock)
>>>> delist page
>>>> spin_unlock(&heap_lock)
>>>>
>>>> write data to this page
>>>>
>>>>                          scrub_one_page(pg)
>>>>                          ^^^ will clean useful data
>>>
>>> No (and I'm sure I said so before): The only problem is with the
>>> linked list itself; the page contents are not a problem - the
>>> allocation path can simply wait for the already suggested
>>> _PGC_scrubbing flag to clear before returning. And as already
>>
>> The page contents are a problem if the race condition I mentioned in
>> previous email happen.
>>
>> Because there is a time window between checking the PGC_need_scrub flag
>> and doing the real scrub in idle thread, the idle thread will still
>> scrub a page after that page have been allocated by allocation path and
>> been used(and have been written some useful data).
> 
> Did you really read all of my previous reply?
> 

Sure, maybe I misunderstood your reply.
If the allocation path can wait for the flag there is no problem, but I
remember you suggested doing the scrubbing also in the allocation path,
in which case I think this race condition will happen.

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-25  7:28                     ` Bob Liu
@ 2014-07-25  7:36                       ` Jan Beulich
  2014-07-25  8:18                         ` Bob Liu
  0 siblings, 1 reply; 25+ messages in thread
From: Jan Beulich @ 2014-07-25  7:36 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 25.07.14 at 09:28, <bob.liu@oracle.com> wrote:

> On 07/25/2014 02:51 PM, Jan Beulich wrote:
>>>>> On 25.07.14 at 02:42, <bob.liu@oracle.com> wrote:
>> 
>>> On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
>>>>
>>>>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>>>>> After so many days I haven't make a workable solution if don't remove
>>>>>>> pages temporarily. The hardest part is iterating the heap free list
>>>>>>> without holding heap_lock because if holding the lock it might be heavy
>>>>>>> lock contention.
>>>>>>> So do you think it's acceptable if fixed all other concerns about this
>>>>>>> patch?
>>>>>>
>>>>>> No, I don't think so. Instead I'm of the opinion that you may have
>>>>>> worked in the wrong direction: Rather than not taking the heap lock
>>>>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>>>>> avoid long loops with the lock held).
>>>>>>
>>>>>
>>>>> But I still think have to drop pages from heap list temporarily else
>>>>> heap lock must be taken for a long time to get rid of E.g. below race
>>>>> condition.
>>>>>
>>>>> A: alloc path            B: idle loop
>>>>>
>>>>>                         spin_lock(&heap_lock)
>>>>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>>>>                         if _PGC_need_scrub is set, break;
>>>>>                         spin_unlock(&heap_lock)
>>>>>
>>>>>                         if ( test_bit(_PGC_need_scrub, pg)
>>>>>
>>>>> ^^^^
>>>>> spin_lock(&heap_lock)
>>>>> delist page
>>>>> spin_unlock(&heap_lock)
>>>>>
>>>>> write data to this page
>>>>>
>>>>>                          scrub_one_page(pg)
>>>>>                          ^^^ will clean useful data
>>>>
>>>> No (and I'm sure I said so before): The only problem is with the
>>>> linked list itself; the page contents are not a problem - the
>>>> allocation path can simply wait for the already suggested
>>>> _PGC_scrubbing flag to clear before returning. And as already
>>>
>>> The page contents are a problem if the race condition I mentioned in
>>> previous email happen.
>>>
>>> Because there is a time window between checking the PGC_need_scrub flag
>>> and doing the real scrub in idle thread, the idle thread will still
>>> scrub a page after that page have been allocated by allocation path and
>>> been used(and have been written some useful data).
>> 
>> Did you really read all of my previous reply?
> 
> Sure, may be I misunderstood your reply.
> If the allocation path can wait for the flag there is no problem, but I
> remember you suggested to do the scrubbing also in allocation path in
> which case I think this race condition will happen.

Two CPUs scrubbing the same page simultaneously is not a
correctness problem, only a performance one: The ultimate result
is a scrubbed page, but there may be unnecessary bus contention.

Jan

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-25  7:36                       ` Jan Beulich
@ 2014-07-25  8:18                         ` Bob Liu
  2014-07-25  8:28                           ` Jan Beulich
  0 siblings, 1 reply; 25+ messages in thread
From: Bob Liu @ 2014-07-25  8:18 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel


On 07/25/2014 03:36 PM, Jan Beulich wrote:
>>>> On 25.07.14 at 09:28, <bob.liu@oracle.com> wrote:
> 
>> On 07/25/2014 02:51 PM, Jan Beulich wrote:
>>>>>> On 25.07.14 at 02:42, <bob.liu@oracle.com> wrote:
>>>
>>>> On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>>>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
>>>>>
>>>>>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>>>>>> After so many days I haven't make a workable solution if don't remove
>>>>>>>> pages temporarily. The hardest part is iterating the heap free list
>>>>>>>> without holding heap_lock because if holding the lock it might be heavy
>>>>>>>> lock contention.
>>>>>>>> So do you think it's acceptable if fixed all other concerns about this
>>>>>>>> patch?
>>>>>>>
>>>>>>> No, I don't think so. Instead I'm of the opinion that you may have
>>>>>>> worked in the wrong direction: Rather than not taking the heap lock
>>>>>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>>>>>> avoid long loops with the lock held).
>>>>>>>
>>>>>>
>>>>>> But I still think have to drop pages from heap list temporarily else
>>>>>> heap lock must be taken for a long time to get rid of E.g. below race
>>>>>> condition.
>>>>>>
>>>>>> A: alloc path            B: idle loop
>>>>>>
>>>>>>                         spin_lock(&heap_lock)
>>>>>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>>>>>                         if _PGC_need_scrub is set, break;
>>>>>>                         spin_unlock(&heap_lock)
>>>>>>
>>>>>>                         if ( test_bit(_PGC_need_scrub, pg)
>>>>>>
>>>>>> ^^^^
>>>>>> spin_lock(&heap_lock)
>>>>>> delist page
>>>>>> spin_unlock(&heap_lock)
>>>>>>
>>>>>> write data to this page
>>>>>>
>>>>>>                          scrub_one_page(pg)
>>>>>>                          ^^^ will clean useful data
>>>>>
>>>>> No (and I'm sure I said so before): The only problem is with the
>>>>> linked list itself; the page contents are not a problem - the
>>>>> allocation path can simply wait for the already suggested
>>>>> _PGC_scrubbing flag to clear before returning. And as already
>>>>
>>>> The page contents are a problem if the race condition I mentioned in
>>>> previous email happen.
>>>>
>>>> Because there is a time window between checking the PGC_need_scrub flag
>>>> and doing the real scrub in idle thread, the idle thread will still
>>>> scrub a page after that page have been allocated by allocation path and
>>>> been used(and have been written some useful data).
>>>
>>> Did you really read all of my previous reply?
>>
>> Sure, may be I misunderstood your reply.
>> If the allocation path can wait for the flag there is no problem, but I
>> remember you suggested to do the scrubbing also in allocation path in
>> which case I think this race condition will happen.
> 
> Two CPUs scrubbing the same page simultaneously is not a
> correctness problem, only a performance one: The ultimate result
> is a scrubbed page, but there may be unnecessary bus contention.
> 

Thank you for your patience!

If CPU0 scrubs a page in the allocation path, that page is then allocated
successfully and some data is written to it, but CPU1 doesn't recognize
this situation and will scrub the same page again, which will clear the
useful data by mistake.

The race condition is like this:

CPU0:                   CPU1:
allocation path         idle vcpu thread

                        find a need_scrub page from heap list
                        if ( test_bit(_PGC_need_scrub) )
^^^
spin_lock(&heap_lock)
delist the same page
spin_unlock(&heap_lock)
scrub the page

write data to this
page when a user
gets this page

                           ^^^^
                           scrub the page
                           clear need_scrub flag

When CPU1 tries to do the scrubbing, the page has already been allocated
and written by CPU0; that's a seriously bad situation.

Currently I still don't have a better idea for getting rid of all of those
problems without dropping pages from the heap list temporarily.

-- 
Regards,
-Bob

* Re: [PATCH v2 3/3] xen: use idle vcpus to scrub pages
  2014-07-25  8:18                         ` Bob Liu
@ 2014-07-25  8:28                           ` Jan Beulich
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Beulich @ 2014-07-25  8:28 UTC (permalink / raw)
  To: Bob Liu
  Cc: Bob Liu, keir, ian.campbell, George.Dunlap, andrew.cooper3, xen-devel

>>> On 25.07.14 at 10:18, <bob.liu@oracle.com> wrote:

> On 07/25/2014 03:36 PM, Jan Beulich wrote:
>>>>> On 25.07.14 at 09:28, <bob.liu@oracle.com> wrote:
>> 
>>> On 07/25/2014 02:51 PM, Jan Beulich wrote:
>>>>>>> On 25.07.14 at 02:42, <bob.liu@oracle.com> wrote:
>>>>
>>>>> On 07/24/2014 02:24 PM, Jan Beulich wrote:
>>>>>>>>> On 24.07.14 at 04:08, <bob.liu@oracle.com> wrote:
>>>>>>
>>>>>>> On 07/23/2014 03:28 PM, Jan Beulich wrote:
>>>>>>>>>>> On 15.07.14 at 11:16, <bob.liu@oracle.com> wrote:
>>>>>>>>> After so many days I haven't make a workable solution if don't remove
>>>>>>>>> pages temporarily. The hardest part is iterating the heap free list
>>>>>>>>> without holding heap_lock because if holding the lock it might be heavy
>>>>>>>>> lock contention.
>>>>>>>>> So do you think it's acceptable if fixed all other concerns about this
>>>>>>>>> patch?
>>>>>>>>
>>>>>>>> No, I don't think so. Instead I'm of the opinion that you may have
>>>>>>>> worked in the wrong direction: Rather than not taking the heap lock
>>>>>>>> at all, it may also be sufficient to shrink the lock holding time (i.e.
>>>>>>>> avoid long loops with the lock held).
>>>>>>>>
>>>>>>>
>>>>>>> But I still think have to drop pages from heap list temporarily else
>>>>>>> heap lock must be taken for a long time to get rid of E.g. below race
>>>>>>> condition.
>>>>>>>
>>>>>>> A: alloc path            B: idle loop
>>>>>>>
>>>>>>>                         spin_lock(&heap_lock)
>>>>>>>                       page_list_for_each( pg, &heap(node, zone, order) )
>>>>>>>                         if _PGC_need_scrub is set, break;
>>>>>>>                         spin_unlock(&heap_lock)
>>>>>>>
>>>>>>>                         if ( test_bit(_PGC_need_scrub, pg)
>>>>>>>
>>>>>>> ^^^^
>>>>>>> spin_lock(&heap_lock)
>>>>>>> delist page
>>>>>>> spin_unlock(&heap_lock)
>>>>>>>
>>>>>>> write data to this page
>>>>>>>
>>>>>>>                          scrub_one_page(pg)
>>>>>>>                          ^^^ will clean useful data
>>>>>>
>>>>>> No (and I'm sure I said so before): The only problem is with the
>>>>>> linked list itself; the page contents are not a problem - the
>>>>>> allocation path can simply wait for the already suggested
>>>>>> _PGC_scrubbing flag to clear before returning. And as already
>>>>>
>>>>> The page contents are a problem if the race condition I mentioned in
>>>>> previous email happen.
>>>>>
>>>>> Because there is a time window between checking the PGC_need_scrub flag
>>>>> and doing the real scrub in idle thread, the idle thread will still
>>>>> scrub a page after that page have been allocated by allocation path and
>>>>> been used(and have been written some useful data).
>>>>
>>>> Did you really read all of my previous reply?
>>>
>>> Sure, may be I misunderstood your reply.
>>> If the allocation path can wait for the flag there is no problem, but I
>>> remember you suggested to do the scrubbing also in allocation path in
>>> which case I think this race condition will happen.
>> 
>> Two CPUs scrubbing the same page simultaneously is not a
>> correctness problem, only a performance one: The ultimate result
>> is a scrubbed page, but there may be unnecessary bus contention.
>> 
> 
> Thank you for your patient!
> 
> If CPU0 scrubbed a page in allocation path and then that page was
> allocated successfully and then been written some data, but CPU1 didn't
> recognize this situation and will scrub the same page again which will
> clear the useful data by mistake.

Did you read the earlier "Did you really read all of my previous reply?"
and then go back and _read_ what was written there?

Jan

end of thread

Thread overview: 25+ messages
2014-06-30 13:39 [PATCH v2 1/3] xen: delay page scrubbing to allocation path Bob Liu
2014-06-30 13:39 ` [PATCH v2 2/3] xen: introduce function merge_free_trunks Bob Liu
2014-06-30 15:58   ` Jan Beulich
2014-07-01  8:14     ` Bob Liu
2014-07-01  8:27       ` Jan Beulich
2014-06-30 13:39 ` [PATCH v2 3/3] xen: use idle vcpus to scrub pages Bob Liu
2014-07-01  9:12   ` Jan Beulich
2014-07-01 12:25     ` Bob Liu
2014-07-01 12:59       ` Jan Beulich
2014-07-02  6:27         ` Bob Liu
2014-07-07 12:20           ` Bob Liu
2014-07-15  9:16         ` Bob Liu
2014-07-23  0:38           ` Konrad Rzeszutek Wilk
2014-07-23  1:30             ` Bob Liu
2014-07-23  7:28           ` Jan Beulich
2014-07-24  2:08             ` Bob Liu
2014-07-24  6:24               ` Jan Beulich
2014-07-25  0:42                 ` Bob Liu
2014-07-25  6:51                   ` Jan Beulich
2014-07-25  7:28                     ` Bob Liu
2014-07-25  7:36                       ` Jan Beulich
2014-07-25  8:18                         ` Bob Liu
2014-07-25  8:28                           ` Jan Beulich
2014-06-30 15:56 ` [PATCH v2 1/3] xen: delay page scrubbing to allocation path Jan Beulich
2014-07-01  8:12   ` Bob Liu
