From: David Woodhouse <dwmw2@infradead.org>
To: xen-devel@lists.xenproject.org
Cc: "Stefano Stabellini" <sstabellini@kernel.org>,
"Julien Grall" <julien@xen.org>, "Wei Liu" <wl@xen.org>,
"Andrew Cooper" <andrew.cooper3@citrix.com>,
"Ian Jackson" <ian.jackson@eu.citrix.com>,
"George Dunlap" <george.dunlap@citrix.com>,
hongyxia@amazon.com, "Jan Beulich" <jbeulich@suse.com>,
"Volodymyr Babchuk" <Volodymyr_Babchuk@epam.com>,
"Roger Pau Monné" <roger.pau@citrix.com>
Subject: [Xen-devel] [PATCH 2/2] xen/mm: Introduce PGC_state_uninitialised
Date: Thu, 19 Mar 2020 21:21:50 +0000
Message-ID: <20200319212150.2651419-2-dwmw2@infradead.org>
In-Reply-To: <20200319212150.2651419-1-dwmw2@infradead.org>
From: David Woodhouse <dwmw@amazon.co.uk>
It is possible for pages to enter general circulation without ever
being processed by init_heap_pages().
For example, pages of the multiboot module containing the initramfs may
be assigned via assign_pages() to dom0 as it is created. And some code
including map_pages_to_xen() has checks on 'system_state' to determine
whether to use the boot or the heap allocator, but it seems impossible
to prove that pages allocated by the boot allocator are not subsequently
freed with free_heap_pages().
This actually works fine in the majority of cases; there are only a few
esoteric corner cases which init_heap_pages() handles before handing the
page range off to free_heap_pages():
• Excluding MFN #0 to avoid inappropriate cross-zone merging.
• Ensuring that the node information structures exist, when the first
page(s) of a given node are handled.
• High order allocations crossing from one node to another.
To handle this, shift PGC_state_inuse from its current value of
zero to another value. Use zero, which is the initial state of the
entire frame table, as PGC_state_uninitialised.
Fix a couple of assertions which were assuming that PGC_state_inuse is
zero, and make them cope with the PGC_state_uninitialised case too,
where appropriate.
Finally, make free_heap_pages() call through to init_heap_pages() when
given a page range which has not been initialised. This cannot keep
recursing because init_heap_pages() will set each page state to
PGC_state_inuse before passing it back to free_heap_pages() for the
second time.
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
xen/arch/x86/mm.c | 3 ++-
xen/common/page_alloc.c | 44 +++++++++++++++++++++++++++++-----------
xen/include/asm-arm/mm.h | 3 ++-
xen/include/asm-x86/mm.h | 3 ++-
4 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 62507ca651..5f0581c072 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -491,7 +491,8 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
page_set_owner(page, d);
smp_wmb(); /* install valid domain ptr before updating refcnt. */
- ASSERT((page->count_info & ~PGC_xen_heap) == 0);
+ ASSERT((page->count_info & ~PGC_xen_heap) == PGC_state_inuse ||
+ (page->count_info & ~PGC_xen_heap) == PGC_state_uninitialised);
/* Only add to the allocation list if the domain isn't dying. */
if ( !d->is_dying )
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 8d72a64f4e..4f7971f2a1 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -252,6 +252,8 @@ struct bootmem_region {
static struct bootmem_region __initdata
bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)];
static unsigned int __initdata nr_bootmem_regions;
+static void init_heap_pages(struct page_info *pg, unsigned long nr_pages,
+ bool scrub);
struct scrub_region {
unsigned long offset;
@@ -1390,6 +1392,17 @@ static void free_heap_pages(
ASSERT(order <= MAX_ORDER);
ASSERT(node >= 0);
+ if ( page_state_is(pg, uninitialised) )
+ {
+ init_heap_pages(pg, 1 << order, need_scrub);
+ /*
+ * init_heap_pages() will call back into free_heap_pages() for
+ * each page but cannot keep recursing because each page will
+ * be set to PGC_state_inuse first.
+ */
+ return;
+ }
+
spin_lock(&heap_lock);
for ( i = 0; i < (1 << order); i++ )
@@ -1771,11 +1784,10 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
* latter is not on a MAX_ORDER boundary, then we reserve the page by
* not freeing it to the buddy allocator.
*/
-static void init_heap_pages(
- struct page_info *pg, unsigned long nr_pages)
+static void init_heap_pages(struct page_info *pg, unsigned long nr_pages,
+ bool scrub)
{
unsigned long i;
- bool idle_scrub = false;
/*
* Keep MFN 0 away from the buddy allocator to avoid crossing zone
@@ -1800,7 +1812,7 @@ static void init_heap_pages(
spin_unlock(&heap_lock);
if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
- idle_scrub = true;
+ scrub = true;
for ( i = 0; i < nr_pages; i++ )
{
@@ -1828,7 +1840,8 @@ static void init_heap_pages(
nr_pages -= n;
}
- free_heap_pages(pg + i, 0, scrub_debug || idle_scrub);
+ pg[i].count_info = PGC_state_inuse;
+ free_heap_pages(pg + i, 0, scrub_debug || scrub);
}
}
@@ -1864,7 +1877,7 @@ void __init end_boot_allocator(void)
if ( (r->s < r->e) &&
(phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
{
- init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
+ init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false);
r->e = r->s;
break;
}
@@ -1873,7 +1886,7 @@ void __init end_boot_allocator(void)
{
struct bootmem_region *r = &bootmem_region_list[i];
if ( r->s < r->e )
- init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
+ init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false);
}
nr_bootmem_regions = 0;
@@ -2142,7 +2155,7 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe)
memguard_guard_range(maddr_to_virt(ps), pe - ps);
- init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
+ init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT, false);
}
@@ -2251,7 +2264,7 @@ void init_domheap_pages(paddr_t ps, paddr_t pe)
if ( mfn_x(emfn) <= mfn_x(smfn) )
return;
- init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn));
+ init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn), false);
}
@@ -2280,7 +2293,8 @@ int assign_pages(
for ( i = 0; i < (1ul << order); i++ )
{
- ASSERT(!(pg[i].count_info & ~PGC_extra));
+ ASSERT((pg[i].count_info & ~PGC_extra) == PGC_state_inuse ||
+ (pg[i].count_info & ~PGC_extra) == PGC_state_uninitialised);
if ( pg[i].count_info & PGC_extra )
extra_pages++;
}
@@ -2316,10 +2330,16 @@ int assign_pages(
for ( i = 0; i < (1 << order); i++ )
{
ASSERT(page_get_owner(&pg[i]) == NULL);
+ /*
+ * Note: Not using page_state_is() here. The ASSERT requires that
+ * all other bits in count_info are zero, in addition to PGC_state
+ * being appropriate.
+ */
+ ASSERT((pg[i].count_info & ~PGC_extra) == PGC_state_inuse ||
+ (pg[i].count_info & ~PGC_extra) == PGC_state_uninitialised);
page_set_owner(&pg[i], d);
smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
- pg[i].count_info =
- (pg[i].count_info & PGC_extra) | PGC_allocated | 1;
+ pg[i].count_info = (pg[i].count_info & PGC_state) | PGC_allocated | 1;
page_list_add_tail(&pg[i], &d->page_list);
}
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index a877791d1c..49663fa98a 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -113,12 +113,13 @@ struct page_info
* { inuse, offlining, offlined, free, broken_offlining, broken }
*/
#define PGC_state PG_mask(7, 9)
-#define PGC_state_inuse PG_mask(0, 9)
+#define PGC_state_uninitialised PG_mask(0, 9)
#define PGC_state_offlining PG_mask(1, 9)
#define PGC_state_offlined PG_mask(2, 9)
#define PGC_state_free PG_mask(3, 9)
#define PGC_state_broken_offlining PG_mask(4, 9) /* Broken and offlining */
#define PGC_state_broken PG_mask(5, 9) /* Broken and offlined */
+#define PGC_state_inuse PG_mask(6, 9)
#define pgc_is(pgc, st) (((pgc) & PGC_state) == PGC_state_##st)
#define page_state_is(pg, st) pgc_is((pg)->count_info, st)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 1203f1b179..5fbbca5f05 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -72,12 +72,13 @@
* { inuse, offlining, offlined, free, broken_offlining, broken }
*/
#define PGC_state PG_mask(7, 9)
-#define PGC_state_inuse PG_mask(0, 9)
+#define PGC_state_uninitialised PG_mask(0, 9)
#define PGC_state_offlining PG_mask(1, 9)
#define PGC_state_offlined PG_mask(2, 9)
#define PGC_state_free PG_mask(3, 9)
#define PGC_state_broken_offlining PG_mask(4, 9) /* Broken and offlining */
#define PGC_state_broken PG_mask(5, 9) /* Broken and offlined */
+#define PGC_state_inuse PG_mask(6, 9)
#define pgc_is(pgc, st) (((pgc) & PGC_state) == PGC_state_##st)
#define page_state_is(pg, st) pgc_is((pg)->count_info, st)
--
2.21.0
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel