On Fri, 2020-02-07 at 17:06 +0000, David Woodhouse wrote: > On 7 February 2020 16:40:01 GMT, "Xia, Hongyan" wrote: > > On Fri, 2020-02-07 at 16:32 +0000, David Woodhouse wrote: > > > > > > ... > > > > > > Ideally not because init_heap_pages() then calls free_heap_pages() > > > and the "recursion" is best avoided. > > > > Kind of depends on where we change its pg to initialised. If we do that > > in free_heap_pages this does recurse, but if it is done in > > init_heap_pages before calling free it does not. > > True. It would *look* like it might recurse, but never should in practice. Looks like this. Less duplication of the 'if (uninitialised) init_heap_pages() else free_heap_pages()' construct, more scope for people to naïvely complain that it "recurses". I think I prefer it this way. From: David Woodhouse Date: Fri, 7 Feb 2020 13:01:36 +0000 Subject: [PATCH] xen/mm: Introduce PGC_state_uninitialised It is possible for pages to enter general circulation without ever being processed by init_heap_pages(). For example, pages of the multiboot module containing the initramfs may be assigned via assign_pages() to dom0 as it is created. And some code including map_pages_to_xen() has checks on 'system_state' to determine whether to use the boot or the heap allocator, but it seems impossible to prove that pages allocated by the boot allocator are not subsequently freed with free_heap_pages(). This actually works fine in the majority of cases; there are only a few esoteric corner cases which init_heap_pages() handles before handing the page range off to free_heap_pages(): • Excluding MFN #0 to avoid inappropriate cross-zone merging. • Ensuring that the node information structures exist, when the first page(s) of a given node are handled. • High order allocations crossing from one node to another. To handle this case, shift PGC_state_inuse from its current value of zero, to another value. Use zero, which is the initial state of the entire frame table, as PGC_state_uninitialised. 
Fix a couple of assertions which were assuming that PGC_state_inuse is zero, and make them cope with the PGC_state_uninitialised case too where appropriate. Finally, make free_heap_pages() call through to init_heap_pages() when given a page range which has not been initialised. This cannot keep recursing because init_heap_pages() will set each page state to PGC_state_inuse before passing it back to free_heap_pages() for the second time. Signed-off-by: David Woodhouse --- xen/arch/x86/mm.c | 3 ++- xen/common/page_alloc.c | 40 ++++++++++++++++++++++++++++------------ xen/include/asm-arm/mm.h | 3 ++- xen/include/asm-x86/mm.h | 3 ++- 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 9b33829084..bf660ee8eb 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -488,7 +488,8 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d, page_set_owner(page, d); smp_wmb(); /* install valid domain ptr before updating refcnt. */ - ASSERT((page->count_info & ~PGC_xen_heap) == 0); + ASSERT((page->count_info & ~PGC_xen_heap) == PGC_state_inuse || + (page->count_info & ~PGC_xen_heap) == PGC_state_uninitialised); /* Only add to the allocation list if the domain isn't dying. 
*/ if ( !d->is_dying ) diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 4084503554..95984d6de0 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -252,6 +252,8 @@ struct bootmem_region { static struct bootmem_region __initdata bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)]; static unsigned int __initdata nr_bootmem_regions; +static void init_heap_pages(struct page_info *pg, unsigned long nr_pages, + bool scrub); struct scrub_region { unsigned long offset; @@ -1389,6 +1391,16 @@ static void free_heap_pages( ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); + if ( page_state_is(pg, uninitialised) ) + { + init_heap_pages(pg, 1 << order, need_scrub); + /* + * init_heap_pages() will call back into free_heap_pages() for + * each page but cannot keep recursing because each page will + * be set to PGC_state_inuse first. + */ + return; + } spin_lock(&heap_lock); for ( i = 0; i < (1 << order); i++ ) @@ -1780,11 +1792,10 @@ int query_page_offline(mfn_t mfn, uint32_t *status) * latter is not on a MAX_ORDER boundary, then we reserve the page by * not freeing it to the buddy allocator. 
*/ -static void init_heap_pages( - struct page_info *pg, unsigned long nr_pages) +static void init_heap_pages(struct page_info *pg, unsigned long nr_pages, + bool scrub) { unsigned long i; - bool idle_scrub = false; /* * Keep MFN 0 away from the buddy allocator to avoid crossing zone @@ -1809,7 +1820,7 @@ static void init_heap_pages( spin_unlock(&heap_lock); if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE ) - idle_scrub = true; + scrub = true; for ( i = 0; i < nr_pages; i++ ) { @@ -1837,7 +1848,8 @@ static void init_heap_pages( nr_pages -= n; } - free_heap_pages(pg + i, 0, scrub_debug || idle_scrub); + pg[i].count_info = PGC_state_inuse; + free_heap_pages(pg + i, 0, scrub_debug || scrub); } } @@ -1873,7 +1885,7 @@ void __init end_boot_allocator(void) if ( (r->s < r->e) && (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) ) { - init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s); + init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false); r->e = r->s; break; } @@ -1882,7 +1894,7 @@ void __init end_boot_allocator(void) { struct bootmem_region *r = &bootmem_region_list[i]; if ( r->s < r->e ) - init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s); + init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false); } nr_bootmem_regions = 0; @@ -2151,7 +2163,7 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe) memguard_guard_range(maddr_to_virt(ps), pe - ps); - init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT); + init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT, false); } @@ -2260,7 +2275,7 @@ void init_domheap_pages(paddr_t ps, paddr_t pe) if ( mfn_x(emfn) <= mfn_x(smfn) ) return; - init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn)); + init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn), false); } @@ -2301,10 +2316,11 @@ int assign_pages( for ( i = 0; i < (1 << order); i++ ) { ASSERT(page_get_owner(&pg[i]) == NULL); - ASSERT(!pg[i].count_info); + ASSERT(pg[i].count_info == PGC_state_inuse || + 
pg[i].count_info == PGC_state_uninitialised); page_set_owner(&pg[i], d); smp_wmb(); /* Domain pointer must be visible before updating refcnt. */ - pg[i].count_info = PGC_allocated | 1; + pg[i].count_info |= PGC_allocated | 1; page_list_add_tail(&pg[i], &d->page_list); } diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h index c9466c8ca0..c696941600 100644 --- a/xen/include/asm-arm/mm.h +++ b/xen/include/asm-arm/mm.h @@ -117,12 +117,13 @@ struct page_info * { inuse, offlining, offlined, free, broken_offlining, broken } */ #define PGC_state PG_mask(7, 9) -#define PGC_state_inuse PG_mask(0, 9) +#define PGC_state_uninitialised PG_mask(0, 9) #define PGC_state_offlining PG_mask(1, 9) #define PGC_state_offlined PG_mask(2, 9) #define PGC_state_free PG_mask(3, 9) #define PGC_state_broken_offlining PG_mask(4, 9) #define PGC_state_broken PG_mask(5, 9) +#define PGC_state_inuse PG_mask(6, 9) #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) #define page_is_broken(pg) (page_state_is((pg), broken_offlining) || \ diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 3edadf7a7c..645368e6a9 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -72,12 +72,13 @@ * { inuse, offlining, offlined, free, broken_offlining, broken } */ #define PGC_state PG_mask(7, 9) -#define PGC_state_inuse PG_mask(0, 9) +#define PGC_state_uninitialised PG_mask(0, 9) #define PGC_state_offlining PG_mask(1, 9) #define PGC_state_offlined PG_mask(2, 9) #define PGC_state_free PG_mask(3, 9) #define PGC_state_broken_offlining PG_mask(4, 9) #define PGC_state_broken PG_mask(5, 9) +#define PGC_state_inuse PG_mask(6, 9) #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) #define page_is_broken(pg) (page_state_is((pg), broken_offlining) || \ -- 2.17.1