From: Wei Liu <wei.liu2@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
	"Wei Liu" <wei.liu2@citrix.com>,
	"Jan Beulich" <JBeulich@suse.com>,
	"Roger Pau Monné" <roger.pau@citrix.com>
Subject: [PATCH v3 3/5] x86: split PV dom0 builder to pv/dom0_builder.c
Date: Mon, 20 Mar 2017 14:14:24 +0000
Message-ID: <20170320141426.20780-4-wei.liu2@citrix.com>
In-Reply-To: <20170320141426.20780-1-wei.liu2@citrix.com>

Long term we want to be able to disentangle PV and HVM code. Move the PV
domain builder to a dedicated file.

This in turn requires exposing a few functions and variables via a new
header dom0_build.h. These functions and variables are now prefixed with
"dom0_" where they weren't already.

No functional change.
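
For reference, the new xen/include/asm-x86/dom0_build.h is not quoted in
this excerpt; based purely on the renamed definitions in the diff below, a
rough sketch of the declarations it presumably carries would look something
like this (guard name and exact includes are guesses):

#ifndef _ASM_X86_DOM0_BUILD_H_
#define _ASM_X86_DOM0_BUILD_H_

#include <xen/libelf.h>
#include <xen/multiboot.h>
#include <xen/sched.h>

/* Shared allocation flags used by both the PV and PVH dom0 builders. */
extern unsigned int dom0_memflags;

unsigned long dom0_compute_nr_pages(struct domain *d,
                                    struct elf_dom_parms *parms,
                                    unsigned long initrd_len);
int dom0_setup_permissions(struct domain *d);
struct vcpu *dom0_setup_vcpu(struct domain *d, unsigned int vcpu_id,
                             unsigned int prev_cpu);

/* PV dom0 builder, now living in pv/dom0_build.c. */
int dom0_construct_pv(struct domain *d, const module_t *image,
                      unsigned long image_headroom, module_t *initrd,
                      void *(*bootstrap_map)(const module_t *),
                      char *cmdline);

#endif /* _ASM_X86_DOM0_BUILD_H_ */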

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
v3:
1. fix indentation
2. adapt to previous patch

v2:
1. put file under pv directory
2. use dom0_ prefix
3. header now in asm-x86
---
 xen/arch/x86/dom0_build.c        | 912 +-------------------------------------
 xen/arch/x86/pv/Makefile         |   1 +
 xen/arch/x86/pv/dom0_build.c     | 913 +++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/dom0_build.h |  33 ++
 4 files changed, 963 insertions(+), 896 deletions(-)
 create mode 100644 xen/arch/x86/pv/dom0_build.c
 create mode 100644 xen/include/asm-x86/dom0_build.h

diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
index 7ca847e19b..cb44a20792 100644
--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -28,6 +28,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
+#include <asm/dom0_build.h>
 #include <asm/i387.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
@@ -156,9 +157,9 @@ custom_param("dom0_nodes", parse_dom0_nodes);
 
 static cpumask_t __initdata dom0_cpus;
 
-static struct vcpu *__init setup_dom0_vcpu(struct domain *d,
-                                           unsigned int vcpu_id,
-                                           unsigned int prev_cpu)
+struct vcpu *__init dom0_setup_vcpu(struct domain *d,
+                                    unsigned int vcpu_id,
+                                    unsigned int prev_cpu)
 {
     unsigned int cpu = cpumask_cycle(prev_cpu, &dom0_cpus);
     struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
@@ -216,7 +217,7 @@ struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
         return NULL;
     dom0->max_vcpus = max_vcpus;
 
-    return setup_dom0_vcpu(dom0, 0,
+    return dom0_setup_vcpu(dom0, 0,
                            cpumask_last(&dom0_cpus) /* so it wraps around to first pcpu */);
 }
 
@@ -260,66 +261,7 @@ string_param("dom0_ioports_disable", opt_dom0_ioports_disable);
 static bool_t __initdata ro_hpet = 1;
 boolean_param("ro-hpet", ro_hpet);
 
-/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
-#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
-#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
-/* ... except for compatibility mode guests. */
-#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
-#define L2_PROT (BASE_PROT|_PAGE_DIRTY)
-#define L3_PROT (BASE_PROT|_PAGE_DIRTY)
-#define L4_PROT (BASE_PROT|_PAGE_DIRTY)
-
-static unsigned int __initdata memflags = MEMF_no_dma|MEMF_exact_node;
-
-static struct page_info * __init alloc_chunk(
-    struct domain *d, unsigned long max_pages)
-{
-    static unsigned int __initdata last_order = MAX_ORDER;
-    struct page_info *page;
-    unsigned int order = get_order_from_pages(max_pages), free_order;
-
-    if ( order > last_order )
-        order = last_order;
-    else if ( max_pages & (max_pages - 1) )
-        --order;
-    while ( (page = alloc_domheap_pages(d, order, memflags)) == NULL )
-        if ( order-- == 0 )
-            break;
-    if ( page )
-        last_order = order;
-    else if ( memflags )
-    {
-        /*
-         * Allocate up to 2MB at a time: It prevents allocating very large
-         * chunks from DMA pools before the >4GB pool is fully depleted.
-         */
-        last_order = 21 - PAGE_SHIFT;
-        memflags = 0;
-        return alloc_chunk(d, max_pages);
-    }
-
-    /*
-     * Make a reasonable attempt at finding a smaller chunk at a higher
-     * address, to avoid allocating from low memory as much as possible.
-     */
-    for ( free_order = order; !memflags && page && order--; )
-    {
-        struct page_info *pg2;
-
-        if ( d->tot_pages + (1 << order) > d->max_pages )
-            continue;
-        pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
-        if ( pg2 > page )
-        {
-            free_domheap_pages(page, free_order);
-            page = pg2;
-            free_order = order;
-        }
-        else if ( pg2 )
-            free_domheap_pages(pg2, order);
-    }
-    return page;
-}
+unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;
 
 static unsigned long __init dom0_paging_pages(const struct domain *d,
                                               unsigned long nr_pages)
@@ -332,7 +274,7 @@ static unsigned long __init dom0_paging_pages(const struct domain *d,
     return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
 }
 
-static unsigned long __init compute_dom0_nr_pages(
+unsigned long __init dom0_compute_nr_pages(
     struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
 {
     nodeid_t node;
@@ -469,199 +411,7 @@ static void __init process_dom0_ioports_disable(struct domain *dom0)
     }
 }
 
-static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
-                                   unsigned long mfn, unsigned long vphysmap_s)
-{
-    if ( !is_pv_32bit_domain(d) )
-        ((unsigned long *)vphysmap_s)[pfn] = mfn;
-    else
-        ((unsigned int *)vphysmap_s)[pfn] = mfn;
-
-    set_gpfn_from_mfn(mfn, pfn);
-}
-
-static __init void mark_pv_pt_pages_rdonly(struct domain *d,
-                                           l4_pgentry_t *l4start,
-                                           unsigned long vpt_start,
-                                           unsigned long nr_pt_pages)
-{
-    unsigned long count;
-    struct page_info *page;
-    l4_pgentry_t *pl4e;
-    l3_pgentry_t *pl3e;
-    l2_pgentry_t *pl2e;
-    l1_pgentry_t *pl1e;
-
-    pl4e = l4start + l4_table_offset(vpt_start);
-    pl3e = l4e_to_l3e(*pl4e);
-    pl3e += l3_table_offset(vpt_start);
-    pl2e = l3e_to_l2e(*pl3e);
-    pl2e += l2_table_offset(vpt_start);
-    pl1e = l2e_to_l1e(*pl2e);
-    pl1e += l1_table_offset(vpt_start);
-    for ( count = 0; count < nr_pt_pages; count++ )
-    {
-        l1e_remove_flags(*pl1e, _PAGE_RW);
-        page = mfn_to_page(l1e_get_pfn(*pl1e));
-
-        /* Read-only mapping + PGC_allocated + page-table page. */
-        page->count_info         = PGC_allocated | 3;
-        page->u.inuse.type_info |= PGT_validated | 1;
-
-        /* Top-level p.t. is pinned. */
-        if ( (page->u.inuse.type_info & PGT_type_mask) ==
-             (!is_pv_32bit_domain(d) ?
-              PGT_l4_page_table : PGT_l3_page_table) )
-        {
-            page->count_info        += 1;
-            page->u.inuse.type_info += 1 | PGT_pinned;
-        }
-
-        /* Iterate. */
-        if ( !((unsigned long)++pl1e & (PAGE_SIZE - 1)) )
-        {
-            if ( !((unsigned long)++pl2e & (PAGE_SIZE - 1)) )
-            {
-                if ( !((unsigned long)++pl3e & (PAGE_SIZE - 1)) )
-                    pl3e = l4e_to_l3e(*++pl4e);
-                pl2e = l3e_to_l2e(*pl3e);
-            }
-            pl1e = l2e_to_l1e(*pl2e);
-        }
-    }
-}
-
-static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
-                                    unsigned long v_start, unsigned long v_end,
-                                    unsigned long vphysmap_start,
-                                    unsigned long vphysmap_end,
-                                    unsigned long nr_pages)
-{
-    struct page_info *page = NULL;
-    l4_pgentry_t *pl4e, *l4start = map_domain_page(_mfn(pgtbl_pfn));
-    l3_pgentry_t *pl3e = NULL;
-    l2_pgentry_t *pl2e = NULL;
-    l1_pgentry_t *pl1e = NULL;
-
-    if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
-        panic("DOM0 P->M table overlaps initial mapping");
-
-    while ( vphysmap_start < vphysmap_end )
-    {
-        if ( d->tot_pages + ((round_pgup(vphysmap_end) - vphysmap_start)
-                             >> PAGE_SHIFT) + 3 > nr_pages )
-            panic("Dom0 allocation too small for initial P->M table");
-
-        if ( pl1e )
-        {
-            unmap_domain_page(pl1e);
-            pl1e = NULL;
-        }
-        if ( pl2e )
-        {
-            unmap_domain_page(pl2e);
-            pl2e = NULL;
-        }
-        if ( pl3e )
-        {
-            unmap_domain_page(pl3e);
-            pl3e = NULL;
-        }
-        pl4e = l4start + l4_table_offset(vphysmap_start);
-        if ( !l4e_get_intpte(*pl4e) )
-        {
-            page = alloc_domheap_page(d, 0);
-            if ( !page )
-                break;
-
-            /* No mapping, PGC_allocated + page-table page. */
-            page->count_info = PGC_allocated | 2;
-            page->u.inuse.type_info = PGT_l3_page_table | PGT_validated | 1;
-            pl3e = __map_domain_page(page);
-            clear_page(pl3e);
-            *pl4e = l4e_from_page(page, L4_PROT);
-        } else
-            pl3e = map_domain_page(_mfn(l4e_get_pfn(*pl4e)));
-
-        pl3e += l3_table_offset(vphysmap_start);
-        if ( !l3e_get_intpte(*pl3e) )
-        {
-            if ( cpu_has_page1gb &&
-                 !(vphysmap_start & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
-                 vphysmap_end >= vphysmap_start + (1UL << L3_PAGETABLE_SHIFT) &&
-                 (page = alloc_domheap_pages(d,
-                                             L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                             0)) != NULL )
-            {
-                *pl3e = l3e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
-                vphysmap_start += 1UL << L3_PAGETABLE_SHIFT;
-                continue;
-            }
-            if ( (page = alloc_domheap_page(d, 0)) == NULL )
-                break;
-
-            /* No mapping, PGC_allocated + page-table page. */
-            page->count_info = PGC_allocated | 2;
-            page->u.inuse.type_info = PGT_l2_page_table | PGT_validated | 1;
-            pl2e = __map_domain_page(page);
-            clear_page(pl2e);
-            *pl3e = l3e_from_page(page, L3_PROT);
-        }
-        else
-            pl2e = map_domain_page(_mfn(l3e_get_pfn(*pl3e)));
-
-        pl2e += l2_table_offset(vphysmap_start);
-        if ( !l2e_get_intpte(*pl2e) )
-        {
-            if ( !(vphysmap_start & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
-                 vphysmap_end >= vphysmap_start + (1UL << L2_PAGETABLE_SHIFT) &&
-                 (page = alloc_domheap_pages(d,
-                                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                             0)) != NULL )
-            {
-                *pl2e = l2e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
-                if ( opt_allow_superpage )
-                    get_superpage(page_to_mfn(page), d);
-                vphysmap_start += 1UL << L2_PAGETABLE_SHIFT;
-                continue;
-            }
-            if ( (page = alloc_domheap_page(d, 0)) == NULL )
-                break;
-
-            /* No mapping, PGC_allocated + page-table page. */
-            page->count_info = PGC_allocated | 2;
-            page->u.inuse.type_info = PGT_l1_page_table | PGT_validated | 1;
-            pl1e = __map_domain_page(page);
-            clear_page(pl1e);
-            *pl2e = l2e_from_page(page, L2_PROT);
-        }
-        else
-            pl1e = map_domain_page(_mfn(l2e_get_pfn(*pl2e)));
-
-        pl1e += l1_table_offset(vphysmap_start);
-        BUG_ON(l1e_get_intpte(*pl1e));
-        page = alloc_domheap_page(d, 0);
-        if ( !page )
-            break;
-
-        *pl1e = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
-        vphysmap_start += PAGE_SIZE;
-        vphysmap_start &= PAGE_MASK;
-    }
-    if ( !page )
-        panic("Not enough RAM for DOM0 P->M table");
-
-    if ( pl1e )
-        unmap_domain_page(pl1e);
-    if ( pl2e )
-        unmap_domain_page(pl2e);
-    if ( pl3e )
-        unmap_domain_page(pl3e);
-
-    unmap_domain_page(l4start);
-}
-
-static int __init setup_permissions(struct domain *d)
+int __init dom0_setup_permissions(struct domain *d)
 {
     unsigned long mfn;
     unsigned int i;
@@ -743,636 +493,6 @@ static int __init setup_permissions(struct domain *d)
     return rc;
 }
 
-static int __init construct_dom0_pv(
-    struct domain *d,
-    const module_t *image, unsigned long image_headroom,
-    module_t *initrd,
-    void *(*bootstrap_map)(const module_t *),
-    char *cmdline)
-{
-    int i, cpu, rc, compatible, compat32, order, machine;
-    struct cpu_user_regs *regs;
-    unsigned long pfn, mfn;
-    unsigned long nr_pages;
-    unsigned long nr_pt_pages;
-    unsigned long alloc_spfn;
-    unsigned long alloc_epfn;
-    unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
-    struct page_info *page = NULL;
-    start_info_t *si;
-    struct vcpu *v = d->vcpu[0];
-    unsigned long long value;
-    void *image_base = bootstrap_map(image);
-    unsigned long image_len = image->mod_end;
-    void *image_start = image_base + image_headroom;
-    unsigned long initrd_len = initrd ? initrd->mod_end : 0;
-    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
-    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
-    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
-    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
-
-    /*
-     * This fully describes the memory layout of the initial domain. All 
-     * *_start address are page-aligned, except v_start (and v_end) which are 
-     * superpage-aligned.
-     */
-    struct elf_binary elf;
-    struct elf_dom_parms parms;
-    unsigned long vkern_start;
-    unsigned long vkern_end;
-    unsigned long vinitrd_start;
-    unsigned long vinitrd_end;
-    unsigned long vphysmap_start;
-    unsigned long vphysmap_end;
-    unsigned long vstartinfo_start;
-    unsigned long vstartinfo_end;
-    unsigned long vstack_start;
-    unsigned long vstack_end;
-    unsigned long vpt_start;
-    unsigned long vpt_end;
-    unsigned long v_start;
-    unsigned long v_end;
-
-    /* Machine address of next candidate page-table page. */
-    paddr_t mpt_alloc;
-
-    printk("*** LOADING DOMAIN 0 ***\n");
-
-    d->max_pages = ~0U;
-
-    if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
-        return rc;
-
-    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
-        return rc;
-#ifdef CONFIG_VERBOSE_DEBUG
-    elf_set_verbose(&elf);
-#endif
-    elf_parse_binary(&elf);
-    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
-        goto out;
-
-    /* compatibility check */
-    compatible = 0;
-    compat32   = 0;
-    machine = elf_uval(&elf, elf.ehdr, e_machine);
-    printk(" Xen  kernel: 64-bit, lsb, compat32\n");
-    if (elf_32bit(&elf) && parms.pae == XEN_PAE_BIMODAL)
-        parms.pae = XEN_PAE_EXTCR3;
-    if (elf_32bit(&elf) && parms.pae && machine == EM_386)
-    {
-        compat32 = 1;
-        compatible = 1;
-    }
-    if (elf_64bit(&elf) && machine == EM_X86_64)
-        compatible = 1;
-    printk(" Dom0 kernel: %s%s, %s, paddr %#" PRIx64 " -> %#" PRIx64 "\n",
-           elf_64bit(&elf) ? "64-bit" : "32-bit",
-           parms.pae       ? ", PAE"  : "",
-           elf_msb(&elf)   ? "msb"    : "lsb",
-           elf.pstart, elf.pend);
-    if ( elf.bsd_symtab_pstart )
-        printk(" Dom0 symbol map %#" PRIx64 " -> %#" PRIx64 "\n",
-               elf.bsd_symtab_pstart, elf.bsd_symtab_pend);
-
-    if ( !compatible )
-    {
-        printk("Mismatch between Xen and DOM0 kernel\n");
-        rc = -EINVAL;
-        goto out;
-    }
-
-    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
-    {
-        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
-        {
-            printk("Kernel does not support Dom0 operation\n");
-            rc = -EINVAL;
-            goto out;
-        }
-    }
-
-    if ( compat32 )
-    {
-        d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
-        v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
-        if ( setup_compat_arg_xlat(v) != 0 )
-            BUG();
-    }
-
-    nr_pages = compute_dom0_nr_pages(d, &parms, initrd_len);
-
-    if ( parms.pae == XEN_PAE_EXTCR3 )
-            set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
-
-    if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
-    {
-        unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
-        value = (parms.virt_hv_start_low + mask) & ~mask;
-        BUG_ON(!is_pv_32bit_domain(d));
-        if ( value > __HYPERVISOR_COMPAT_VIRT_START )
-            panic("Domain 0 expects too high a hypervisor start address");
-        HYPERVISOR_COMPAT_VIRT_START(d) =
-            max_t(unsigned int, m2p_compat_vstart, value);
-    }
-
-    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
-    {
-        printk(XENLOG_WARNING "P2M table base ignored\n");
-        parms.p2m_base = UNSET_ADDR;
-    }
-
-    domain_set_alloc_bitsize(d);
-
-    /*
-     * Why do we need this? The number of page-table frames depends on the 
-     * size of the bootstrap address space. But the size of the address space 
-     * depends on the number of page-table frames (since each one is mapped 
-     * read-only). We have a pair of simultaneous equations in two unknowns, 
-     * which we solve by exhaustive search.
-     */
-    v_start          = parms.virt_base;
-    vkern_start      = parms.virt_kstart;
-    vkern_end        = parms.virt_kend;
-    if ( parms.unmapped_initrd )
-    {
-        vinitrd_start  = vinitrd_end = 0;
-        vphysmap_start = round_pgup(vkern_end);
-    }
-    else
-    {
-        vinitrd_start  = round_pgup(vkern_end);
-        vinitrd_end    = vinitrd_start + initrd_len;
-        vphysmap_start = round_pgup(vinitrd_end);
-    }
-    vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32bit_domain(d) ?
-                                                     sizeof(unsigned long) :
-                                                     sizeof(unsigned int)));
-    if ( parms.p2m_base != UNSET_ADDR )
-        vphysmap_end = vphysmap_start;
-    vstartinfo_start = round_pgup(vphysmap_end);
-    vstartinfo_end   = (vstartinfo_start +
-                        sizeof(struct start_info) +
-                        sizeof(struct dom0_vga_console_info));
-
-    vpt_start        = round_pgup(vstartinfo_end);
-    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
-    {
-        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
-        vstack_start     = vpt_end;
-        vstack_end       = vstack_start + PAGE_SIZE;
-        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
-        if ( (v_end - vstack_end) < (512UL << 10) )
-            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
-#define NR(_l,_h,_s) \
-    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
-       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
-        if ( (!is_pv_32bit_domain(d) + /* # L4 */
-              NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
-              (!is_pv_32bit_domain(d) ?
-               NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
-               4) + /* # compat L2 */
-              NR(v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
-             <= nr_pt_pages )
-            break;
-    }
-
-    count = v_end - v_start;
-    if ( vinitrd_start )
-        count -= PAGE_ALIGN(initrd_len);
-    order = get_order_from_bytes(count);
-    if ( (1UL << order) + PFN_UP(initrd_len) > nr_pages )
-        panic("Domain 0 allocation is too small for kernel image");
-
-    if ( parms.p2m_base != UNSET_ADDR )
-    {
-        vphysmap_start = parms.p2m_base;
-        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
-    }
-    page = alloc_domheap_pages(d, order, 0);
-    if ( page == NULL )
-        panic("Not enough RAM for domain 0 allocation");
-    alloc_spfn = page_to_mfn(page);
-    alloc_epfn = alloc_spfn + d->tot_pages;
-
-    if ( initrd_len )
-    {
-        initrd_pfn = vinitrd_start ?
-                     (vinitrd_start - v_start) >> PAGE_SHIFT :
-                     d->tot_pages;
-        initrd_mfn = mfn = initrd->mod_start;
-        count = PFN_UP(initrd_len);
-        if ( d->arch.physaddr_bitsize &&
-             ((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) )
-        {
-            order = get_order_from_pages(count);
-            page = alloc_domheap_pages(d, order, 0);
-            if ( !page )
-                panic("Not enough RAM for domain 0 initrd");
-            for ( count = -count; order--; )
-                if ( count & (1UL << order) )
-                {
-                    free_domheap_pages(page, order);
-                    page += 1UL << order;
-                }
-            memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start),
-                   initrd_len);
-            mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
-            init_domheap_pages(mpt_alloc,
-                               mpt_alloc + PAGE_ALIGN(initrd_len));
-            initrd->mod_start = initrd_mfn = page_to_mfn(page);
-        }
-        else
-        {
-            while ( count-- )
-                if ( assign_pages(d, mfn_to_page(mfn++), 0, 0) )
-                    BUG();
-        }
-        initrd->mod_end = 0;
-    }
-
-    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
-           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
-           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
-    if ( d->tot_pages < nr_pages )
-        printk(" (%lu pages to be allocated)",
-               nr_pages - d->tot_pages);
-    if ( initrd )
-    {
-        mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
-        printk("\n Init. ramdisk: %"PRIpaddr"->%"PRIpaddr,
-               mpt_alloc, mpt_alloc + initrd_len);
-    }
-    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
-           " Loaded kernel: %p->%p\n"
-           " Init. ramdisk: %p->%p\n"
-           " Phys-Mach map: %p->%p\n"
-           " Start info:    %p->%p\n"
-           " Page tables:   %p->%p\n"
-           " Boot stack:    %p->%p\n"
-           " TOTAL:         %p->%p\n",
-           _p(vkern_start), _p(vkern_end),
-           _p(vinitrd_start), _p(vinitrd_end),
-           _p(vphysmap_start), _p(vphysmap_end),
-           _p(vstartinfo_start), _p(vstartinfo_end),
-           _p(vpt_start), _p(vpt_end),
-           _p(vstack_start), _p(vstack_end),
-           _p(v_start), _p(v_end));
-    printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
-
-    process_pending_softirqs();
-
-    mpt_alloc = (vpt_start - v_start) + pfn_to_paddr(alloc_spfn);
-    if ( vinitrd_start )
-        mpt_alloc -= PAGE_ALIGN(initrd_len);
-
-    /* Overlap with Xen protected area? */
-    if ( !is_pv_32bit_domain(d) ?
-         ((v_start < HYPERVISOR_VIRT_END) &&
-          (v_end > HYPERVISOR_VIRT_START)) :
-         (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
-    {
-        printk("DOM0 image overlaps with Xen private area.\n");
-        rc = -EINVAL;
-        goto out;
-    }
-
-    if ( is_pv_32bit_domain(d) )
-    {
-        v->arch.pv_vcpu.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
-        v->arch.pv_vcpu.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
-    }
-
-    /* WARNING: The new domain must have its 'processor' field filled in! */
-    if ( !is_pv_32bit_domain(d) )
-    {
-        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
-        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-    }
-    else
-    {
-        page = alloc_domheap_page(d, MEMF_no_owner);
-        if ( !page )
-            panic("Not enough RAM for domain 0 PML4");
-        page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
-        l4start = l4tab = page_to_virt(page);
-        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
-        l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-    }
-    clear_page(l4tab);
-    init_guest_l4_table(l4tab, d, 0);
-    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
-    if ( is_pv_32bit_domain(d) )
-        v->arch.guest_table_user = v->arch.guest_table;
-
-    l4tab += l4_table_offset(v_start);
-    pfn = alloc_spfn;
-    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
-    {
-        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
-        {
-            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
-            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-            clear_page(l1tab);
-            if ( count == 0 )
-                l1tab += l1_table_offset(v_start);
-            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
-            {
-                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
-                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-                clear_page(l2tab);
-                if ( count == 0 )
-                    l2tab += l2_table_offset(v_start);
-                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
-                {
-                    if ( count || !l3start )
-                    {
-                        maddr_to_page(mpt_alloc)->u.inuse.type_info =
-                            PGT_l3_page_table;
-                        l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-                    }
-                    l3tab = l3start;
-                    clear_page(l3tab);
-                    if ( count == 0 )
-                        l3tab += l3_table_offset(v_start);
-                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
-                    l4tab++;
-                }
-                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
-                l3tab++;
-            }
-            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
-            l2tab++;
-        }
-        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
-            mfn = pfn++;
-        else
-            mfn = initrd_mfn++;
-        *l1tab = l1e_from_pfn(mfn, (!is_pv_32bit_domain(d) ?
-                                    L1_PROT : COMPAT_L1_PROT));
-        l1tab++;
-
-        if ( !paging_mode_translate(d) )
-        {
-            page = mfn_to_page(mfn);
-            if ( !page->u.inuse.type_info &&
-                 !get_page_and_type(page, d, PGT_writable_page) )
-                BUG();
-        }
-    }
-
-    if ( is_pv_32bit_domain(d) )
-    {
-        /* Ensure the first four L3 entries are all populated. */
-        for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
-        {
-            if ( !l3e_get_intpte(*l3tab) )
-            {
-                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
-                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-                clear_page(l2tab);
-                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
-            }
-            if ( i == 3 )
-                l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
-        }
-        /* Install read-only guest visible MPT mapping. */
-        l2tab = l3e_to_l2e(l3start[3]);
-        memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
-               &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
-               COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
-    }
-
-    /* Pages that are part of page tables must be read only. */
-    if  ( is_pv_domain(d) )
-        mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
-
-    /* Mask all upcalls... */
-    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
-        shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
-
-    printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
-
-    cpu = v->processor;
-    for ( i = 1; i < d->max_vcpus; i++ )
-    {
-        struct vcpu *p = setup_dom0_vcpu(d, i, cpu);
-
-        if ( p )
-            cpu = p->processor;
-    }
-
-    d->arch.paging.mode = 0;
-
-    /* Set up CR3 value for write_ptbase */
-    if ( paging_mode_enabled(d) )
-        paging_update_paging_modes(v);
-    else
-        update_cr3(v);
-
-    /* We run on dom0's page tables for the final part of the build process. */
-    write_ptbase(v);
-    mapcache_override_current(v);
-
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest_base = (void*)vkern_start;
-    elf.dest_size = vkern_end - vkern_start;
-    elf_set_vcpu(&elf, v);
-    rc = elf_load_binary(&elf);
-    if ( rc < 0 )
-    {
-        printk("Failed to load the kernel binary\n");
-        goto out;
-    }
-    bootstrap_map(NULL);
-
-    if ( UNSET_ADDR != parms.virt_hypercall )
-    {
-        if ( (parms.virt_hypercall < v_start) ||
-             (parms.virt_hypercall >= v_end) )
-        {
-            mapcache_override_current(NULL);
-            write_ptbase(current);
-            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
-            rc = -1;
-            goto out;
-        }
-        hypercall_page_initialise(
-            d, (void *)(unsigned long)parms.virt_hypercall);
-    }
-
-    /* Free temporary buffers. */
-    discard_initial_images();
-
-    /* Set up start info area. */
-    si = (start_info_t *)vstartinfo_start;
-    clear_page(si);
-    si->nr_pages = nr_pages;
-
-    si->shared_info = virt_to_maddr(d->shared_info);
-
-    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
-    if ( !vinitrd_start && initrd_len )
-        si->flags   |= SIF_MOD_START_PFN;
-    si->flags       |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
-    si->pt_base      = vpt_start;
-    si->nr_pt_frames = nr_pt_pages;
-    si->mfn_list     = vphysmap_start;
-    snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
-             elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
-
-    count = d->tot_pages;
-
-    /* Set up the phys->machine table if not part of the initial mapping. */
-    if ( is_pv_domain(d) && parms.p2m_base != UNSET_ADDR )
-    {
-        pfn = pagetable_get_pfn(v->arch.guest_table);
-        setup_pv_physmap(d, pfn, v_start, v_end, vphysmap_start, vphysmap_end,
-                         nr_pages);
-    }
-
-    /* Write the phys->machine and machine->phys table entries. */
-    for ( pfn = 0; pfn < count; pfn++ )
-    {
-        mfn = pfn + alloc_spfn;
-        if ( pfn >= initrd_pfn )
-        {
-            if ( pfn < initrd_pfn + PFN_UP(initrd_len) )
-                mfn = initrd->mod_start + (pfn - initrd_pfn);
-            else
-                mfn -= PFN_UP(initrd_len);
-        }
-#ifndef NDEBUG
-#define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
-        if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
-            mfn = alloc_epfn - (pfn - REVERSE_START);
-#endif
-        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
-        if (!(pfn & 0xfffff))
-            process_pending_softirqs();
-    }
-    si->first_p2m_pfn = pfn;
-    si->nr_p2m_frames = d->tot_pages - count;
-    page_list_for_each ( page, &d->page_list )
-    {
-        mfn = page_to_mfn(page);
-        BUG_ON(SHARED_M2P(get_gpfn_from_mfn(mfn)));
-        if ( get_gpfn_from_mfn(mfn) >= count )
-        {
-            BUG_ON(is_pv_32bit_domain(d));
-            if ( !paging_mode_translate(d) && !page->u.inuse.type_info &&
-                 !get_page_and_type(page, d, PGT_writable_page) )
-                BUG();
-
-            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
-            ++pfn;
-            if (!(pfn & 0xfffff))
-                process_pending_softirqs();
-        }
-    }
-    BUG_ON(pfn != d->tot_pages);
-#ifndef NDEBUG
-    alloc_epfn += PFN_UP(initrd_len) + si->nr_p2m_frames;
-#endif
-    while ( pfn < nr_pages )
-    {
-        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
-            panic("Not enough RAM for DOM0 reservation");
-        while ( pfn < d->tot_pages )
-        {
-            mfn = page_to_mfn(page);
-#ifndef NDEBUG
-#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
-#endif
-            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
-#undef pfn
-            page++; pfn++;
-            if (!(pfn & 0xfffff))
-                process_pending_softirqs();
-        }
-    }
-
-    if ( initrd_len != 0 )
-    {
-        si->mod_start = vinitrd_start ?: initrd_pfn;
-        si->mod_len   = initrd_len;
-    }
-
-    memset(si->cmd_line, 0, sizeof(si->cmd_line));
-    if ( cmdline != NULL )
-        strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
-
-    if ( fill_console_start_info((void *)(si + 1)) )
-    {
-        si->console.dom0.info_off  = sizeof(struct start_info);
-        si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
-    }
-
-    if ( is_pv_32bit_domain(d) )
-        xlat_start_info(si, XLAT_start_info_console_dom0);
-
-    /* Return to idle domain's page tables. */
-    mapcache_override_current(NULL);
-    write_ptbase(current);
-
-    update_domain_wallclock_time(d);
-
-    v->is_initialised = 1;
-    clear_bit(_VPF_down, &v->pause_flags);
-
-    /*
-     * Initial register values:
-     *  DS,ES,FS,GS = FLAT_KERNEL_DS
-     *       CS:rIP = FLAT_KERNEL_CS:start_pc
-     *       SS:rSP = FLAT_KERNEL_SS:start_stack
-     *          rSI = start_info
-     *  [rAX,rBX,rCX,rDX,rDI,rBP,R8-R15 are zero]
-     */
-    regs = &v->arch.user_regs;
-    regs->ds = regs->es = regs->fs = regs->gs =
-        !is_pv_32bit_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
-    regs->ss = (!is_pv_32bit_domain(d) ?
-                FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
-    regs->cs = (!is_pv_32bit_domain(d) ?
-                FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
-    regs->rip = parms.virt_entry;
-    regs->rsp = vstack_end;
-    regs->rsi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
-
-#ifdef CONFIG_SHADOW_PAGING
-    if ( opt_dom0_shadow && paging_enable(d, PG_SH_enable) == 0 )
-        paging_update_paging_modes(v);
-#endif
-
-    /*
-     * PVH Fixme: XENFEAT_supervisor_mode_kernel has been reused in PVH with a
-     * different meaning.
-     */
-    if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
-        panic("Dom0 requires supervisor-mode execution");
-
-    rc = setup_permissions(d);
-    BUG_ON(rc != 0);
-
-    if ( elf_check_broken(&elf) )
-        printk(" Xen warning: dom0 kernel broken ELF: %s\n",
-               elf_check_broken(&elf));
-
-    if ( d->domain_id == hardware_domid )
-        iommu_hwdom_init(d);
-
-    return 0;
-
-out:
-    if ( elf_check_broken(&elf) )
-        printk(" Xen dom0 kernel broken ELF: %s\n",
-               elf_check_broken(&elf));
-
-    return rc;
-}
-
 static int __init modify_identity_mmio(struct domain *d, unsigned long pfn,
                                        unsigned long nr_pages, const bool map)
 {
@@ -1415,13 +535,13 @@ static int __init pvh_populate_memory_range(struct domain *d,
         unsigned int range_order = get_order_from_pages(nr_pages + 1);
 
         order = min(range_order ? range_order - 1 : 0, order);
-        page = alloc_domheap_pages(d, order, memflags);
+        page = alloc_domheap_pages(d, order, dom0_memflags);
         if ( page == NULL )
         {
-            if ( order == 0 && memflags )
+            if ( order == 0 && dom0_memflags )
             {
-                /* Try again without any memflags. */
-                memflags = 0;
+                /* Try again without any dom0_memflags. */
+                dom0_memflags = 0;
                 order = MAX_ORDER;
                 continue;
             }
@@ -1695,7 +815,7 @@ static int __init pvh_setup_p2m(struct domain *d)
     bool preempted;
 #define MB1_PAGES PFN_DOWN(MB(1))
 
-    nr_pages = compute_dom0_nr_pages(d, NULL, 0);
+    nr_pages = dom0_compute_nr_pages(d, NULL, 0);
 
     pvh_setup_e820(d, nr_pages);
     do {
@@ -1906,7 +1026,7 @@ static int __init pvh_setup_cpus(struct domain *d, paddr_t entry,
     cpu = v->processor;
     for ( i = 1; i < d->max_vcpus; i++ )
     {
-        struct vcpu *p = setup_dom0_vcpu(d, i, cpu);
+        struct vcpu *p = dom0_setup_vcpu(d, i, cpu);
 
         if ( p )
             cpu = p->processor;
@@ -1919,7 +1039,7 @@ static int __init pvh_setup_cpus(struct domain *d, paddr_t entry,
         return rc;
     }
 
-    rc = setup_permissions(d);
+    rc = dom0_setup_permissions(d);
     if ( rc )
     {
         panic("Unable to setup Dom0 permissions: %d\n", rc);
@@ -2429,7 +1549,7 @@ int __init construct_dom0(struct domain *d, const module_t *image,
 
     process_pending_softirqs();
 
-    return (is_hvm_domain(d) ? construct_dom0_pvh : construct_dom0_pv)
+    return (is_hvm_domain(d) ? construct_dom0_pvh : dom0_construct_pv)
            (d, image, image_headroom, initrd,bootstrap_map, cmdline);
 }
 
diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile
index de21937129..ea94599438 100644
--- a/xen/arch/x86/pv/Makefile
+++ b/xen/arch/x86/pv/Makefile
@@ -1 +1,2 @@
 obj-y += hypercall.o
+obj-bin-y += dom0_build.init.o
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
new file mode 100644
index 0000000000..dda617f907
--- /dev/null
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -0,0 +1,913 @@
+/******************************************************************************
+ * pv/dom0_build.c
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ */
+
+#include <xen/console.h>
+#include <xen/domain.h>
+#include <xen/domain_page.h>
+#include <xen/init.h>
+#include <xen/libelf.h>
+#include <xen/multiboot.h>
+#include <xen/paging.h>
+#include <xen/pfn.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+
+#include <asm/bzimage.h>
+#include <asm/dom0_build.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+
+/* Allow ring-3 access in long mode as guest cannot use ring 1 ... */
+#define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
+#define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL)
+/* ... except for compatibility mode guests. */
+#define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
+#define L2_PROT (BASE_PROT|_PAGE_DIRTY)
+#define L3_PROT (BASE_PROT|_PAGE_DIRTY)
+#define L4_PROT (BASE_PROT|_PAGE_DIRTY)
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                       unsigned long mfn,
+                                       unsigned long vphysmap_s)
+{
+    if ( !is_pv_32bit_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void mark_pv_pt_pages_rdonly(struct domain *d,
+                                           l4_pgentry_t *l4start,
+                                           unsigned long vpt_start,
+                                           unsigned long nr_pt_pages)
+{
+    unsigned long count;
+    struct page_info *page;
+    l4_pgentry_t *pl4e;
+    l3_pgentry_t *pl3e;
+    l2_pgentry_t *pl2e;
+    l1_pgentry_t *pl1e;
+
+    pl4e = l4start + l4_table_offset(vpt_start);
+    pl3e = l4e_to_l3e(*pl4e);
+    pl3e += l3_table_offset(vpt_start);
+    pl2e = l3e_to_l2e(*pl3e);
+    pl2e += l2_table_offset(vpt_start);
+    pl1e = l2e_to_l1e(*pl2e);
+    pl1e += l1_table_offset(vpt_start);
+    for ( count = 0; count < nr_pt_pages; count++ )
+    {
+        l1e_remove_flags(*pl1e, _PAGE_RW);
+        page = mfn_to_page(l1e_get_pfn(*pl1e));
+
+        /* Read-only mapping + PGC_allocated + page-table page. */
+        page->count_info         = PGC_allocated | 3;
+        page->u.inuse.type_info |= PGT_validated | 1;
+
+        /* Top-level p.t. is pinned. */
+        if ( (page->u.inuse.type_info & PGT_type_mask) ==
+             (!is_pv_32bit_domain(d) ?
+              PGT_l4_page_table : PGT_l3_page_table) )
+        {
+            page->count_info        += 1;
+            page->u.inuse.type_info += 1 | PGT_pinned;
+        }
+
+        /* Iterate. */
+        if ( !((unsigned long)++pl1e & (PAGE_SIZE - 1)) )
+        {
+            if ( !((unsigned long)++pl2e & (PAGE_SIZE - 1)) )
+            {
+                if ( !((unsigned long)++pl3e & (PAGE_SIZE - 1)) )
+                    pl3e = l4e_to_l3e(*++pl4e);
+                pl2e = l3e_to_l2e(*pl3e);
+            }
+            pl1e = l2e_to_l1e(*pl2e);
+        }
+    }
+}
+
+static __init void setup_pv_physmap(struct domain *d, unsigned long pgtbl_pfn,
+                                    unsigned long v_start, unsigned long v_end,
+                                    unsigned long vphysmap_start,
+                                    unsigned long vphysmap_end,
+                                    unsigned long nr_pages)
+{
+    struct page_info *page = NULL;
+    l4_pgentry_t *pl4e, *l4start = map_domain_page(_mfn(pgtbl_pfn));
+    l3_pgentry_t *pl3e = NULL;
+    l2_pgentry_t *pl2e = NULL;
+    l1_pgentry_t *pl1e = NULL;
+
+    if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
+        panic("DOM0 P->M table overlaps initial mapping");
+
+    while ( vphysmap_start < vphysmap_end )
+    {
+        if ( d->tot_pages + ((round_pgup(vphysmap_end) - vphysmap_start)
+                             >> PAGE_SHIFT) + 3 > nr_pages )
+            panic("Dom0 allocation too small for initial P->M table");
+
+        if ( pl1e )
+        {
+            unmap_domain_page(pl1e);
+            pl1e = NULL;
+        }
+        if ( pl2e )
+        {
+            unmap_domain_page(pl2e);
+            pl2e = NULL;
+        }
+        if ( pl3e )
+        {
+            unmap_domain_page(pl3e);
+            pl3e = NULL;
+        }
+        pl4e = l4start + l4_table_offset(vphysmap_start);
+        if ( !l4e_get_intpte(*pl4e) )
+        {
+            page = alloc_domheap_page(d, 0);
+            if ( !page )
+                break;
+
+            /* No mapping, PGC_allocated + page-table page. */
+            page->count_info = PGC_allocated | 2;
+            page->u.inuse.type_info = PGT_l3_page_table | PGT_validated | 1;
+            pl3e = __map_domain_page(page);
+            clear_page(pl3e);
+            *pl4e = l4e_from_page(page, L4_PROT);
+        } else
+            pl3e = map_domain_page(_mfn(l4e_get_pfn(*pl4e)));
+
+        pl3e += l3_table_offset(vphysmap_start);
+        if ( !l3e_get_intpte(*pl3e) )
+        {
+            if ( cpu_has_page1gb &&
+                 !(vphysmap_start & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
+                 vphysmap_end >= vphysmap_start + (1UL << L3_PAGETABLE_SHIFT) &&
+                 (page = alloc_domheap_pages(d,
+                                             L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                             0)) != NULL )
+            {
+                *pl3e = l3e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                vphysmap_start += 1UL << L3_PAGETABLE_SHIFT;
+                continue;
+            }
+            if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                break;
+
+            /* No mapping, PGC_allocated + page-table page. */
+            page->count_info = PGC_allocated | 2;
+            page->u.inuse.type_info = PGT_l2_page_table | PGT_validated | 1;
+            pl2e = __map_domain_page(page);
+            clear_page(pl2e);
+            *pl3e = l3e_from_page(page, L3_PROT);
+        }
+        else
+            pl2e = map_domain_page(_mfn(l3e_get_pfn(*pl3e)));
+
+        pl2e += l2_table_offset(vphysmap_start);
+        if ( !l2e_get_intpte(*pl2e) )
+        {
+            if ( !(vphysmap_start & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
+                 vphysmap_end >= vphysmap_start + (1UL << L2_PAGETABLE_SHIFT) &&
+                 (page = alloc_domheap_pages(d,
+                                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                             0)) != NULL )
+            {
+                *pl2e = l2e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                if ( opt_allow_superpage )
+                    get_superpage(page_to_mfn(page), d);
+                vphysmap_start += 1UL << L2_PAGETABLE_SHIFT;
+                continue;
+            }
+            if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                break;
+
+            /* No mapping, PGC_allocated + page-table page. */
+            page->count_info = PGC_allocated | 2;
+            page->u.inuse.type_info = PGT_l1_page_table | PGT_validated | 1;
+            pl1e = __map_domain_page(page);
+            clear_page(pl1e);
+            *pl2e = l2e_from_page(page, L2_PROT);
+        }
+        else
+            pl1e = map_domain_page(_mfn(l2e_get_pfn(*pl2e)));
+
+        pl1e += l1_table_offset(vphysmap_start);
+        BUG_ON(l1e_get_intpte(*pl1e));
+        page = alloc_domheap_page(d, 0);
+        if ( !page )
+            break;
+
+        *pl1e = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
+        vphysmap_start += PAGE_SIZE;
+        vphysmap_start &= PAGE_MASK;
+    }
+    if ( !page )
+        panic("Not enough RAM for DOM0 P->M table");
+
+    if ( pl1e )
+        unmap_domain_page(pl1e);
+    if ( pl2e )
+        unmap_domain_page(pl2e);
+    if ( pl3e )
+        unmap_domain_page(pl3e);
+
+    unmap_domain_page(l4start);
+}
+
+static struct page_info * __init alloc_chunk(struct domain *d,
+                                             unsigned long max_pages)
+{
+    static unsigned int __initdata last_order = MAX_ORDER;
+    struct page_info *page;
+    unsigned int order = get_order_from_pages(max_pages), free_order;
+
+    if ( order > last_order )
+        order = last_order;
+    else if ( max_pages & (max_pages - 1) )
+        --order;
+    while ( (page = alloc_domheap_pages(d, order, dom0_memflags)) == NULL )
+        if ( order-- == 0 )
+            break;
+    if ( page )
+        last_order = order;
+    else if ( dom0_memflags )
+    {
+        /*
+         * Allocate up to 2MB at a time: It prevents allocating very large
+         * chunks from DMA pools before the >4GB pool is fully depleted.
+         */
+        last_order = 21 - PAGE_SHIFT;
+        dom0_memflags = 0;
+        return alloc_chunk(d, max_pages);
+    }
+
+    /*
+     * Make a reasonable attempt at finding a smaller chunk at a higher
+     * address, to avoid allocating from low memory as much as possible.
+     */
+    for ( free_order = order; !dom0_memflags && page && order--; )
+    {
+        struct page_info *pg2;
+
+        if ( d->tot_pages + (1 << order) > d->max_pages )
+            continue;
+        pg2 = alloc_domheap_pages(d, order, MEMF_exact_node);
+        if ( pg2 > page )
+        {
+            free_domheap_pages(page, free_order);
+            page = pg2;
+            free_order = order;
+        }
+        else if ( pg2 )
+            free_domheap_pages(pg2, order);
+    }
+    return page;
+}
+
+int __init dom0_construct_pv(struct domain *d,
+                             const module_t *image,
+                             unsigned long image_headroom,
+                             module_t *initrd,
+                             void *(*bootstrap_map)(const module_t *),
+                             char *cmdline)
+{
+    int i, cpu, rc, compatible, compat32, order, machine;
+    struct cpu_user_regs *regs;
+    unsigned long pfn, mfn;
+    unsigned long nr_pages;
+    unsigned long nr_pt_pages;
+    unsigned long alloc_spfn;
+    unsigned long alloc_epfn;
+    unsigned long initrd_pfn = -1, initrd_mfn = 0;
+    unsigned long count;
+    struct page_info *page = NULL;
+    start_info_t *si;
+    struct vcpu *v = d->vcpu[0];
+    unsigned long long value;
+    void *image_base = bootstrap_map(image);
+    unsigned long image_len = image->mod_end;
+    void *image_start = image_base + image_headroom;
+    unsigned long initrd_len = initrd ? initrd->mod_end : 0;
+    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+    l3_pgentry_t *l3tab = NULL, *l3start = NULL;
+    l2_pgentry_t *l2tab = NULL, *l2start = NULL;
+    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+
+    /*
+     * This fully describes the memory layout of the initial domain. All
+     * *_start address are page-aligned, except v_start (and v_end) which are
+     * superpage-aligned.
+     */
+    struct elf_binary elf;
+    struct elf_dom_parms parms;
+    unsigned long vkern_start;
+    unsigned long vkern_end;
+    unsigned long vinitrd_start;
+    unsigned long vinitrd_end;
+    unsigned long vphysmap_start;
+    unsigned long vphysmap_end;
+    unsigned long vstartinfo_start;
+    unsigned long vstartinfo_end;
+    unsigned long vstack_start;
+    unsigned long vstack_end;
+    unsigned long vpt_start;
+    unsigned long vpt_end;
+    unsigned long v_start;
+    unsigned long v_end;
+
+    /* Machine address of next candidate page-table page. */
+    paddr_t mpt_alloc;
+
+    printk("*** LOADING DOMAIN 0 ***\n");
+
+    d->max_pages = ~0U;
+
+    if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 )
+        return rc;
+
+    if ( (rc = elf_init(&elf, image_start, image_len)) != 0 )
+        return rc;
+#ifdef CONFIG_VERBOSE_DEBUG
+    elf_set_verbose(&elf);
+#endif
+    elf_parse_binary(&elf);
+    if ( (rc = elf_xen_parse(&elf, &parms)) != 0 )
+        goto out;
+
+    /* compatibility check */
+    compatible = 0;
+    compat32   = 0;
+    machine = elf_uval(&elf, elf.ehdr, e_machine);
+    printk(" Xen  kernel: 64-bit, lsb, compat32\n");
+    if (elf_32bit(&elf) && parms.pae == XEN_PAE_BIMODAL)
+        parms.pae = XEN_PAE_EXTCR3;
+    if (elf_32bit(&elf) && parms.pae && machine == EM_386)
+    {
+        compat32 = 1;
+        compatible = 1;
+    }
+    if (elf_64bit(&elf) && machine == EM_X86_64)
+        compatible = 1;
+    printk(" Dom0 kernel: %s%s, %s, paddr %#" PRIx64 " -> %#" PRIx64 "\n",
+           elf_64bit(&elf) ? "64-bit" : "32-bit",
+           parms.pae       ? ", PAE"  : "",
+           elf_msb(&elf)   ? "msb"    : "lsb",
+           elf.pstart, elf.pend);
+    if ( elf.bsd_symtab_pstart )
+        printk(" Dom0 symbol map %#" PRIx64 " -> %#" PRIx64 "\n",
+               elf.bsd_symtab_pstart, elf.bsd_symtab_pend);
+
+    if ( !compatible )
+    {
+        printk("Mismatch between Xen and DOM0 kernel\n");
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
+    {
+        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
+        {
+            printk("Kernel does not support Dom0 operation\n");
+            rc = -EINVAL;
+            goto out;
+        }
+    }
+
+    if ( compat32 )
+    {
+        d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
+        v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
+        if ( setup_compat_arg_xlat(v) != 0 )
+            BUG();
+    }
+
+    nr_pages = dom0_compute_nr_pages(d, &parms, initrd_len);
+
+    if ( parms.pae == XEN_PAE_EXTCR3 )
+            set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
+
+    if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) )
+    {
+        unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
+        value = (parms.virt_hv_start_low + mask) & ~mask;
+        BUG_ON(!is_pv_32bit_domain(d));
+        if ( value > __HYPERVISOR_COMPAT_VIRT_START )
+            panic("Domain 0 expects too high a hypervisor start address");
+        HYPERVISOR_COMPAT_VIRT_START(d) =
+            max_t(unsigned int, m2p_compat_vstart, value);
+    }
+
+    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
+    {
+        printk(XENLOG_WARNING "P2M table base ignored\n");
+        parms.p2m_base = UNSET_ADDR;
+    }
+
+    domain_set_alloc_bitsize(d);
+
+    /*
+     * Why do we need this? The number of page-table frames depends on the
+     * size of the bootstrap address space. But the size of the address space
+     * depends on the number of page-table frames (since each one is mapped
+     * read-only). We have a pair of simultaneous equations in two unknowns,
+     * which we solve by exhaustive search.
+     */
+    v_start          = parms.virt_base;
+    vkern_start      = parms.virt_kstart;
+    vkern_end        = parms.virt_kend;
+    if ( parms.unmapped_initrd )
+    {
+        vinitrd_start  = vinitrd_end = 0;
+        vphysmap_start = round_pgup(vkern_end);
+    }
+    else
+    {
+        vinitrd_start  = round_pgup(vkern_end);
+        vinitrd_end    = vinitrd_start + initrd_len;
+        vphysmap_start = round_pgup(vinitrd_end);
+    }
+    vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32bit_domain(d) ?
+                                                     sizeof(unsigned long) :
+                                                     sizeof(unsigned int)));
+    if ( parms.p2m_base != UNSET_ADDR )
+        vphysmap_end = vphysmap_start;
+    vstartinfo_start = round_pgup(vphysmap_end);
+    vstartinfo_end   = (vstartinfo_start +
+                        sizeof(struct start_info) +
+                        sizeof(struct dom0_vga_console_info));
+
+    vpt_start        = round_pgup(vstartinfo_end);
+    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
+    {
+        vpt_end          = vpt_start + (nr_pt_pages * PAGE_SIZE);
+        vstack_start     = vpt_end;
+        vstack_end       = vstack_start + PAGE_SIZE;
+        v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
+        if ( (v_end - vstack_end) < (512UL << 10) )
+            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
+#define NR(_l,_h,_s) \
+    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
+       ((_l) & ~((1UL<<(_s))-1))) >> (_s))
+        if ( (!is_pv_32bit_domain(d) + /* # L4 */
+              NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
+              (!is_pv_32bit_domain(d) ?
+               NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */
+               4) + /* # compat L2 */
+              NR(v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
+             <= nr_pt_pages )
+            break;
+    }
+
+    count = v_end - v_start;
+    if ( vinitrd_start )
+        count -= PAGE_ALIGN(initrd_len);
+    order = get_order_from_bytes(count);
+    if ( (1UL << order) + PFN_UP(initrd_len) > nr_pages )
+        panic("Domain 0 allocation is too small for kernel image");
+
+    if ( parms.p2m_base != UNSET_ADDR )
+    {
+        vphysmap_start = parms.p2m_base;
+        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
+    }
+    page = alloc_domheap_pages(d, order, 0);
+    if ( page == NULL )
+        panic("Not enough RAM for domain 0 allocation");
+    alloc_spfn = page_to_mfn(page);
+    alloc_epfn = alloc_spfn + d->tot_pages;
+
+    if ( initrd_len )
+    {
+        initrd_pfn = vinitrd_start ?
+                     (vinitrd_start - v_start) >> PAGE_SHIFT :
+                     d->tot_pages;
+        initrd_mfn = mfn = initrd->mod_start;
+        count = PFN_UP(initrd_len);
+        if ( d->arch.physaddr_bitsize &&
+             ((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) )
+        {
+            order = get_order_from_pages(count);
+            page = alloc_domheap_pages(d, order, 0);
+            if ( !page )
+                panic("Not enough RAM for domain 0 initrd");
+            for ( count = -count; order--; )
+                if ( count & (1UL << order) )
+                {
+                    free_domheap_pages(page, order);
+                    page += 1UL << order;
+                }
+            memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start),
+                   initrd_len);
+            mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
+            init_domheap_pages(mpt_alloc,
+                               mpt_alloc + PAGE_ALIGN(initrd_len));
+            initrd->mod_start = initrd_mfn = page_to_mfn(page);
+        }
+        else
+        {
+            while ( count-- )
+                if ( assign_pages(d, mfn_to_page(mfn++), 0, 0) )
+                    BUG();
+        }
+        initrd->mod_end = 0;
+    }
+
+    printk("PHYSICAL MEMORY ARRANGEMENT:\n"
+           " Dom0 alloc.:   %"PRIpaddr"->%"PRIpaddr,
+           pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn));
+    if ( d->tot_pages < nr_pages )
+        printk(" (%lu pages to be allocated)",
+               nr_pages - d->tot_pages);
+    if ( initrd )
+    {
+        mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT;
+        printk("\n Init. ramdisk: %"PRIpaddr"->%"PRIpaddr,
+               mpt_alloc, mpt_alloc + initrd_len);
+    }
+    printk("\nVIRTUAL MEMORY ARRANGEMENT:\n"
+           " Loaded kernel: %p->%p\n"
+           " Init. ramdisk: %p->%p\n"
+           " Phys-Mach map: %p->%p\n"
+           " Start info:    %p->%p\n"
+           " Page tables:   %p->%p\n"
+           " Boot stack:    %p->%p\n"
+           " TOTAL:         %p->%p\n",
+           _p(vkern_start), _p(vkern_end),
+           _p(vinitrd_start), _p(vinitrd_end),
+           _p(vphysmap_start), _p(vphysmap_end),
+           _p(vstartinfo_start), _p(vstartinfo_end),
+           _p(vpt_start), _p(vpt_end),
+           _p(vstack_start), _p(vstack_end),
+           _p(v_start), _p(v_end));
+    printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry));
+
+    process_pending_softirqs();
+
+    mpt_alloc = (vpt_start - v_start) + pfn_to_paddr(alloc_spfn);
+    if ( vinitrd_start )
+        mpt_alloc -= PAGE_ALIGN(initrd_len);
+
+    /* Overlap with Xen protected area? */
+    if ( !is_pv_32bit_domain(d) ?
+         ((v_start < HYPERVISOR_VIRT_END) &&
+          (v_end > HYPERVISOR_VIRT_START)) :
+         (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) )
+    {
+        printk("DOM0 image overlaps with Xen private area.\n");
+        rc = -EINVAL;
+        goto out;
+    }
+
+    if ( is_pv_32bit_domain(d) )
+    {
+        v->arch.pv_vcpu.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS;
+        v->arch.pv_vcpu.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
+    }
+
+    /* WARNING: The new domain must have its 'processor' field filled in! */
+    if ( !is_pv_32bit_domain(d) )
+    {
+        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
+        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+    }
+    else
+    {
+        page = alloc_domheap_page(d, MEMF_no_owner);
+        if ( !page )
+            panic("Not enough RAM for domain 0 PML4");
+        page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
+        l4start = l4tab = page_to_virt(page);
+        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
+        l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+    }
+    clear_page(l4tab);
+    init_guest_l4_table(l4tab, d, 0);
+    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+    if ( is_pv_32bit_domain(d) )
+        v->arch.guest_table_user = v->arch.guest_table;
+
+    l4tab += l4_table_offset(v_start);
+    pfn = alloc_spfn;
+    for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
+    {
+        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
+        {
+            maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
+            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+            clear_page(l1tab);
+            if ( count == 0 )
+                l1tab += l1_table_offset(v_start);
+            if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
+            {
+                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
+                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+                clear_page(l2tab);
+                if ( count == 0 )
+                    l2tab += l2_table_offset(v_start);
+                if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) )
+                {
+                    if ( count || !l3start )
+                    {
+                        maddr_to_page(mpt_alloc)->u.inuse.type_info =
+                            PGT_l3_page_table;
+                        l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+                    }
+                    l3tab = l3start;
+                    clear_page(l3tab);
+                    if ( count == 0 )
+                        l3tab += l3_table_offset(v_start);
+                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                    l4tab++;
+                }
+                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+                l3tab++;
+            }
+            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+            l2tab++;
+        }
+        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+            mfn = pfn++;
+        else
+            mfn = initrd_mfn++;
+        *l1tab = l1e_from_pfn(mfn, (!is_pv_32bit_domain(d) ?
+                                    L1_PROT : COMPAT_L1_PROT));
+        l1tab++;
+
+        if ( !paging_mode_translate(d) )
+        {
+            page = mfn_to_page(mfn);
+            if ( !page->u.inuse.type_info &&
+                 !get_page_and_type(page, d, PGT_writable_page) )
+                BUG();
+        }
+    }
+
+    if ( is_pv_32bit_domain(d) )
+    {
+        /* Ensure the first four L3 entries are all populated. */
+        for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
+        {
+            if ( !l3e_get_intpte(*l3tab) )
+            {
+                maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
+                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+                clear_page(l2tab);
+                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
+            }
+            if ( i == 3 )
+                l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
+        }
+        /* Install read-only guest visible MPT mapping. */
+        l2tab = l3e_to_l2e(l3start[3]);
+        memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
+               &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
+               COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
+    }
+
+    /* Pages that are part of page tables must be read only. */
+    if  ( is_pv_domain(d) )
+        mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
+
+    /* Mask all upcalls... */
+    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
+        shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
+
+    printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus);
+
+    cpu = v->processor;
+    for ( i = 1; i < d->max_vcpus; i++ )
+    {
+        struct vcpu *p = dom0_setup_vcpu(d, i, cpu);
+
+        if ( p )
+            cpu = p->processor;
+    }
+
+    d->arch.paging.mode = 0;
+
+    /* Set up CR3 value for write_ptbase */
+    if ( paging_mode_enabled(d) )
+        paging_update_paging_modes(v);
+    else
+        update_cr3(v);
+
+    /* We run on dom0's page tables for the final part of the build process. */
+    write_ptbase(v);
+    mapcache_override_current(v);
+
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest_base = (void*)vkern_start;
+    elf.dest_size = vkern_end - vkern_start;
+    elf_set_vcpu(&elf, v);
+    rc = elf_load_binary(&elf);
+    if ( rc < 0 )
+    {
+        printk("Failed to load the kernel binary\n");
+        goto out;
+    }
+    bootstrap_map(NULL);
+
+    if ( UNSET_ADDR != parms.virt_hypercall )
+    {
+        if ( (parms.virt_hypercall < v_start) ||
+             (parms.virt_hypercall >= v_end) )
+        {
+            mapcache_override_current(NULL);
+            write_ptbase(current);
+            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+            rc = -1;
+            goto out;
+        }
+        hypercall_page_initialise(
+            d, (void *)(unsigned long)parms.virt_hypercall);
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
+    /* Set up start info area. */
+    si = (start_info_t *)vstartinfo_start;
+    clear_page(si);
+    si->nr_pages = nr_pages;
+
+    si->shared_info = virt_to_maddr(d->shared_info);
+
+    si->flags        = SIF_PRIVILEGED | SIF_INITDOMAIN;
+    if ( !vinitrd_start && initrd_len )
+        si->flags   |= SIF_MOD_START_PFN;
+    si->flags       |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
+    si->pt_base      = vpt_start;
+    si->nr_pt_frames = nr_pt_pages;
+    si->mfn_list     = vphysmap_start;
+    snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
+             elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
+
+    count = d->tot_pages;
+
+    /* Set up the phys->machine table if not part of the initial mapping. */
+    if ( is_pv_domain(d) && parms.p2m_base != UNSET_ADDR )
+    {
+        pfn = pagetable_get_pfn(v->arch.guest_table);
+        setup_pv_physmap(d, pfn, v_start, v_end, vphysmap_start, vphysmap_end,
+                         nr_pages);
+    }
+
+    /* Write the phys->machine and machine->phys table entries. */
+    for ( pfn = 0; pfn < count; pfn++ )
+    {
+        mfn = pfn + alloc_spfn;
+        if ( pfn >= initrd_pfn )
+        {
+            if ( pfn < initrd_pfn + PFN_UP(initrd_len) )
+                mfn = initrd->mod_start + (pfn - initrd_pfn);
+            else
+                mfn -= PFN_UP(initrd_len);
+        }
+#ifndef NDEBUG
+#define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT)
+        if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
+            mfn = alloc_epfn - (pfn - REVERSE_START);
+#endif
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+        if (!(pfn & 0xfffff))
+            process_pending_softirqs();
+    }
+    si->first_p2m_pfn = pfn;
+    si->nr_p2m_frames = d->tot_pages - count;
+    page_list_for_each ( page, &d->page_list )
+    {
+        mfn = page_to_mfn(page);
+        BUG_ON(SHARED_M2P(get_gpfn_from_mfn(mfn)));
+        if ( get_gpfn_from_mfn(mfn) >= count )
+        {
+            BUG_ON(is_pv_32bit_domain(d));
+            if ( !paging_mode_translate(d) && !page->u.inuse.type_info &&
+                 !get_page_and_type(page, d, PGT_writable_page) )
+                BUG();
+
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+            ++pfn;
+            if (!(pfn & 0xfffff))
+                process_pending_softirqs();
+        }
+    }
+    BUG_ON(pfn != d->tot_pages);
+#ifndef NDEBUG
+    alloc_epfn += PFN_UP(initrd_len) + si->nr_p2m_frames;
+#endif
+    while ( pfn < nr_pages )
+    {
+        if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
+            panic("Not enough RAM for DOM0 reservation");
+        while ( pfn < d->tot_pages )
+        {
+            mfn = page_to_mfn(page);
+#ifndef NDEBUG
+#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
+#endif
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+#undef pfn
+            page++; pfn++;
+            if (!(pfn & 0xfffff))
+                process_pending_softirqs();
+        }
+    }
+
+    if ( initrd_len != 0 )
+    {
+        si->mod_start = vinitrd_start ?: initrd_pfn;
+        si->mod_len   = initrd_len;
+    }
+
+    memset(si->cmd_line, 0, sizeof(si->cmd_line));
+    if ( cmdline != NULL )
+        strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line));
+
+    if ( fill_console_start_info((void *)(si + 1)) )
+    {
+        si->console.dom0.info_off  = sizeof(struct start_info);
+        si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
+    }
+
+    if ( is_pv_32bit_domain(d) )
+        xlat_start_info(si, XLAT_start_info_console_dom0);
+
+    /* Return to idle domain's page tables. */
+    mapcache_override_current(NULL);
+    write_ptbase(current);
+
+    update_domain_wallclock_time(d);
+
+    v->is_initialised = 1;
+    clear_bit(_VPF_down, &v->pause_flags);
+
+    /*
+     * Initial register values:
+     *  DS,ES,FS,GS = FLAT_KERNEL_DS
+     *       CS:rIP = FLAT_KERNEL_CS:start_pc
+     *       SS:rSP = FLAT_KERNEL_SS:start_stack
+     *          rSI = start_info
+     *  [rAX,rBX,rCX,rDX,rDI,rBP,R8-R15 are zero]
+     */
+    regs = &v->arch.user_regs;
+    regs->ds = regs->es = regs->fs = regs->gs =
+        !is_pv_32bit_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS;
+    regs->ss = (!is_pv_32bit_domain(d) ?
+                FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS);
+    regs->cs = (!is_pv_32bit_domain(d) ?
+                FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS);
+    regs->rip = parms.virt_entry;
+    regs->rsp = vstack_end;
+    regs->rsi = vstartinfo_start;
+    regs->eflags = X86_EFLAGS_IF;
+
+#ifdef CONFIG_SHADOW_PAGING
+    if ( opt_dom0_shadow && paging_enable(d, PG_SH_enable) == 0 )
+        paging_update_paging_modes(v);
+#endif
+
+    /*
+     * PVH Fixme: XENFEAT_supervisor_mode_kernel has been reused in PVH with a
+     * different meaning.
+     */
+    if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) )
+        panic("Dom0 requires supervisor-mode execution");
+
+    rc = dom0_setup_permissions(d);
+    BUG_ON(rc != 0);
+
+    if ( elf_check_broken(&elf) )
+        printk(" Xen warning: dom0 kernel broken ELF: %s\n",
+               elf_check_broken(&elf));
+
+    if ( d->domain_id == hardware_domid )
+        iommu_hwdom_init(d);
+
+    return 0;
+
+out:
+    if ( elf_check_broken(&elf) )
+        printk(" Xen dom0 kernel broken ELF: %s\n",
+               elf_check_broken(&elf));
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/dom0_build.h b/xen/include/asm-x86/dom0_build.h
new file mode 100644
index 0000000000..5d093673d9
--- /dev/null
+++ b/xen/include/asm-x86/dom0_build.h
@@ -0,0 +1,33 @@
+#ifndef _DOM0_BUILD_H_
+#define _DOM0_BUILD_H_
+
+#include <xen/sched.h>
+
+#include <asm/setup.h>
+
+extern unsigned int dom0_memflags;
+
+unsigned long dom0_compute_nr_pages(struct domain *d,
+                                    struct elf_dom_parms *parms,
+                                    unsigned long initrd_len);
+struct vcpu *dom0_setup_vcpu(struct domain *d, unsigned int vcpu_id,
+                             unsigned int cpu);
+int dom0_setup_permissions(struct domain *d);
+
+int dom0_construct_pv(struct domain *d, const module_t *image,
+                      unsigned long image_headroom,
+                      module_t *initrd,
+                      void *(*bootstrap_map)(const module_t *),
+                      char *cmdline);
+
+#endif	/* _DOM0_BUILD_H_ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
2.11.0



Thread overview: 11+ messages
2017-03-20 14:14 [PATCH v3 0/5] Refactor x86 dom0 builder Wei Liu
2017-03-20 14:14 ` [PATCH v3 1/5] x86: rename domain_build.c to dom0_build.c Wei Liu
2017-03-20 14:14 ` [PATCH v3 2/5] x86: modify setup_dom0_vcpu to use dom0_cpus internally Wei Liu
2017-03-20 14:21   ` Andrew Cooper
2017-03-20 15:19   ` Jan Beulich
2017-03-20 15:20     ` Wei Liu
2017-03-20 14:14 ` Wei Liu [this message]
2017-03-20 14:43   ` [PATCH v3 3/5] x86: split PV dom0 builder to pv/dom0_builder.c Andrew Cooper
2017-03-20 14:14 ` [PATCH v3 4/5] x86: split PVH dom0 builder to hvm/dom0_build.c Wei Liu
2017-03-20 14:51   ` Andrew Cooper
2017-03-20 14:14 ` [PATCH v3 5/5] x86: clean up header files in dom0_build.c Wei Liu
