* [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
@ 2013-03-16  1:06 Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-16  1:06 UTC (permalink / raw)
  To: Xen-devel

 Finally, the hardest part. Mostly modify construct_dom0() to boot the PV
 dom0 in PVH mode. Introduce opt_dom0pvh which, when specified on the
 command line, causes dom0 to boot in PVH mode.
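
 A PVH dom0 is then requested by adding "dom0pvh" to the xen command line,
 e.g. something along these lines (illustrative boot entry only, the exact
 syntax depends on the bootloader):

     multiboot /boot/xen.gz dom0pvh dom0_mem=1024M console=vga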

Change in V2:
  - Map the entire IO region upfront in the P2M for PVH dom0.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domain_build.c |  241 +++++++++++++++++++++++++++++++++----------
 xen/arch/x86/mm/hap/hap.c   |   17 +++-
 xen/arch/x86/setup.c        |   10 ++-
 xen/include/asm-x86/hap.h   |    1 +
 4 files changed, 212 insertions(+), 57 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 8c5b27a..72aa70b 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,8 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
+#include <asm/debugger.h>
 
 #include <public/version.h>
 
@@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+/* 
+ * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
+ * the entire io region mapped in the EPT/NPT.
+ */
+static __init void  pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start = 0;
+    const struct e820entry *entry;
+    int rc, i, nump;
+
+    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
+        unsigned long end = entry->addr + entry->size;
+
+        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
+            unsigned long start_pfn = PFN_DOWN(start);
+            unsigned long end_pfn = PFN_UP(end);
+
+            if (entry->type == E820_RAM)
+                end_pfn = PFN_UP(entry->addr);
+
+            if (start_pfn < end_pfn) {
+                nump = end_pfn - start_pfn + 1;
+                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+                BUG_ON(rc);
+            }
+            start = end;
+        }
+    }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                   unsigned long mfn, unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) ) {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void copy_pvh(char *dest, char *src, int bytes)
+{
+    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
+     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
+     * So we just use dbg_rw_mem().
+     */
+    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
+    if (rem) {
+        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
+        BUG();
+    }
+}
+
 int __init construct_dom0(
     struct domain *d,
     const module_t *image, unsigned long image_headroom,
@@ -314,6 +375,7 @@ int __init construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline)
 {
+    char *si_buf=NULL, *tmp_buf=NULL;
     int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
@@ -322,7 +384,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_pfn_addr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -416,6 +478,13 @@ int __init construct_dom0(
     {
         printk("Kernel does not support Dom0 operation\n");
         return -EINVAL;
+
+        if ( is_pvh_domain(d) && 
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
+        {
+            printk("Kernel does not support PVH mode\n");
+            return -EINVAL;
+        }
     }
 
     if ( compat32 )
@@ -480,6 +549,12 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) ) {
+        shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -621,16 +696,26 @@ int __init construct_dom0(
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
         l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
     }
-    clear_page(l4tab);
-    init_guest_l4_table(l4tab, d);
-    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
-    if ( is_pv_32on64_domain(d) )
-        v->arch.guest_table_user = v->arch.guest_table;
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);
+        pfn = 0;
+    } else { 
+        clear_page(l4tab);
+        init_guest_l4_table(l4tab, d);
+        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+        if ( is_pv_32on64_domain(d) )
+            v->arch.guest_table_user = v->arch.guest_table;
+        pfn = alloc_spfn;
+    }
 
     l4tab += l4_table_offset(v_start);
-    pfn = alloc_spfn;
     for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
     {
+        /* initrd chunk's mfns are separate, so we need to adjust for them */
+        signed long pvh_adj = is_pvh_domain(d) ?
+                              (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;
+
         if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
         {
             maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
@@ -657,16 +742,17 @@ int __init construct_dom0(
                     clear_page(l3tab);
                     if ( count == 0 )
                         l3tab += l3_table_offset(v_start);
-                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                    *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
                     l4tab++;
                 }
-                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+                *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
                 l3tab++;
             }
-            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+            *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
             l2tab++;
         }
-        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+        if ( is_pvh_domain(d) ||
+             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
             mfn = pfn++;
         else
             mfn = initrd_mfn++;
@@ -674,6 +760,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -702,6 +791,9 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
+    if  ( is_pvh_domain(d) )
+        goto pvh_skip_pt_rdonly;
+
     /* Pages that are part of page tables must be read only. */
     l4tab = l4start + l4_table_offset(vpt_start);
     l3start = l3tab = l4e_to_l3e(*l4tab);
@@ -741,6 +833,8 @@ int __init construct_dom0(
         }
     }
 
+pvh_skip_pt_rdonly:
+
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
@@ -754,6 +848,11 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
+                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
+    }
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -764,35 +863,16 @@ int __init construct_dom0(
     write_ptbase(v);
     mapcache_override_current(v);
 
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest = (void*)vkern_start;
-    rc = elf_load_binary(&elf, 0);
-    if ( rc < 0 )
-    {
-        printk("Failed to load the kernel binary\n");
-        return rc;
-    }
-    bootstrap_map(NULL);
-
-    if ( UNSET_ADDR != parms.virt_hypercall )
-    {
-        if ( (parms.virt_hypercall < v_start) ||
-             (parms.virt_hypercall >= v_end) )
-        {
-            mapcache_override_current(NULL);
-            write_ptbase(current);
-            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
-            return -1;
+    /* Set up start info area. */
+    if ( is_pvh_domain(d) ) {
+        if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {
+            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
+            return -ENOMEM;
         }
-        hypercall_page_initialise(
-            d, (void *)(unsigned long)parms.virt_hypercall);
-    }
-
-    /* Free temporary buffers. */
-    discard_initial_images();
+        si = (start_info_t *)si_buf;
+    } else
+        si = (start_info_t *)vstartinfo_start;
 
-    /* Set up start info area. */
-    si = (start_info_t *)vstartinfo_start;
     clear_page(si);
     si->nr_pages = nr_pages;
 
@@ -814,7 +894,7 @@ int __init construct_dom0(
     l2tab = NULL;
     l1tab = NULL;
     /* Set up the phys->machine table if not part of the initial mapping. */
-    if ( parms.p2m_base != UNSET_ADDR )
+    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )
     {
         unsigned long va = vphysmap_start;
 
@@ -935,6 +1015,9 @@ int __init construct_dom0(
         unmap_domain_page(l3tab);
     unmap_domain_page(l4start);
 
+    if (is_pvh_domain(d) )
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -951,11 +1034,8 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -971,8 +1051,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -992,11 +1072,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1004,6 +1080,47 @@ int __init construct_dom0(
         }
     }
 
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest = (void*)vkern_start;
+    rc = elf_load_binary(&elf, is_pvh_domain(d) );
+    if ( rc < 0 )
+    {
+        printk("Failed to load the kernel binary\n");
+        return rc;
+    }
+    bootstrap_map(NULL);
+
+    if ( UNSET_ADDR != parms.virt_hypercall )
+    {
+        void *addr;
+
+        if ( is_pvh_domain(d) ) {
+            if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
+                printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
+                return -ENOMEM;
+            }
+            addr = tmp_buf;
+        } else 
+            addr = (void *)parms.virt_hypercall;
+
+        if ( (parms.virt_hypercall < v_start) ||
+             (parms.virt_hypercall >= v_end) )
+        {
+            write_ptbase(current);
+            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+            return -1;
+        }
+        hypercall_page_initialise(d, addr);
+
+        if ( is_pvh_domain(d) ) {
+            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
+            xfree(tmp_buf);
+        }
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
     if ( initrd_len != 0 )
     {
         si->mod_start = vinitrd_start ?: initrd_pfn;
@@ -1019,6 +1136,15 @@ int __init construct_dom0(
         si->console.dom0.info_off  = sizeof(struct start_info);
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
+    if ( is_pvh_domain(d) ) {
+        unsigned long mfn = virt_to_mfn(d->shared_info);
+        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
+        si->shared_info = shared_info_pfn_addr;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
+        xfree(si_buf);
+    }
 
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
@@ -1050,12 +1176,16 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | 0x2;
 
-    if ( opt_dom0_shadow )
+    if ( opt_dom0_shadow ) {
+        if ( is_pvh_domain(d) ) {
+            printk("Invalid option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
-
+    }
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
@@ -1132,6 +1262,9 @@ int __init construct_dom0(
 
     BUG_ON(rc != 0);
 
+    if ( is_pvh_domain(d) )
+        pvh_map_all_iomem(d);
+
     iommu_dom0_init(dom0);
 
     return 0;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 055833d..d3d5697 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
+/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
@@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
 const struct paging_mode *
 hap_paging_get_mode(struct vcpu *v)
 {
-    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
+        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
         hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
         hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
                                    &hap_paging_protected_mode;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 43301a5..f307f24 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
 static bool_t __initdata disable_smep;
 invbool_param("smep", disable_smep);
 
+/* Boot dom0 in PVH mode */
+static bool_t __initdata opt_dom0pvh;
+boolean_param("dom0pvh", opt_dom0pvh);
+
 /* **** Linux config option: propagated to domain0. */
 /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
 /* "acpi=force":  Override the disable blacklist.                   */
@@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, domcr_flags = 0;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_headroom, *module_map;
@@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
         panic("Could not protect TXT memory regions\n");
 
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
+    domcr_flags |= DOMCRF_s3_integrity;
+    dom0 = domain_create(0, domcr_flags, 0);
     if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
 
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
-- 
1.7.2.3

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
@ 2013-03-18 13:01 ` Jan Beulich
  2013-03-27  0:34   ` Mukesh Rathor
  2013-03-29  0:32   ` Mukesh Rathor
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
  2013-03-19  9:27 ` Jan Beulich
  2 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2013-03-18 13:01 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel

 >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/* 
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void  pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;
> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
> +                BUG_ON(rc);
> +            }
> +            start = end;
> +        }
> +    }

At least E820_UNUSABLE must be excluded here.
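
For illustration, one way that could look - an untested sketch on top of the
quoted loop, not part of the posted patch - is to treat E820_UNUSABLE entries
the same way as RAM, so the unusable range itself is never put into the p2m:

    if (entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
        i == e820.nr_map - 1) {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long end_pfn = PFN_UP(end);

        /* map only the hole below this entry, never the entry itself */
        if (entry->type == E820_RAM || entry->type == E820_UNUSABLE)
            end_pfn = PFN_UP(entry->addr);

        if (start_pfn < end_pfn)
            ...   /* domctl_memory_mapping() as before */
        start = end;
    }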

And as you're mapping the holes only - how do you deal with
the MMIO range past end of RAM? And perhaps even more
important - how do you deal with the split between RAM and
MMIO not being at the end of currently populated RAM, but
at the end of possible hotpluggable regions.

> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);

Same comment as before: This is not acceptable for a submission
of a patch intended to be committed (i.e. non-RFC). You should
have worked out a suitable solution to this before posting.

> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;
> +
> +        if ( is_pvh_domain(d) && 
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }

Adding dead code (after a return).
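
I.e. presumably the PVH feature check wants to be its own block after the one
it is currently buried in, something along the lines of (sketch only):

    if ( is_pvh_domain(d) &&
         !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
    {
        printk("Kernel does not support PVH mode\n");
        return -EINVAL;
    }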

> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Am I understanding right that you're making v->arch.guest_table
store a guest physical address rather than a host physical one?
Can that really be done consistently across of uses of this field?

Jan

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
@ 2013-03-18 20:06 ` Konrad Rzeszutek Wilk
  2013-03-26 23:25   ` Mukesh Rathor
  2013-03-19  9:27 ` Jan Beulich
  2 siblings, 1 reply; 10+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-03-18 20:06 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel

On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
>  Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
>  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
>  line, causes dom0 to boot in PVH mode.
> 
> Change in V2:
>   - Map the entire IO region upfront in the P2M for PVH dom0.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> ---
>  xen/arch/x86/domain_build.c |  241 +++++++++++++++++++++++++++++++++----------
>  xen/arch/x86/mm/hap/hap.c   |   17 +++-
>  xen/arch/x86/setup.c        |   10 ++-
>  xen/include/asm-x86/hap.h   |    1 +
>  4 files changed, 212 insertions(+), 57 deletions(-)
> 
> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> index 8c5b27a..72aa70b 100644
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -35,6 +35,8 @@
>  #include <asm/setup.h>
>  #include <asm/bzimage.h> /* for bzimage_parse */
>  #include <asm/io_apic.h>
> +#include <asm/hap.h>
> +#include <asm/debugger.h>
>  
>  #include <public/version.h>
>  
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/* 
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void  pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;

unsigned int for 'nump'.

Could it be called 'pfns' ?

> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);

Probably want a comment for the '1' by saying: /* adding */.

> +                BUG_ON(rc);

It would help to have a bit more detail.
Perhaps
		if (rc) {
			printk(XENLOG_ERR "1-1 mapping on %lx -> %lx for %ld failed with %d!\n",
				start_pfn, end_pfn, pfns, rc);
			BUG_ON(rc);
		}
> +            }
> +            start = end;
> +        }
> +    }
> +}
> +
> +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
> +                                   unsigned long mfn, unsigned long vphysmap_s)
> +{
> +    if ( is_pvh_domain(d) ) {
> +        int rc = guest_physmap_add_page(d, pfn, mfn, 0);

A comment about the 0 would be good. And you can collapse this into:

	BUG_ON( guest_physmap_add_page(d, pfn, mfn, 0 /* remove */) );

> +        BUG_ON(rc);
> +        return;
> +    }
> +    if ( !is_pv_32on64_domain(d) )
> +        ((unsigned long *)vphysmap_s)[pfn] = mfn;
> +    else
> +        ((unsigned int *)vphysmap_s)[pfn] = mfn;
> +
> +    set_gpfn_from_mfn(mfn, pfn);
> +}
> +
> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
> +    if (rem) {
> +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
> +        BUG();

Wait a minute? This is debug code but you use it for copying?! Why not
__copy_to_user?

> +    }
> +}
> +
>  int __init construct_dom0(
>      struct domain *d,
>      const module_t *image, unsigned long image_headroom,
> @@ -314,6 +375,7 @@ int __init construct_dom0(
>      void *(*bootstrap_map)(const module_t *),
>      char *cmdline)
>  {
> +    char *si_buf=NULL, *tmp_buf=NULL;
>      int i, cpu, rc, compatible, compat32, order, machine;
>      struct cpu_user_regs *regs;
>      unsigned long pfn, mfn;
> @@ -322,7 +384,7 @@ int __init construct_dom0(
>      unsigned long alloc_spfn;
>      unsigned long alloc_epfn;
>      unsigned long initrd_pfn = -1, initrd_mfn = 0;
> -    unsigned long count;
> +    unsigned long count, shared_info_pfn_addr = 0;
>      struct page_info *page = NULL;
>      start_info_t *si;
>      struct vcpu *v = d->vcpu[0];
> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;

You are adding it right after the return - so it will never be reached.

> +
> +        if ( is_pvh_domain(d) && 
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }
>      }
>  
>      if ( compat32 )
> @@ -480,6 +549,12 @@ int __init construct_dom0(


So .. what about this (the code right above the vstartinfo_end assignment):

 474     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
 475                                                      sizeof(unsigned long) :
 476                                                      sizeof(unsigned int)));

That is the code that figures out how big the P2M array should be.
Shouldn't that be just
	vphysmap_end = vphysmap_start?

Or are you doing some cunning work and setting the XEN_ELFNOTE_INIT_P2M to
point to some wacky address, and later on in this code checking for the
base (params.p2m_base) and, if so, not creating the necessary space for the P2M array?

>      vstartinfo_end   = (vstartinfo_start +
>                          sizeof(struct start_info) +
>                          sizeof(struct dom0_vga_console_info));
> +
> +    if ( is_pvh_domain(d) ) {
> +        shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;

Huh. I don't think that is the PFN. Rather it is the physical address
offset. Perhaps it should be called 'shared_info_gaddr' ? Looking at the code
above, this means it is past the kernel, past the ramdisk, past the P2M array,
and past the 'struct start_info' and 'dom0_vga_console_info'.

Which means that include/public/xen.h also needs this:

 *  1. The domain is started within contiguous virtual-memory region.
 *  2. The contiguous region ends on an aligned 4MB boundary.
 *  3. This the order of bootstrap elements in the initial virtual region:
 *      a. relocated kernel image
 *      b. initial ram disk              [mod_start, mod_len]
 *      c. list of allocated page frames [mfn_list, nr_pages]
 *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
 *      d. start_info_t structure        [register ESI (x86)]

==>     d1. And struct shared_info_t structure	[shared_info]
	    if autotranslated guest.

 *      e. bootstrap page tables         [pt_base and CR3 (x86)]
 *      f. bootstrap stack               [register ESP (x86)]


Which looks to match what the toolstack does.


> +        vstartinfo_end   += PAGE_SIZE;
> +    }
> +
>      vpt_start        = round_pgup(vstartinfo_end);
>      for ( nr_pt_pages = 2; ; nr_pt_pages++ )
>      {
> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Couldn't we use mpt_alloc? It should point to the same location?
So with this code we still allocated one page for the L4:

 616         page = alloc_domheap_page(NULL, 0);

Do you think it makes sense to free it? You are not
using it - you are using the mpt_alloc as your L4. And you did
not clean it either. Perhaps also have

	clear_page(mpt_alloc);


> +        pfn = 0;

OK, so instead of starting at alloc_spfn, which is the starting MFN
of the region that covers the kernel, initrd, P2M array and the three
PAGE_SIZE structs, you start at zero.

I think this warrants a comment. I believe this is b/c it does not
matter to Xen - you are just creating a nice initial page-tables
that the guest can use and the MFNs in it - have to be PFNs as
it is running in auto-translated mode right?

In which case you could also give this extra comment:

	/* Since the guest is going to run in auto-translated
	 * mode and none of the paging is done via PV MMU calls
	 * we create the page-tables from GPFN=0 up to bootstrap.
	 * TODO: Instead of using 4KB L4 we could use 2MB or so
	 * similar to how the P2M array can if parms.p2m_base is set
	 */

which begs a question. You are doing it from PFN 0 up to d->tot_pages
and also include in this region the P2M array. Why?

> +    } else { 
> +        clear_page(l4tab);
> +        init_guest_l4_table(l4tab, d);
> +        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> +        if ( is_pv_32on64_domain(d) )
> +            v->arch.guest_table_user = v->arch.guest_table;
> +        pfn = alloc_spfn;
> +    }
>  
>      l4tab += l4_table_offset(v_start);
> -    pfn = alloc_spfn;
>      for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
>      {
> +        /* initrd chunk's mfns are separate, so we need to adjust for them */

Perhaps you can say:

	/* initrd chunk (see the 'alloc_domheap_pages' calls above) is allocated
	   from a separate MFN chunk. Hence we need to adjust for that. */


> +        signed long pvh_adj = is_pvh_domain(d) ?

Just 'long' and perhaps 'pvh_offset'? Or
'initrd_offset'?

> +                              (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;

Oh gosh, what a mix. So alloc_spfn is actually the MFN of the memory allocated
for the kernel. The PFN_UP(initrd_len) gives the length of the initrd in pfns.
And you are using 'GPFN's as 'pfn' here. And the code 'if (count < initrd_pfn)'
is short-circuited - so we will only put in 'pfn 0->d_tot_pages'.

So a 50MB initrd is 12800 pfns. Wait. Why are you subtracting that value from the
MFN of the memory that will be used for the kernel (say it is at 0x100 MFN for fun)?

I am not sure I understand that. Could you explain it to me please?

> +
>          if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
>          {
>              maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
> @@ -657,16 +742,17 @@ int __init construct_dom0(
>                      clear_page(l3tab);
>                      if ( count == 0 )
>                          l3tab += l3_table_offset(v_start);
> -                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
> +                    *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
>                      l4tab++;
>                  }
> -                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
> +                *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
>                  l3tab++;
>              }
> -            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
> +            *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
>              l2tab++;
>          }
> -        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
> +        if ( is_pvh_domain(d) ||
> +             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
>              mfn = pfn++;
>          else
>              mfn = initrd_mfn++;
> @@ -674,6 +760,9 @@ int __init construct_dom0(
>                                      L1_PROT : COMPAT_L1_PROT));
>          l1tab++;
>  
> +        if ( is_pvh_domain(d) )
> +            continue;
> +
>          page = mfn_to_page(mfn);
>          if ( (page->u.inuse.type_info == 0) &&
>               !get_page_and_type(page, d, PGT_writable_page) )
> @@ -702,6 +791,9 @@ int __init construct_dom0(
>                 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
>      }
>  
> +    if  ( is_pvh_domain(d) )
> +        goto pvh_skip_pt_rdonly;
> +
>      /* Pages that are part of page tables must be read only. */
>      l4tab = l4start + l4_table_offset(vpt_start);
>      l3start = l3tab = l4e_to_l3e(*l4tab);
> @@ -741,6 +833,8 @@ int __init construct_dom0(
>          }
>      }
>  
> +pvh_skip_pt_rdonly:
> +

Hm, so the pagetable creation for the initial page-tables virtual addresses
has been eliminated. That means the initial calculation of 'order'
(alloc_domheap_pages) and the big loop of:

484     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
485     {

calculates the wrong amount right?

This needs a bit of mulling over. Since most of the Xen PV MMU code is disabled
that means (from a Linux kernel perspective), that we do bootup with

	init_level4_pgt[272]
		-> level3_ident_pgt[0]
			level2_ident_pgt [0-511]
				[copy from the L2 that we created above.]
	init_level4_pgt[511]
		-> level3_kernel_pgt [510]
			-> level2_kernel_pgt [0-511]
				[copy from the L2 that we created above.]
		

Since this is a whole-sale copy and only created page-tables from
[0 -> end of 'shared info' structure] and did create any page-table entries
for the L4, L3, L2, L1, L1, L1.... page-tables.

So any __va or __ka will work. And cleanup_highmap takes care of ripping out
any entries from _brk_limit up to max_pfn_mapped (end of ramdisk).

OK, so far so good. Wait, no, it is not good - max_pfn_mapped is up to:
xen_start_info->mfn_list. That is the ramdisk.

But that means that __ka for the shared_struct and dom0_vga_console and
start_info are <p2m array + xen_start_info->mfn_list> in. And cleanup_highmap
would rip that out. Ah, we do have a bunch of revectoring:

	__va(xen_start_info->shared_info);

and
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));

in the initial boot paths. So that is OK.


OK. The xen_pagetable_init uses level2_kernel_pgt (so in Linux __ka space)
to walk and clear up the PMD entries.

And since we do not use the __ka to update the pagetables, but switch
over to using __va as fast as possible this does look to work.


But this needs to be documented. And we probably should not allocate
that extra space for L1 entries that we are never going to use.


>      /* Mask all upcalls... */
>      for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
>          shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
> @@ -754,6 +848,11 @@ int __init construct_dom0(
>          (void)alloc_vcpu(d, i, cpu);
>      }
>  
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
> +                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
> +    }
>      /* Set up CR3 value for write_ptbase */
>      if ( paging_mode_enabled(d) )
>          paging_update_paging_modes(v);
> @@ -764,35 +863,16 @@ int __init construct_dom0(
>      write_ptbase(v);
>      mapcache_override_current(v);
>  
> -    /* Copy the OS image and free temporary buffer. */
> -    elf.dest = (void*)vkern_start;
> -    rc = elf_load_binary(&elf, 0);
> -    if ( rc < 0 )
> -    {
> -        printk("Failed to load the kernel binary\n");
> -        return rc;
> -    }
> -    bootstrap_map(NULL);
> -
> -    if ( UNSET_ADDR != parms.virt_hypercall )
> -    {
> -        if ( (parms.virt_hypercall < v_start) ||
> -             (parms.virt_hypercall >= v_end) )
> -        {
> -            mapcache_override_current(NULL);
> -            write_ptbase(current);
> -            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> -            return -1;

Wait, you just ripped out this code? Why?!

> +    /* Set up start info area. */
> +    if ( is_pvh_domain(d) ) {
> +        if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {

Odd style.
> +            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
> +            return -ENOMEM;
>          }
> -        hypercall_page_initialise(
> -            d, (void *)(unsigned long)parms.virt_hypercall);
> -    }
> -
> -    /* Free temporary buffers. */
> -    discard_initial_images();
> +        si = (start_info_t *)si_buf;
> +    } else
> +        si = (start_info_t *)vstartinfo_start;

Why do we allocate a page for this? Isn't the allocation for how
many pages are needed for this space already taken care of in:

479     vstartinfo_start = round_pgup(vphysmap_end);
 480     vstartinfo_end   = (vstartinfo_start +
 481                         sizeof(struct start_info) +
 482                         sizeof(struct dom0_vga_console_info));
 483     vpt_start        = round_pgup(vstartinfo_end);

and then

 517     page = alloc_domheap_pages(d, order, 0);

allocates the appropriate amount?


>  
> -    /* Set up start info area. */
> -    si = (start_info_t *)vstartinfo_start;
>      clear_page(si);
>      si->nr_pages = nr_pages;
>  
> @@ -814,7 +894,7 @@ int __init construct_dom0(
>      l2tab = NULL;
>      l1tab = NULL;
>      /* Set up the phys->machine table if not part of the initial mapping. */
> -    if ( parms.p2m_base != UNSET_ADDR )
> +    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )

AHA! I knew it! You could also do something like this at the start of the code:

	params.p2m_base = 0xdeadbeef;

Or something better? 0xdeadf00d ?

And then in here check if we use 0xdeadbeef and we skip over setting
up the page-table entries for the P2M?
>      {
>          unsigned long va = vphysmap_start;
>  
> @@ -935,6 +1015,9 @@ int __init construct_dom0(
>          unmap_domain_page(l3tab);
>      unmap_domain_page(l4start);
>  
> +    if (is_pvh_domain(d) )
> +        hap_set_pvh_alloc_for_dom0(d, nr_pages);
> +
>      /* Write the phys->machine and machine->phys table entries. */
>      for ( pfn = 0; pfn < count; pfn++ )
>      {
> @@ -951,11 +1034,8 @@ int __init construct_dom0(
>          if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
>              mfn = alloc_epfn - (pfn - REVERSE_START);
>  #endif
> -        if ( !is_pv_32on64_domain(d) )
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -        else
> -            ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -        set_gpfn_from_mfn(mfn, pfn);
> +        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
> +
>          if (!(pfn & 0xfffff))
>              process_pending_softirqs();
>      }
> @@ -971,8 +1051,8 @@ int __init construct_dom0(
>              if ( !page->u.inuse.type_info &&
>                   !get_page_and_type(page, d, PGT_writable_page) )
>                  BUG();
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>              ++pfn;
>              if (!(pfn & 0xfffff))
>                  process_pending_softirqs();
> @@ -992,11 +1072,7 @@ int __init construct_dom0(
>  #ifndef NDEBUG
>  #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
>  #endif
> -            if ( !is_pv_32on64_domain(d) )
> -                ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            else
> -                ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>  #undef pfn
>              page++; pfn++;
>              if (!(pfn & 0xfffff))
> @@ -1004,6 +1080,47 @@ int __init construct_dom0(
>          }
>      }
>  
> +    /* Copy the OS image and free temporary buffer. */
> +    elf.dest = (void*)vkern_start;
> +    rc = elf_load_binary(&elf, is_pvh_domain(d) );
> +    if ( rc < 0 )
> +    {
> +        printk("Failed to load the kernel binary\n");
> +        return rc;
> +    }
> +    bootstrap_map(NULL);
> +
> +    if ( UNSET_ADDR != parms.virt_hypercall )
> +    {
> +        void *addr;
> +
> +        if ( is_pvh_domain(d) ) {
> +            if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
> +                printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
> +                return -ENOMEM;
> +            }
> +            addr = tmp_buf;
> +        } else 
> +            addr = (void *)parms.virt_hypercall;
> +
> +        if ( (parms.virt_hypercall < v_start) ||
> +             (parms.virt_hypercall >= v_end) )
> +        {
> +            write_ptbase(current);
> +            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> +            return -1;
> +        }
> +        hypercall_page_initialise(d, addr);
> +
> +        if ( is_pvh_domain(d) ) {
> +            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
> +            xfree(tmp_buf);
> +        }
> +    }

Wait? Why the move of the code here?

> +
> +    /* Free temporary buffers. */
> +    discard_initial_images();
> +
>      if ( initrd_len != 0 )
>      {
>          si->mod_start = vinitrd_start ?: initrd_pfn;
> @@ -1019,6 +1136,15 @@ int __init construct_dom0(
>          si->console.dom0.info_off  = sizeof(struct start_info);
>          si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
>      }
> +    if ( is_pvh_domain(d) ) {
> +        unsigned long mfn = virt_to_mfn(d->shared_info);
> +        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
> +        si->shared_info = shared_info_pfn_addr;

So that is not a PFN. That is the guest physical address, I reckon?

Ah, which matches (kind of) the comment:

725     unsigned long shared_info;  /* MACHINE address of shared info struct. */


> +        dom0_update_physmap(d, pfn, mfn, 0);

, 0 /* ignored. */

> +
> +        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
> +        xfree(si_buf);

Oh. Why the copy? Why not just operate directly on the vstartinfo_start virtual
address?


> +    }
>  
>      if ( is_pv_32on64_domain(d) )
>          xlat_start_info(si, XLAT_start_info_console_dom0);
> @@ -1050,12 +1176,16 @@ int __init construct_dom0(
>      regs->eip = parms.virt_entry;
>      regs->esp = vstack_end;
>      regs->esi = vstartinfo_start;
> -    regs->eflags = X86_EFLAGS_IF;
> +    regs->eflags = X86_EFLAGS_IF | 0x2;

Ahem!
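
(That 0x2 is presumably bit 1 of EFLAGS, the reserved bit that must always be
set; if so, using the existing X86_EFLAGS_MBS define would be clearer than a
bare 0x2.)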

>  
> -    if ( opt_dom0_shadow )
> +    if ( opt_dom0_shadow ) {
> +        if ( is_pvh_domain(d) ) {
> +            printk("Invalid option dom0_shadow for PVH\n");
> +            return -EINVAL;
> +        }
>          if ( paging_enable(d, PG_SH_enable) == 0 ) 
>              paging_update_paging_modes(v);
> -
> +    }
>      if ( supervisor_mode_kernel )
>      {
>          v->arch.pv_vcpu.kernel_ss &= ~3;
> @@ -1132,6 +1262,9 @@ int __init construct_dom0(
>  
>      BUG_ON(rc != 0);
>  
> +    if ( is_pvh_domain(d) )
> +        pvh_map_all_iomem(d);
> +
>      iommu_dom0_init(dom0);
>  
>      return 0;
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 055833d..d3d5697 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
>      }
>  }
>  
> +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
> +{
> +    int rc;
> +    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
> +
> +    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));

Could you include an explanation behind this formula please?

> +    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);

num_pages = memkb >> 2 ?
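
(If memory serves, the comment over libxl's libxl__get_required_shadow_memory()
explains the first line: 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM
for the P2M map, plus 1 page per MiB of RAM to shadow resident processes, with
the "4 *" converting pages to KB - worth restating here. As for the second
line: with 4K pages, 20 - PAGE_SHIFT is 8, so it rounds memkb up to whole MiB
and converts at 256 pages per MiB, i.e. effectively memkb >> 2 apart from the
rounding.)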

> +    paging_lock(d);
> +    rc = hap_set_allocation(d, num_pages, NULL);
> +    paging_unlock(d);
> +    BUG_ON(rc);
> +}
> +
>  static const struct paging_mode hap_paging_real_mode;
>  static const struct paging_mode hap_paging_protected_mode;
>  static const struct paging_mode hap_paging_pae_mode;
> @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
>  const struct paging_mode *
>  hap_paging_get_mode(struct vcpu *v)
>  {
> -    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
> +    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
> +        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
>          hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
>          hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
>                                     &hap_paging_protected_mode;
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 43301a5..f307f24 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
>  static bool_t __initdata disable_smep;
>  invbool_param("smep", disable_smep);
>  
> +/* Boot dom0 in PVH mode */
> +static bool_t __initdata opt_dom0pvh;
> +boolean_param("dom0pvh", opt_dom0pvh);
> +
>  /* **** Linux config option: propagated to domain0. */
>  /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
>  /* "acpi=force":  Override the disable blacklist.                   */
> @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
>  {
>      char *memmap_type = NULL;
>      char *cmdline, *kextra, *loader;
> -    unsigned int initrdidx;
> +    unsigned int initrdidx, domcr_flags = 0;
>      multiboot_info_t *mbi = __va(mbi_p);
>      module_t *mod = (module_t *)__va(mbi->mods_addr);
>      unsigned long nr_pages, modules_headroom, *module_map;
> @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
>          panic("Could not protect TXT memory regions\n");
>  
>      /* Create initial domain 0. */
> -    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
> +    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
> +    domcr_flags |= DOMCRF_s3_integrity;
> +    dom0 = domain_create(0, domcr_flags, 0);
>      if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
>          panic("Error creating domain 0\n");
>  
> diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> index e03f983..aab8558 100644
> --- a/xen/include/asm-x86/hap.h
> +++ b/xen/include/asm-x86/hap.h
> @@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
>                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
>  
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
>  
>  #endif /* XEN_HAP_H */
>  
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
@ 2013-03-19  9:27 ` Jan Beulich
  2013-03-19 13:32   ` Konrad Rzeszutek Wilk
  2013-03-26 23:42   ` Mukesh Rathor
  2 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2013-03-19  9:27 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Konrad Rzeszutek Wilk, xen-devel

>>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
>  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
>  line, causes dom0 to boot in PVH mode.

Now that Konrad mentioned that PVH is intended for 64-bit guests
only, I fail to see where this restriction is being enforced in this
patch. And it would certainly have been worthwhile to state that
more prominently, even more so since Konrad keeps telling people
that this patch set is expected to initiate a deprecation phase for
the PV MMU interfaces in Linux (which then would mean no 32-bit
PV guest support at all anymore).

Jan

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-19  9:27 ` Jan Beulich
@ 2013-03-19 13:32   ` Konrad Rzeszutek Wilk
  2013-03-26 23:42   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-03-19 13:32 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Tue, Mar 19, 2013 at 09:27:34AM +0000, Jan Beulich wrote:
> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
> >  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
> >  line, causes dom0 to boot in PVH mode.
> 
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that
> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

It was not in my mind for the first stage of this work, though I
have to admit that the 32-bit part completely vanished from my mind.

You are right - we then need 32-bit support, otherwise we are screwed during
the X years in which we initiate the deprecation phase of the upstream
PV MMU interface.

I was not expecting this set of patches to be the 'ok, the deprecation
timer starts ticking once the patch goes into Xen' moment - rather, that
should happen when folks agree that it is the right time.

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
@ 2013-03-26 23:25   ` Mukesh Rathor
  0 siblings, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-26 23:25 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Xen-devel

On Mon, 18 Mar 2013 16:06:02 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs
> > curr 
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't
> > use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src,
> > bytes, 0, 1, 0);
> > +    if (rem) {
> > +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n",
> > bytes, rem);
> > +        BUG();
> 
> Wait a minute? This is debug code but you use it for copying?! Why not
> __copy_to_user?

No, it's not debug code; I wrote the function to work for all
guests under all conditions, but it was only used by the debugger, so I
called it dbg_*.

Looking at alternatives.
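
One possible direction (an untested sketch with a hypothetical helper name,
not something from the tree): resolve the destination gfn by gfn and copy
through a transient mapping, so nothing relies on 'current' being an HVM/PVH
vcpu:

    /* Hypothetical helper, sketched for discussion only. */
    static int __init copy_to_pvh_guest(struct domain *d, unsigned long gpa,
                                        const char *src, unsigned int len)
    {
        while ( len )
        {
            unsigned long gfn = gpa >> PAGE_SHIFT;
            unsigned int off = gpa & ~PAGE_MASK;
            unsigned int chunk = min(len, (unsigned int)(PAGE_SIZE - off));
            struct page_info *pg = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
            char *va;

            if ( !pg )
                return -EINVAL;
            va = __map_domain_page(pg);
            memcpy(va + off, src, chunk);
            unmap_domain_page(va);
            put_page(pg);

            gpa += chunk; src += chunk; len -= chunk;
        }
        return 0;
    }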

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-19  9:27 ` Jan Beulich
  2013-03-19 13:32   ` Konrad Rzeszutek Wilk
@ 2013-03-26 23:42   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-26 23:42 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Konrad Rzeszutek Wilk, xen-devel

On Tue, 19 Mar 2013 09:27:34 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV
> > dom0 in PVH mode. Introduce, opt_dom0pvh, which when specified in
> > the command line, causes dom0 to boot in PVH mode.
> 
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that

Yup. I forgot to mention that in the cover letter, and forgot to add
a check for that. Will be in the next patch for sure. Right now, there
is no code in linux for 32bit PVH btw.

> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

Well, we definitely intend to head that way. And it is most feasible
to do this big work in steps. We are by no means done when this phase I 
patch is checked in. I've already outlined phases II, III, etc. Having
some baseline working in xen will allow me to progress faster, and allow
others to contribute.

I think I should leave PVH disabled until we have reached a satisfactory
point, and we can enable it in future.

thanks
Mukesh

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 13:01 ` Jan Beulich
@ 2013-03-27  0:34   ` Mukesh Rathor
  2013-03-29  0:32   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-27  0:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>  >>> wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs
> > curr 
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't
> > use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src,
> > bytes, 0, 1, 0);
> 
> Same comment as before: This is not acceptable for a submission
> of a patch intended to be committed (i.e. non-RFC). You should
> have worked out a suitable solution to this before posting.

Well, the prev patch was RFC and I didn't hear anything on this, so I
(mis)understood that this was acceptable. OK, I will just implement
copy_pvh() here.

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 13:01 ` Jan Beulich
  2013-03-27  0:34   ` Mukesh Rathor
@ 2013-03-29  0:32   ` Mukesh Rathor
  2013-04-02  7:03     ` Jan Beulich
  1 sibling, 1 reply; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-29  0:32 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>  >>> wrote:
> > @@ -307,6 +309,65 @@ static void __init
> > process_dom0_ioports_disable(void) }
> >  }
> >  
> > +/* 
> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0
> > will have
> > + * the entire io region mapped in the EPT/NPT.
> > + */
> > +static __init void  pvh_map_all_iomem(struct domain *d)
> > +{
> > +    unsigned long start = 0;
> > +    const struct e820entry *entry;
> > +    int rc, i, nump;
> > +
> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> > +        unsigned long end = entry->addr + entry->size;
> > +
> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> > +            unsigned long start_pfn = PFN_DOWN(start);
> > +            unsigned long end_pfn = PFN_UP(end);
> > +
> > +            if (entry->type == E820_RAM)
> > +                end_pfn = PFN_UP(entry->addr);
> > +
> > +            if (start_pfn < end_pfn) {
> > +                nump = end_pfn - start_pfn + 1;
> > +                rc = domctl_memory_mapping(d, start_pfn,
> > start_pfn, nump, 1);
> > +                BUG_ON(rc);
> > +            }
> > +            start = end;
> > +        }
> > +    }
> 
> At least E820_UNUSABLE must be excluded here.
> 
> And as you're mapping the holes only - how do you deal with
> the MMIO range past end of RAM? And perhaps even more
> important - how do you deal with the split between RAM and
> MMIO not being at the end of currently populated RAM, but
> at the end of possible hotpluggable regions.

Right now, in phase I, there is no support for hotplug. Are there any other
cases of this? If yes, can you please give an example?

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-29  0:32   ` Mukesh Rathor
@ 2013-04-02  7:03     ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2013-04-02  7:03 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel

>>> On 29.03.13 at 01:32, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 13:01:23 +0000
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>>  >>> wrote:
>> > @@ -307,6 +309,65 @@ static void __init
>> > process_dom0_ioports_disable(void) }
>> >  }
>> >  
>> > +/* 
>> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0
>> > will have
>> > + * the entire io region mapped in the EPT/NPT.
>> > + */
>> > +static __init void  pvh_map_all_iomem(struct domain *d)
>> > +{
>> > +    unsigned long start = 0;
>> > +    const struct e820entry *entry;
>> > +    int rc, i, nump;
>> > +
>> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
>> > +        unsigned long end = entry->addr + entry->size;
>> > +
>> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
>> > +            unsigned long start_pfn = PFN_DOWN(start);
>> > +            unsigned long end_pfn = PFN_UP(end);
>> > +
>> > +            if (entry->type == E820_RAM)
>> > +                end_pfn = PFN_UP(entry->addr);
>> > +
>> > +            if (start_pfn < end_pfn) {
>> > +                nump = end_pfn - start_pfn + 1;
>> > +                rc = domctl_memory_mapping(d, start_pfn,
>> > start_pfn, nump, 1);
>> > +                BUG_ON(rc);
>> > +            }
>> > +            start = end;
>> > +        }
>> > +    }
>> 
>> At least E820_UNUSABLE must be excluded here.
>> 
>> And as you're mapping the holes only - how do you deal with
>> the MMIO range past end of RAM? And perhaps even more
>> important - how do you deal with the split between RAM and
>> MMIO not being at the end of currently populated RAM, but
>> at the end of possible hotpluggable regions.
> 
> Right now, phase I, no support for hotplug. Are there any other cases
> of this, can you please give an example if yes?

On a system with just 2Gb (and no hotplug) the code as I read it
stops mapping at the 2Gb boundary, i.e. all MMIO regions (up to
4Gb as well as beyond) would remain unmapped.

Jan
