* [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
@ 2013-03-16  1:06 Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-16  1:06 UTC (permalink / raw)
  To: Xen-devel

 Finally, the hardest part. Mostly modify construct_dom0() to boot the PV
 dom0 in PVH mode. Introduce opt_dom0pvh which, when specified on the
 command line, causes dom0 to boot in PVH mode.
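
 A PVH dom0 is then requested by adding "dom0pvh" to the xen command line,
 e.g. something along these lines (illustrative boot entry only, the exact
 syntax depends on the bootloader):

     multiboot /boot/xen.gz dom0pvh dom0_mem=1024M console=vga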

Change in V2:
  - Map the entire IO region upfront in the P2M for PVH dom0.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domain_build.c |  241 +++++++++++++++++++++++++++++++++----------
 xen/arch/x86/mm/hap/hap.c   |   17 +++-
 xen/arch/x86/setup.c        |   10 ++-
 xen/include/asm-x86/hap.h   |    1 +
 4 files changed, 212 insertions(+), 57 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 8c5b27a..72aa70b 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,8 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
+#include <asm/debugger.h>
 
 #include <public/version.h>
 
@@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+/* 
+ * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
+ * the entire io region mapped in the EPT/NPT.
+ */
+static __init void  pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start = 0;
+    const struct e820entry *entry;
+    int rc, i, nump;
+
+    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
+        unsigned long end = entry->addr + entry->size;
+
+        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
+            unsigned long start_pfn = PFN_DOWN(start);
+            unsigned long end_pfn = PFN_UP(end);
+
+            if (entry->type == E820_RAM)
+                end_pfn = PFN_UP(entry->addr);
+
+            if (start_pfn < end_pfn) {
+                nump = end_pfn - start_pfn + 1;
+                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+                BUG_ON(rc);
+            }
+            start = end;
+        }
+    }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                   unsigned long mfn, unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) ) {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void copy_pvh(char *dest, char *src, int bytes)
+{
+    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
+     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
+     * So we just use dbg_rw_mem().
+     */
+    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
+    if (rem) {
+        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
+        BUG();
+    }
+}
+
 int __init construct_dom0(
     struct domain *d,
     const module_t *image, unsigned long image_headroom,
@@ -314,6 +375,7 @@ int __init construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline)
 {
+    char *si_buf=NULL, *tmp_buf=NULL;
     int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
@@ -322,7 +384,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_pfn_addr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -416,6 +478,13 @@ int __init construct_dom0(
     {
         printk("Kernel does not support Dom0 operation\n");
         return -EINVAL;
+
+        if ( is_pvh_domain(d) && 
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
+        {
+            printk("Kernel does not support PVH mode\n");
+            return -EINVAL;
+        }
     }
 
     if ( compat32 )
@@ -480,6 +549,12 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) ) {
+        shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -621,16 +696,26 @@ int __init construct_dom0(
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
         l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
     }
-    clear_page(l4tab);
-    init_guest_l4_table(l4tab, d);
-    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
-    if ( is_pv_32on64_domain(d) )
-        v->arch.guest_table_user = v->arch.guest_table;
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);
+        pfn = 0;
+    } else { 
+        clear_page(l4tab);
+        init_guest_l4_table(l4tab, d);
+        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+        if ( is_pv_32on64_domain(d) )
+            v->arch.guest_table_user = v->arch.guest_table;
+        pfn = alloc_spfn;
+    }
 
     l4tab += l4_table_offset(v_start);
-    pfn = alloc_spfn;
     for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
     {
+        /* initrd chunk's mfns are separate, so we need to adjust for them */
+        signed long pvh_adj = is_pvh_domain(d) ?
+                              (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;
+
         if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
         {
             maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
@@ -657,16 +742,17 @@ int __init construct_dom0(
                     clear_page(l3tab);
                     if ( count == 0 )
                         l3tab += l3_table_offset(v_start);
-                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                    *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
                     l4tab++;
                 }
-                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+                *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
                 l3tab++;
             }
-            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+            *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
             l2tab++;
         }
-        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+        if ( is_pvh_domain(d) ||
+             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
             mfn = pfn++;
         else
             mfn = initrd_mfn++;
@@ -674,6 +760,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -702,6 +791,9 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
+    if  ( is_pvh_domain(d) )
+        goto pvh_skip_pt_rdonly;
+
     /* Pages that are part of page tables must be read only. */
     l4tab = l4start + l4_table_offset(vpt_start);
     l3start = l3tab = l4e_to_l3e(*l4tab);
@@ -741,6 +833,8 @@ int __init construct_dom0(
         }
     }
 
+pvh_skip_pt_rdonly:
+
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
@@ -754,6 +848,11 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
+                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
+    }
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -764,35 +863,16 @@ int __init construct_dom0(
     write_ptbase(v);
     mapcache_override_current(v);
 
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest = (void*)vkern_start;
-    rc = elf_load_binary(&elf, 0);
-    if ( rc < 0 )
-    {
-        printk("Failed to load the kernel binary\n");
-        return rc;
-    }
-    bootstrap_map(NULL);
-
-    if ( UNSET_ADDR != parms.virt_hypercall )
-    {
-        if ( (parms.virt_hypercall < v_start) ||
-             (parms.virt_hypercall >= v_end) )
-        {
-            mapcache_override_current(NULL);
-            write_ptbase(current);
-            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
-            return -1;
+    /* Set up start info area. */
+    if ( is_pvh_domain(d) ) {
+        if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {
+            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
+            return -ENOMEM;
         }
-        hypercall_page_initialise(
-            d, (void *)(unsigned long)parms.virt_hypercall);
-    }
-
-    /* Free temporary buffers. */
-    discard_initial_images();
+        si = (start_info_t *)si_buf;
+    } else
+        si = (start_info_t *)vstartinfo_start;
 
-    /* Set up start info area. */
-    si = (start_info_t *)vstartinfo_start;
     clear_page(si);
     si->nr_pages = nr_pages;
 
@@ -814,7 +894,7 @@ int __init construct_dom0(
     l2tab = NULL;
     l1tab = NULL;
     /* Set up the phys->machine table if not part of the initial mapping. */
-    if ( parms.p2m_base != UNSET_ADDR )
+    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )
     {
         unsigned long va = vphysmap_start;
 
@@ -935,6 +1015,9 @@ int __init construct_dom0(
         unmap_domain_page(l3tab);
     unmap_domain_page(l4start);
 
+    if (is_pvh_domain(d) )
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -951,11 +1034,8 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -971,8 +1051,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -992,11 +1072,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1004,6 +1080,47 @@ int __init construct_dom0(
         }
     }
 
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest = (void*)vkern_start;
+    rc = elf_load_binary(&elf, is_pvh_domain(d) );
+    if ( rc < 0 )
+    {
+        printk("Failed to load the kernel binary\n");
+        return rc;
+    }
+    bootstrap_map(NULL);
+
+    if ( UNSET_ADDR != parms.virt_hypercall )
+    {
+        void *addr;
+
+        if ( is_pvh_domain(d) ) {
+            if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
+                printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
+                return -ENOMEM;
+            }
+            addr = tmp_buf;
+        } else 
+            addr = (void *)parms.virt_hypercall;
+
+        if ( (parms.virt_hypercall < v_start) ||
+             (parms.virt_hypercall >= v_end) )
+        {
+            write_ptbase(current);
+            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+            return -1;
+        }
+        hypercall_page_initialise(d, addr);
+
+        if ( is_pvh_domain(d) ) {
+            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
+            xfree(tmp_buf);
+        }
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
     if ( initrd_len != 0 )
     {
         si->mod_start = vinitrd_start ?: initrd_pfn;
@@ -1019,6 +1136,15 @@ int __init construct_dom0(
         si->console.dom0.info_off  = sizeof(struct start_info);
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
+    if ( is_pvh_domain(d) ) {
+        unsigned long mfn = virt_to_mfn(d->shared_info);
+        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
+        si->shared_info = shared_info_pfn_addr;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
+        xfree(si_buf);
+    }
 
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
@@ -1050,12 +1176,16 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | 0x2;
 
-    if ( opt_dom0_shadow )
+    if ( opt_dom0_shadow ) {
+        if ( is_pvh_domain(d) ) {
+            printk("Invalid option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
-
+    }
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
@@ -1132,6 +1262,9 @@ int __init construct_dom0(
 
     BUG_ON(rc != 0);
 
+    if ( is_pvh_domain(d) )
+        pvh_map_all_iomem(d);
+
     iommu_dom0_init(dom0);
 
     return 0;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 055833d..d3d5697 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
+/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
@@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
 const struct paging_mode *
 hap_paging_get_mode(struct vcpu *v)
 {
-    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
+        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
         hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
         hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
                                    &hap_paging_protected_mode;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 43301a5..f307f24 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
 static bool_t __initdata disable_smep;
 invbool_param("smep", disable_smep);
 
+/* Boot dom0 in PVH mode */
+static bool_t __initdata opt_dom0pvh;
+boolean_param("dom0pvh", opt_dom0pvh);
+
 /* **** Linux config option: propagated to domain0. */
 /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
 /* "acpi=force":  Override the disable blacklist.                   */
@@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, domcr_flags = 0;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_headroom, *module_map;
@@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
         panic("Could not protect TXT memory regions\n");
 
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
+    domcr_flags |= DOMCRF_s3_integrity;
+    dom0 = domain_create(0, domcr_flags, 0);
     if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
 
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
-- 
1.7.2.3

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
@ 2013-03-18 13:01 ` Jan Beulich
  2013-03-27  0:34   ` Mukesh Rathor
  2013-03-29  0:32   ` Mukesh Rathor
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
  2013-03-19  9:27 ` Jan Beulich
  2 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2013-03-18 13:01 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel

 >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/* 
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void  pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;
> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
> +                BUG_ON(rc);
> +            }
> +            start = end;
> +        }
> +    }

At least E820_UNUSABLE must be excluded here.
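
For illustration, one way that could look - an untested sketch on top of the
quoted loop, not part of the posted patch - is to treat E820_UNUSABLE entries
the same way as RAM, so the unusable range itself is never put into the p2m:

    if (entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
        i == e820.nr_map - 1) {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long end_pfn = PFN_UP(end);

        /* map only the hole below this entry, never the entry itself */
        if (entry->type == E820_RAM || entry->type == E820_UNUSABLE)
            end_pfn = PFN_UP(entry->addr);

        if (start_pfn < end_pfn)
            ...   /* domctl_memory_mapping() as before */
        start = end;
    }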

And as you're mapping the holes only - how do you deal with
the MMIO range past end of RAM? And perhaps even more
important - how do you deal with the split between RAM and
MMIO not being at the end of currently populated RAM, but
at the end of possible hotpluggable regions.

> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);

Same comment as before: This is not acceptable for a submission
of a patch intended to be committed (i.e. non-RFC). You should
have worked out a suitable solution to this before posting.

> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;
> +
> +        if ( is_pvh_domain(d) && 
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }

Adding dead code (after a return).
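
I.e. presumably the PVH feature check wants to be its own block after the one
it is currently buried in, something along the lines of (sketch only):

    if ( is_pvh_domain(d) &&
         !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
    {
        printk("Kernel does not support PVH mode\n");
        return -EINVAL;
    }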

> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Am I understanding right that you're making v->arch.guest_table
store a guest physical address rather than a host physical one?
Can that really be done consistently across of uses of this field?

Jan

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
@ 2013-03-18 20:06 ` Konrad Rzeszutek Wilk
  2013-03-26 23:25   ` Mukesh Rathor
  2013-03-19  9:27 ` Jan Beulich
  2 siblings, 1 reply; 10+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-03-18 20:06 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Xen-devel

On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
>  Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
>  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
>  line, causes dom0 to boot in PVH mode.
> 
> Change in V2:
>   - Map the entire IO region upfront in the P2M for PVH dom0.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> ---
>  xen/arch/x86/domain_build.c |  241 +++++++++++++++++++++++++++++++++----------
>  xen/arch/x86/mm/hap/hap.c   |   17 +++-
>  xen/arch/x86/setup.c        |   10 ++-
>  xen/include/asm-x86/hap.h   |    1 +
>  4 files changed, 212 insertions(+), 57 deletions(-)
> 
> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> index 8c5b27a..72aa70b 100644
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -35,6 +35,8 @@
>  #include <asm/setup.h>
>  #include <asm/bzimage.h> /* for bzimage_parse */
>  #include <asm/io_apic.h>
> +#include <asm/hap.h>
> +#include <asm/debugger.h>
>  
>  #include <public/version.h>
>  
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/* 
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void  pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;

unsigned int for 'nump'.

Could it be called 'pfns' ?

> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);

Probably want a comment for the '1' by saying: /* adding */.

> +                BUG_ON(rc);

It would help to have a bit more detail.
Perhaps
		if (rc) {
			printk(XENLOG_ERR "1-1 mapping on %lx -> %lx for %ld failed with %d!\n",
				start_pfn, end_pfn, pfns, rc);
			BUG_ON(rc);
		}
> +            }
> +            start = end;
> +        }
> +    }
> +}
> +
> +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
> +                                   unsigned long mfn, unsigned long vphysmap_s)
> +{
> +    if ( is_pvh_domain(d) ) {
> +        int rc = guest_physmap_add_page(d, pfn, mfn, 0);

A comment about the 0 would be good. And you can collapse this into:

	BUG_ON( guest_physmap_add_page(d, pfn, mfn, 0 /* remove */) );

> +        BUG_ON(rc);
> +        return;
> +    }
> +    if ( !is_pv_32on64_domain(d) )
> +        ((unsigned long *)vphysmap_s)[pfn] = mfn;
> +    else
> +        ((unsigned int *)vphysmap_s)[pfn] = mfn;
> +
> +    set_gpfn_from_mfn(mfn, pfn);
> +}
> +
> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
> +    if (rem) {
> +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
> +        BUG();

Wait a minute? This is debug code but you use it for copying?! Why not
__copy_to_user?

> +    }
> +}
> +
>  int __init construct_dom0(
>      struct domain *d,
>      const module_t *image, unsigned long image_headroom,
> @@ -314,6 +375,7 @@ int __init construct_dom0(
>      void *(*bootstrap_map)(const module_t *),
>      char *cmdline)
>  {
> +    char *si_buf=NULL, *tmp_buf=NULL;
>      int i, cpu, rc, compatible, compat32, order, machine;
>      struct cpu_user_regs *regs;
>      unsigned long pfn, mfn;
> @@ -322,7 +384,7 @@ int __init construct_dom0(
>      unsigned long alloc_spfn;
>      unsigned long alloc_epfn;
>      unsigned long initrd_pfn = -1, initrd_mfn = 0;
> -    unsigned long count;
> +    unsigned long count, shared_info_pfn_addr = 0;
>      struct page_info *page = NULL;
>      start_info_t *si;
>      struct vcpu *v = d->vcpu[0];
> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;

You are adding it right after the return - so it will never be reached.

> +
> +        if ( is_pvh_domain(d) && 
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }
>      }
>  
>      if ( compat32 )
> @@ -480,6 +549,12 @@ int __init construct_dom0(


So .. what about this (the code right above the vstartinfo_end assignment):

 474     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
 475                                                      sizeof(unsigned long) :
 476                                                      sizeof(unsigned int)));

That is the code that figures out how big the P2M array should be.
Shouldn't that be just
	vphysmap_end = vphysmap_start?

Or are you doing some cunning work and setting the XEN_ELFNOTE_INIT_P2M to
point to some wacky address, and later on in this code checking for the
base (params.p2m_base) and, if so, not creating the necessary space for the P2M array?

>      vstartinfo_end   = (vstartinfo_start +
>                          sizeof(struct start_info) +
>                          sizeof(struct dom0_vga_console_info));
> +
> +    if ( is_pvh_domain(d) ) {
> +        shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;

Huh. I don't think that is the PFN. Rather it is the physical address
offset. Perhaps it should be called 'shared_info_gaddr' ? Looking at the code
above, this means it is past the kernel, past the ramdisk, past the P2M array,
and past the 'struct start_info' and 'dom0_vga_console_info'.

Which means that include/public/xen.h also needs this:

 *  1. The domain is started within contiguous virtual-memory region.
 *  2. The contiguous region ends on an aligned 4MB boundary.
 *  3. This the order of bootstrap elements in the initial virtual region:
 *      a. relocated kernel image
 *      b. initial ram disk              [mod_start, mod_len]
 *      c. list of allocated page frames [mfn_list, nr_pages]
 *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
 *      d. start_info_t structure        [register ESI (x86)]

==>     d1. And struct shared_info_t structure	[shared_info]
	    if autotranslated guest.

 *      e. bootstrap page tables         [pt_base and CR3 (x86)]
 *      f. bootstrap stack               [register ESP (x86)]


Which looks to match what the toolstack does.


> +        vstartinfo_end   += PAGE_SIZE;
> +    }
> +
>      vpt_start        = round_pgup(vstartinfo_end);
>      for ( nr_pt_pages = 2; ; nr_pt_pages++ )
>      {
> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Couldn't we use mpt_alloc? It should point to the same location?
So with this code we still allocated one page for the L4:

 616         page = alloc_domheap_page(NULL, 0);

Do you think it makes sense to free it? You are not
using it - you are using the mpt_alloc as your L4. And you did
not clean it either. Perhaps also have

	clear_page(mpt_alloc);


> +        pfn = 0;

OK, so instead of starting at alloc_spfn, which is the starting MFN
of the region that covers the kernel, initrd, P2M array and the three
PAGE_SIZE structs, you start at zero.

I think this warrants a comment. I believe this is b/c it does not
matter to Xen - you are just creating a nice initial page-tables
that the guest can use and the MFNs in it - have to be PFNs as
it is running in auto-translated mode right?

In which case you could also give this extra comment:

	/* Since the guest is going to run in auto-translated
	 * mode and none of the paging is done via PV MMU calls
	 * we create the page-tables from GPFN=0 up to bootstrap.
	 * TODO: Instead of using 4KB L4 we could use 2MB or so
	 * similar to how the P2M array can if parms.p2m_base is set
	 */

which begs a question. You are doing it from PFN 0 up to d->tot_pages
and also include in this region the P2M array. Why?

> +    } else { 
> +        clear_page(l4tab);
> +        init_guest_l4_table(l4tab, d);
> +        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> +        if ( is_pv_32on64_domain(d) )
> +            v->arch.guest_table_user = v->arch.guest_table;
> +        pfn = alloc_spfn;
> +    }
>  
>      l4tab += l4_table_offset(v_start);
> -    pfn = alloc_spfn;
>      for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
>      {
> +        /* initrd chunk's mfns are separate, so we need to adjust for them */

Perhaps you can say:

	/* initrd chunk (see the 'alloc_domheap_pages' calls above) is allocated
	   from a separate MFN chunk. Hence we need to adjust for that. */


> +        signed long pvh_adj = is_pvh_domain(d) ?

Just 'long' and perhaps 'pvh_offset'? Or
'initrd_offset'?

> +                              (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;

Oh gosh, what a mix. So alloc_spfn is actually the MFN of the memory allocated
for the kernel. The PFN_UP(initrd_len) gives the length of the initrd in pfns.
And you are using 'GPFN's as 'pfn' here. And the code 'if (count < initrd_pfn)'
is short-circuited - so we will only put in 'pfn 0->d_tot_pages'.

So a 50MB initrd is 12800 pfns. Wait. Why are you subtracting that value from the
MFN of the memory that will be used for the kernel (say it is at 0x100 MFN for fun)?

I am not sure I understand that. Could you explain it to me please?

> +
>          if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
>          {
>              maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
> @@ -657,16 +742,17 @@ int __init construct_dom0(
>                      clear_page(l3tab);
>                      if ( count == 0 )
>                          l3tab += l3_table_offset(v_start);
> -                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
> +                    *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
>                      l4tab++;
>                  }
> -                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
> +                *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
>                  l3tab++;
>              }
> -            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
> +            *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
>              l2tab++;
>          }
> -        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
> +        if ( is_pvh_domain(d) ||
> +             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
>              mfn = pfn++;
>          else
>              mfn = initrd_mfn++;
> @@ -674,6 +760,9 @@ int __init construct_dom0(
>                                      L1_PROT : COMPAT_L1_PROT));
>          l1tab++;
>  
> +        if ( is_pvh_domain(d) )
> +            continue;
> +
>          page = mfn_to_page(mfn);
>          if ( (page->u.inuse.type_info == 0) &&
>               !get_page_and_type(page, d, PGT_writable_page) )
> @@ -702,6 +791,9 @@ int __init construct_dom0(
>                 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
>      }
>  
> +    if  ( is_pvh_domain(d) )
> +        goto pvh_skip_pt_rdonly;
> +
>      /* Pages that are part of page tables must be read only. */
>      l4tab = l4start + l4_table_offset(vpt_start);
>      l3start = l3tab = l4e_to_l3e(*l4tab);
> @@ -741,6 +833,8 @@ int __init construct_dom0(
>          }
>      }
>  
> +pvh_skip_pt_rdonly:
> +

Hm, so the pagetable creation for the initial page-tables virtual addresses
has been eliminated. That means the initial calculation of 'order'
(alloc_domheap_pages) and the big loop of:

484     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
485     {

calculates the wrong amount right?

This needs a bit of mulling over. Since most of the Xen PV MMU code is disabled
that means (from a Linux kernel perspective), that we do bootup with

	init_level4_pgt[272]
		-> level3_ident_pgt[0]
			level2_ident_pgt [0-511]
				[copy from the L2 that we created above.]
	init_level4_pgt[511]
		-> level3_kernel_pgt [510]
			-> level2_kernel_pgt [0-511]
				[copy from the L2 that we created above.]
		

Since this is a whole-sale copy and only created page-tables from
[0 -> end of 'shared info' structure] and did create any page-table entries
for the L4, L3, L2, L1, L1, L1.... page-tables.

So any __va or __ka will work. And cleanup_highmap takes care of ripping out
any entries from _brk_limit up to max_pfn_mapped (end of ramdisk).

OK, so far so good. Wait, no, it is not good - max_pfn_mapped is up to:
xen_start_info->mfn_list. That is the ramdisk.

But that means that __ka for the shared_struct and dom0_vga_console and
start_info are <p2m array + xen_start_info->mfn_list> in. And cleanup_highmap
would rip that out. Ah, we do have a bunch of revectoring:

	__va(xen_start_info->shared_info);

and
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));

in the initial boot paths. So that is OK.


OK. The xen_pagetable_init uses level2_kernel_pgt (so in Linux __ka space)
to walk and clear up the PMD entries.

And since we do not use the __ka to update the pagetables, but switch
over to using __va as fast as possible this does look to work.


But this needs to be documented. And we probably should not allocate
that extra space for L1 entries that we are never going to use.


>      /* Mask all upcalls... */
>      for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
>          shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
> @@ -754,6 +848,11 @@ int __init construct_dom0(
>          (void)alloc_vcpu(d, i, cpu);
>      }
>  
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
> +                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
> +    }
>      /* Set up CR3 value for write_ptbase */
>      if ( paging_mode_enabled(d) )
>          paging_update_paging_modes(v);
> @@ -764,35 +863,16 @@ int __init construct_dom0(
>      write_ptbase(v);
>      mapcache_override_current(v);
>  
> -    /* Copy the OS image and free temporary buffer. */
> -    elf.dest = (void*)vkern_start;
> -    rc = elf_load_binary(&elf, 0);
> -    if ( rc < 0 )
> -    {
> -        printk("Failed to load the kernel binary\n");
> -        return rc;
> -    }
> -    bootstrap_map(NULL);
> -
> -    if ( UNSET_ADDR != parms.virt_hypercall )
> -    {
> -        if ( (parms.virt_hypercall < v_start) ||
> -             (parms.virt_hypercall >= v_end) )
> -        {
> -            mapcache_override_current(NULL);
> -            write_ptbase(current);
> -            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> -            return -1;

Wait, you just ripped out this code? Why?!

> +    /* Set up start info area. */
> +    if ( is_pvh_domain(d) ) {
> +        if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {

Odd style.
> +            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
> +            return -ENOMEM;
>          }
> -        hypercall_page_initialise(
> -            d, (void *)(unsigned long)parms.virt_hypercall);
> -    }
> -
> -    /* Free temporary buffers. */
> -    discard_initial_images();
> +        si = (start_info_t *)si_buf;
> +    } else
> +        si = (start_info_t *)vstartinfo_start;

Why do we allocate a page for this? Isn't the allocation for how
many pages are needed for this space already taken care of in:

479     vstartinfo_start = round_pgup(vphysmap_end);
 480     vstartinfo_end   = (vstartinfo_start +
 481                         sizeof(struct start_info) +
 482                         sizeof(struct dom0_vga_console_info));
 483     vpt_start        = round_pgup(vstartinfo_end);

and then

 517     page = alloc_domheap_pages(d, order, 0);

allocates the appropriate amount?


>  
> -    /* Set up start info area. */
> -    si = (start_info_t *)vstartinfo_start;
>      clear_page(si);
>      si->nr_pages = nr_pages;
>  
> @@ -814,7 +894,7 @@ int __init construct_dom0(
>      l2tab = NULL;
>      l1tab = NULL;
>      /* Set up the phys->machine table if not part of the initial mapping. */
> -    if ( parms.p2m_base != UNSET_ADDR )
> +    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )

AHA! I knew it! You could also do something like this at the start of the code:

	params.p2m_base = 0xdeadbeef;

Or something better? 0xdeadf00d ?

And then in here check if we use 0xdeadbeef and we skip over setting
up the page-table entries for the P2M?
>      {
>          unsigned long va = vphysmap_start;
>  
> @@ -935,6 +1015,9 @@ int __init construct_dom0(
>          unmap_domain_page(l3tab);
>      unmap_domain_page(l4start);
>  
> +    if (is_pvh_domain(d) )
> +        hap_set_pvh_alloc_for_dom0(d, nr_pages);
> +
>      /* Write the phys->machine and machine->phys table entries. */
>      for ( pfn = 0; pfn < count; pfn++ )
>      {
> @@ -951,11 +1034,8 @@ int __init construct_dom0(
>          if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
>              mfn = alloc_epfn - (pfn - REVERSE_START);
>  #endif
> -        if ( !is_pv_32on64_domain(d) )
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -        else
> -            ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -        set_gpfn_from_mfn(mfn, pfn);
> +        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
> +
>          if (!(pfn & 0xfffff))
>              process_pending_softirqs();
>      }
> @@ -971,8 +1051,8 @@ int __init construct_dom0(
>              if ( !page->u.inuse.type_info &&
>                   !get_page_and_type(page, d, PGT_writable_page) )
>                  BUG();
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>              ++pfn;
>              if (!(pfn & 0xfffff))
>                  process_pending_softirqs();
> @@ -992,11 +1072,7 @@ int __init construct_dom0(
>  #ifndef NDEBUG
>  #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
>  #endif
> -            if ( !is_pv_32on64_domain(d) )
> -                ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            else
> -                ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>  #undef pfn
>              page++; pfn++;
>              if (!(pfn & 0xfffff))
> @@ -1004,6 +1080,47 @@ int __init construct_dom0(
>          }
>      }
>  
> +    /* Copy the OS image and free temporary buffer. */
> +    elf.dest = (void*)vkern_start;
> +    rc = elf_load_binary(&elf, is_pvh_domain(d) );
> +    if ( rc < 0 )
> +    {
> +        printk("Failed to load the kernel binary\n");
> +        return rc;
> +    }
> +    bootstrap_map(NULL);
> +
> +    if ( UNSET_ADDR != parms.virt_hypercall )
> +    {
> +        void *addr;
> +
> +        if ( is_pvh_domain(d) ) {
> +            if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
> +                printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
> +                return -ENOMEM;
> +            }
> +            addr = tmp_buf;
> +        } else 
> +            addr = (void *)parms.virt_hypercall;
> +
> +        if ( (parms.virt_hypercall < v_start) ||
> +             (parms.virt_hypercall >= v_end) )
> +        {
> +            write_ptbase(current);
> +            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> +            return -1;
> +        }
> +        hypercall_page_initialise(d, addr);
> +
> +        if ( is_pvh_domain(d) ) {
> +            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
> +            xfree(tmp_buf);
> +        }
> +    }

Wait? Why the move of the code here?

> +
> +    /* Free temporary buffers. */
> +    discard_initial_images();
> +
>      if ( initrd_len != 0 )
>      {
>          si->mod_start = vinitrd_start ?: initrd_pfn;
> @@ -1019,6 +1136,15 @@ int __init construct_dom0(
>          si->console.dom0.info_off  = sizeof(struct start_info);
>          si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
>      }
> +    if ( is_pvh_domain(d) ) {
> +        unsigned long mfn = virt_to_mfn(d->shared_info);
> +        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
> +        si->shared_info = shared_info_pfn_addr;

So that is not a PFN. That is the guest physical address, I reckon?

Ah, which matches (kind of) the comment:

725     unsigned long shared_info;  /* MACHINE address of shared info struct. */


> +        dom0_update_physmap(d, pfn, mfn, 0);

, 0 /* ignored. */

> +
> +        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
> +        xfree(si_buf);

Oh. Why the copy? Why not just operate directly on the vstartinfo_start virtual
address?


> +    }
>  
>      if ( is_pv_32on64_domain(d) )
>          xlat_start_info(si, XLAT_start_info_console_dom0);
> @@ -1050,12 +1176,16 @@ int __init construct_dom0(
>      regs->eip = parms.virt_entry;
>      regs->esp = vstack_end;
>      regs->esi = vstartinfo_start;
> -    regs->eflags = X86_EFLAGS_IF;
> +    regs->eflags = X86_EFLAGS_IF | 0x2;

Ahem!
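
(That 0x2 is presumably bit 1 of EFLAGS, the reserved bit that must always be
set; if so, using the existing X86_EFLAGS_MBS define would be clearer than a
bare 0x2.)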

>  
> -    if ( opt_dom0_shadow )
> +    if ( opt_dom0_shadow ) {
> +        if ( is_pvh_domain(d) ) {
> +            printk("Invalid option dom0_shadow for PVH\n");
> +            return -EINVAL;
> +        }
>          if ( paging_enable(d, PG_SH_enable) == 0 ) 
>              paging_update_paging_modes(v);
> -
> +    }
>      if ( supervisor_mode_kernel )
>      {
>          v->arch.pv_vcpu.kernel_ss &= ~3;
> @@ -1132,6 +1262,9 @@ int __init construct_dom0(
>  
>      BUG_ON(rc != 0);
>  
> +    if ( is_pvh_domain(d) )
> +        pvh_map_all_iomem(d);
> +
>      iommu_dom0_init(dom0);
>  
>      return 0;
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 055833d..d3d5697 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
>      }
>  }
>  
> +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
> +{
> +    int rc;
> +    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
> +
> +    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));

Could you include an explanation behind this formula please?

> +    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);

num_pages = memkb >> 2 ?
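
(If memory serves, the comment over libxl's libxl__get_required_shadow_memory()
explains the first line: 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM
for the P2M map, plus 1 page per MiB of RAM to shadow resident processes, with
the "4 *" converting pages to KB - worth restating here. As for the second
line: with 4K pages, 20 - PAGE_SHIFT is 8, so it rounds memkb up to whole MiB
and converts at 256 pages per MiB, i.e. effectively memkb >> 2 apart from the
rounding.)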

> +    paging_lock(d);
> +    rc = hap_set_allocation(d, num_pages, NULL);
> +    paging_unlock(d);
> +    BUG_ON(rc);
> +}
> +
>  static const struct paging_mode hap_paging_real_mode;
>  static const struct paging_mode hap_paging_protected_mode;
>  static const struct paging_mode hap_paging_pae_mode;
> @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
>  const struct paging_mode *
>  hap_paging_get_mode(struct vcpu *v)
>  {
> -    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
> +    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
> +        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
>          hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
>          hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
>                                     &hap_paging_protected_mode;
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 43301a5..f307f24 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
>  static bool_t __initdata disable_smep;
>  invbool_param("smep", disable_smep);
>  
> +/* Boot dom0 in PVH mode */
> +static bool_t __initdata opt_dom0pvh;
> +boolean_param("dom0pvh", opt_dom0pvh);
> +
>  /* **** Linux config option: propagated to domain0. */
>  /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
>  /* "acpi=force":  Override the disable blacklist.                   */
> @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
>  {
>      char *memmap_type = NULL;
>      char *cmdline, *kextra, *loader;
> -    unsigned int initrdidx;
> +    unsigned int initrdidx, domcr_flags = 0;
>      multiboot_info_t *mbi = __va(mbi_p);
>      module_t *mod = (module_t *)__va(mbi->mods_addr);
>      unsigned long nr_pages, modules_headroom, *module_map;
> @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
>          panic("Could not protect TXT memory regions\n");
>  
>      /* Create initial domain 0. */
> -    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
> +    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
> +    domcr_flags |= DOMCRF_s3_integrity;
> +    dom0 = domain_create(0, domcr_flags, 0);
>      if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
>          panic("Error creating domain 0\n");
>  
> diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> index e03f983..aab8558 100644
> --- a/xen/include/asm-x86/hap.h
> +++ b/xen/include/asm-x86/hap.h
> @@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
>                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
>  
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
>  
>  #endif /* XEN_HAP_H */
>  
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-16  1:06 [PATCH 17/18 V2]: PVH xen: PVH dom0 creation Mukesh Rathor
  2013-03-18 13:01 ` Jan Beulich
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
@ 2013-03-19  9:27 ` Jan Beulich
  2013-03-19 13:32   ` Konrad Rzeszutek Wilk
  2013-03-26 23:42   ` Mukesh Rathor
  2 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2013-03-19  9:27 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: Konrad Rzeszutek Wilk, xen-devel

>>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
>  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
>  line, causes dom0 to boot in PVH mode.

Now that Konrad mentioned that PVH is intended for 64-bit guests
only, I fail to see where this restriction is being enforced in this
patch. And it would certainly have been worthwhile to state that
more prominently, even more so since Konrad keeps telling people
that this patch set is expected to initiate a deprecation phase for
the PV MMU interfaces in Linux (which then would mean no 32-bit
PV guest support at all anymore).

Jan

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-19  9:27 ` Jan Beulich
@ 2013-03-19 13:32   ` Konrad Rzeszutek Wilk
  2013-03-26 23:42   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-03-19 13:32 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Tue, Mar 19, 2013 at 09:27:34AM +0000, Jan Beulich wrote:
> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
> >  PVH mode. Introduce, opt_dom0pvh, which when specified in the command
> >  line, causes dom0 to boot in PVH mode.
> 
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that
> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

It was not in my mind for the first stage of this work, though I
have to admit that the 32-bit part completely vanished from my mind.

You are right - we then need 32-bit support, otherwise we are screwed during
the X years in which we initiate the deprecation phase of the upstream
PV MMU interface.

I was not expecting this set of patches to be the 'ok, the deprecation
timer starts ticking once the patch goes into Xen' moment - rather, that
should happen when folks agree that it is the right time.

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 20:06 ` Konrad Rzeszutek Wilk
@ 2013-03-26 23:25   ` Mukesh Rathor
  0 siblings, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-26 23:25 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Xen-devel

On Mon, 18 Mar 2013 16:06:02 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs
> > curr 
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't
> > use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src,
> > bytes, 0, 1, 0);
> > +    if (rem) {
> > +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n",
> > bytes, rem);
> > +        BUG();
> 
> Wait a minute? This is debug code but you use it for copying?! Why not
> __copy_to_user?

No, it's not debug code; I wrote the function to work for all
guests under all conditions, but it was only used by the debugger, so I
called it dbg_*.

Looking at alternatives.
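
One possible direction (an untested sketch with a hypothetical helper name,
not something from the tree): resolve the destination gfn by gfn and copy
through a transient mapping, so nothing relies on 'current' being an HVM/PVH
vcpu:

    /* Hypothetical helper, sketched for discussion only. */
    static int __init copy_to_pvh_guest(struct domain *d, unsigned long gpa,
                                        const char *src, unsigned int len)
    {
        while ( len )
        {
            unsigned long gfn = gpa >> PAGE_SHIFT;
            unsigned int off = gpa & ~PAGE_MASK;
            unsigned int chunk = min(len, (unsigned int)(PAGE_SIZE - off));
            struct page_info *pg = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
            char *va;

            if ( !pg )
                return -EINVAL;
            va = __map_domain_page(pg);
            memcpy(va + off, src, chunk);
            unmap_domain_page(va);
            put_page(pg);

            gpa += chunk; src += chunk; len -= chunk;
        }
        return 0;
    }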

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-19  9:27 ` Jan Beulich
  2013-03-19 13:32   ` Konrad Rzeszutek Wilk
@ 2013-03-26 23:42   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-26 23:42 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Konrad Rzeszutek Wilk, xen-devel

On Tue, 19 Mar 2013 09:27:34 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > Finally, the hardest. Mostly modify construct_dom0() to boot PV
> > dom0 in PVH mode. Introduce, opt_dom0pvh, which when specified in
> > the command line, causes dom0 to boot in PVH mode.
> 
> Now that Konrad mentioned that PVH is intended for 64-bit guests
> only, I fail to see where this restriction is being enforced in this
> patch. And it would certainly have been worthwhile to state that

Yup. I forgot to mention that in the cover letter, and forgot to add
a check for that. Will be in the next patch for sure. Right now, there
is no code in linux for 32bit PVH btw.

> more prominently, even more so since Konrad keeps telling people
> that this patch set is expected to initiate a deprecation phase for
> the PV MMU interfaces in Linux (which then would mean no 32-bit
> PV guest support at all anymore).

Well, we definitely intend to head that way. And it is most feasible
to do this big work in steps. We are by no means done when this phase I 
patch is checked in. I've already outlined phases II, III, etc. Having
some baseline working in xen will allow me to progress faster, and allow
others to contribute.

I think I should leave PVH disabled until we have reached a satisfactory
point, and we can enable it in future.

thanks
Mukesh

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 13:01 ` Jan Beulich
@ 2013-03-27  0:34   ` Mukesh Rathor
  2013-03-29  0:32   ` Mukesh Rathor
  1 sibling, 0 replies; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-27  0:34 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>  >>> wrote:
> > +static __init void copy_pvh(char *dest, char *src, int bytes)
> > +{
> > +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs
> > curr 
> > +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't
> > use that.
> > +     * So we just use dbg_rw_mem().
> > +     */
> > +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src,
> > bytes, 0, 1, 0);
> 
> Same comment as before: This is not acceptable for a submission
> of a patch intended to be committed (i.e. non-RFC). You should
> have worked out a suitable solution to this before posting.

Well, the prev patch was RFC and I didn't hear anything on this, so I
(mis)understood that this was acceptable. OK, I will just implement
copy_pvh() here.

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-18 13:01 ` Jan Beulich
  2013-03-27  0:34   ` Mukesh Rathor
@ 2013-03-29  0:32   ` Mukesh Rathor
  2013-04-02  7:03     ` Jan Beulich
  1 sibling, 1 reply; 10+ messages in thread
From: Mukesh Rathor @ 2013-03-29  0:32 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

On Mon, 18 Mar 2013 13:01:23 +0000
"Jan Beulich" <JBeulich@suse.com> wrote:

>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>  >>> wrote:
> > @@ -307,6 +309,65 @@ static void __init
> > process_dom0_ioports_disable(void) }
> >  }
> >  
> > +/* 
> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0
> > will have
> > + * the entire io region mapped in the EPT/NPT.
> > + */
> > +static __init void  pvh_map_all_iomem(struct domain *d)
> > +{
> > +    unsigned long start = 0;
> > +    const struct e820entry *entry;
> > +    int rc, i, nump;
> > +
> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> > +        unsigned long end = entry->addr + entry->size;
> > +
> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> > +            unsigned long start_pfn = PFN_DOWN(start);
> > +            unsigned long end_pfn = PFN_UP(end);
> > +
> > +            if (entry->type == E820_RAM)
> > +                end_pfn = PFN_UP(entry->addr);
> > +
> > +            if (start_pfn < end_pfn) {
> > +                nump = end_pfn - start_pfn + 1;
> > +                rc = domctl_memory_mapping(d, start_pfn,
> > start_pfn, nump, 1);
> > +                BUG_ON(rc);
> > +            }
> > +            start = end;
> > +        }
> > +    }
> 
> At least E820_UNUSABLE must be excluded here.
> 
> And as you're mapping the holes only - how do you deal with
> the MMIO range past end of RAM? And perhaps even more
> important - how do you deal with the split between RAM and
> MMIO not being at the end of currently populated RAM, but
> at the end of possible hotpluggable regions.

Right now, in phase I, there is no support for hotplug. Are there any other
cases of this? If yes, can you please give an example?

* Re: [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...
  2013-03-29  0:32   ` Mukesh Rathor
@ 2013-04-02  7:03     ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2013-04-02  7:03 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel

>>> On 29.03.13 at 01:32, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 18 Mar 2013 13:01:23 +0000
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>>  >>> On 16.03.13 at 02:06, Mukesh Rathor <mukesh.rathor@oracle.com>
>>  >>> wrote:
>> > @@ -307,6 +309,65 @@ static void __init
>> > process_dom0_ioports_disable(void) }
>> >  }
>> >  
>> > +/* 
>> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0
>> > will have
>> > + * the entire io region mapped in the EPT/NPT.
>> > + */
>> > +static __init void  pvh_map_all_iomem(struct domain *d)
>> > +{
>> > +    unsigned long start = 0;
>> > +    const struct e820entry *entry;
>> > +    int rc, i, nump;
>> > +
>> > +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
>> > +        unsigned long end = entry->addr + entry->size;
>> > +
>> > +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
>> > +            unsigned long start_pfn = PFN_DOWN(start);
>> > +            unsigned long end_pfn = PFN_UP(end);
>> > +
>> > +            if (entry->type == E820_RAM)
>> > +                end_pfn = PFN_UP(entry->addr);
>> > +
>> > +            if (start_pfn < end_pfn) {
>> > +                nump = end_pfn - start_pfn + 1;
>> > +                rc = domctl_memory_mapping(d, start_pfn,
>> > start_pfn, nump, 1);
>> > +                BUG_ON(rc);
>> > +            }
>> > +            start = end;
>> > +        }
>> > +    }
>> 
>> At least E820_UNUSABLE must be excluded here.
>> 
>> And as you're mapping the holes only - how do you deal with
>> the MMIO range past end of RAM? And perhaps even more
>> important - how do you deal with the split between RAM and
>> MMIO not being at the end of currently populated RAM, but
>> at the end of possible hotpluggable regions.
> 
> Right now, phase I, no support for hotplug. Are there any other cases
> of this, can you please give an example if yes?

On a system with just 2Gb (and no hotplug) the code as I read it
stops mapping at the 2Gb boundary, i.e. all MMIO regions (up to
4Gb as well as beyond) would remain unmapped.

Jan
