All of lore.kernel.org
 help / color / mirror / Atom feed
* [V10 PATCH 0/4] pvh dom0 patches...
@ 2014-04-30  1:06 Mukesh Rathor
  2014-04-30  1:06 ` [V10 PATCH 1/4] pvh dom0: construct_dom0 changes Mukesh Rathor
                   ` (4 more replies)
  0 siblings, 5 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30  1:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

Hi,

Inching closer to the finish line, please find v10 of the pvh dom0
patches based on c/s: 8cfc8e5.

git tree: git://oss.oracle.com/git/mrathor/xen.git  branch: dom0pvh-v10

Changed in v10:
  patch 1: trivial merge

  patch 2: redo it.

  patch 3: - redo atomic_write_ept_entry adding level parameter
           - drop p2m_teardown foreign pages cleanup
           - couple style related changes
           - valid p2mt for foreign types in p2m_add_foreign are changed

  patch 4: - add change to xen-command-line.markdown
           - domcr_flags redone.

thanks,
Mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [V10 PATCH 1/4] pvh dom0: construct_dom0 changes
  2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
@ 2014-04-30  1:06 ` Mukesh Rathor
  2014-05-06 15:18   ` Roger Pau Monné
  2014-04-30  1:06 ` [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign Mukesh Rathor
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30  1:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

This patch changes construct_dom0() to boot in pvh mode:
  - Make sure dom0 elf supports pvh mode.
  - Call guest_physmap_add_page for pvh rather than simple p2m setting
  - Map all non-RAM regions 1:1 upto the end region in e820 or 4GB which
    ever is higher.
  - Allocate p2m, copying calculation from toolstack.
  - Allocate shared info page from the virtual space so that dom0 PT
    can be updated. Then update p2m for it with the actual mfn.
  - Since we build the page tables for pvh same as for pv, in
    pvh_fixup_page_tables_for_hap we replace the mfns with pfns.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/domain_build.c | 249 +++++++++++++++++++++++++++++++++++++++++---
 xen/arch/x86/mm/hap/hap.c   |  12 +++
 xen/include/asm-x86/hap.h   |   1 +
 3 files changed, 245 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 1eccead..38ed9f6 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,7 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
 
 #include <public/version.h>
 
@@ -307,6 +308,158 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
+                                       unsigned long mfn, unsigned long nr_mfns)
+{
+    unsigned long i;
+    int rc;
+
+    for ( i = 0; i < nr_mfns; i++ )
+        if ( (rc = set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i))) )
+            panic("pvh_add_mem_mapping: gfn:%lx mfn:%lx i:%ld rc:%d\n",
+                  gfn, mfn, i, rc);
+}
+
+/*
+ * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
+ * the entire io region mapped in the EPT/NPT.
+ *
+ * pvh fixme: The following doesn't map MMIO ranges when they sit above the
+ *            highest E820 covered address.
+ */
+static __init void pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start_pfn, end_pfn, end = 0, start = 0;
+    const struct e820entry *entry;
+    unsigned int i, nump;
+
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        end = entry->addr + entry->size;
+
+        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
+             i == e820.nr_map - 1 )
+        {
+            start_pfn = PFN_DOWN(start);
+
+            /* Unused RAM areas are marked UNUSABLE, so skip them too */
+            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
+                end_pfn = PFN_UP(entry->addr);
+            else
+                end_pfn = PFN_UP(end);
+
+            if ( start_pfn < end_pfn )
+            {
+                nump = end_pfn - start_pfn;
+                /* Add pages to the mapping */
+                pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
+            }
+            start = end;
+        }
+    }
+
+    /*
+     * Some BIOSes may not report io space above ram that is less than 4GB. So
+     * we map any non-ram upto 4GB.
+     */
+    if ( end < GB(4) )
+    {
+        start_pfn = PFN_UP(end);
+        end_pfn = (GB(4)) >> PAGE_SHIFT;
+        nump = end_pfn - start_pfn;
+        pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
+    }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                   unsigned long mfn, unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) )
+    {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+/* Replace mfns with pfns in dom0 page tables */
+static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
+                                                 unsigned long v_start,
+                                                 unsigned long v_end)
+{
+    int i, j, k;
+    l4_pgentry_t *pl4e, *l4start;
+    l3_pgentry_t *pl3e;
+    l2_pgentry_t *pl2e;
+    l1_pgentry_t *pl1e;
+    unsigned long cr3_pfn;
+
+    ASSERT(paging_mode_enabled(v->domain));
+
+    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+
+    /* Clear entries prior to guest L4 start */
+    pl4e = l4start + l4_table_offset(v_start);
+    memset(l4start, 0, (unsigned long)pl4e - (unsigned long)l4start);
+
+    for ( ; pl4e <= l4start + l4_table_offset(v_end - 1); pl4e++ )
+    {
+        pl3e = map_l3t_from_l4e(*pl4e);
+        for ( i = 0; i < PAGE_SIZE / sizeof(*pl3e); i++, pl3e++ )
+        {
+            if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+                continue;
+
+            pl2e = map_l2t_from_l3e(*pl3e);
+            for ( j = 0; j < PAGE_SIZE / sizeof(*pl2e); j++, pl2e++ )
+            {
+                if ( !(l2e_get_flags(*pl2e)  & _PAGE_PRESENT) )
+                    continue;
+
+                pl1e = map_l1t_from_l2e(*pl2e);
+                for ( k = 0; k < PAGE_SIZE / sizeof(*pl1e); k++, pl1e++ )
+                {
+                    if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) )
+                        continue;
+
+                    *pl1e = l1e_from_pfn(get_gpfn_from_mfn(l1e_get_pfn(*pl1e)),
+                                         l1e_get_flags(*pl1e));
+                }
+                unmap_domain_page(pl1e);
+                *pl2e = l2e_from_pfn(get_gpfn_from_mfn(l2e_get_pfn(*pl2e)),
+                                     l2e_get_flags(*pl2e));
+            }
+            unmap_domain_page(pl2e);
+            *pl3e = l3e_from_pfn(get_gpfn_from_mfn(l3e_get_pfn(*pl3e)),
+                                 l3e_get_flags(*pl3e));
+        }
+        unmap_domain_page(pl3e);
+        *pl4e = l4e_from_pfn(get_gpfn_from_mfn(l4e_get_pfn(*pl4e)),
+                             l4e_get_flags(*pl4e));
+    }
+
+    /* Clear entries post guest L4. */
+    if ( (unsigned long)pl4e & (PAGE_SIZE - 1) )
+        memset(pl4e, 0, PAGE_SIZE - ((unsigned long)pl4e & (PAGE_SIZE - 1)));
+
+    unmap_domain_page(l4start);
+
+    cr3_pfn = get_gpfn_from_mfn(paddr_to_pfn(v->arch.cr3));
+    v->arch.hvm_vcpu.guest_cr[3] = pfn_to_paddr(cr3_pfn);
+
+    /*
+     * Finally, we update the paging modes (hap_update_paging_modes). This will
+     * create monitor_table for us, update v->arch.cr3, and update vmcs.cr3.
+     */
+    paging_update_paging_modes(v);
+}
+
 static __init void mark_pv_pt_pages_rdonly(struct domain *d,
                                            l4_pgentry_t *l4start,
                                            unsigned long vpt_start,
@@ -516,6 +669,8 @@ int __init construct_dom0(
     l3_pgentry_t *l3tab = NULL, *l3start = NULL;
     l2_pgentry_t *l2tab = NULL, *l2start = NULL;
     l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+    paddr_t shared_info_paddr = 0;
+    u32 save_pvh_pg_mode = 0;
 
     /*
      * This fully describes the memory layout of the initial domain. All 
@@ -593,12 +748,21 @@ int __init construct_dom0(
         goto out;
     }
 
-    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
-         !test_bit(XENFEAT_dom0, parms.f_supported) )
+    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
     {
-        printk("Kernel does not support Dom0 operation\n");
-        rc = -EINVAL;
-        goto out;
+        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
+        {
+            printk("Kernel does not support Dom0 operation\n");
+            rc = -EINVAL;
+            goto out;
+        }
+        if ( is_pvh_domain(d) &&
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
+        {
+            printk("Kernel does not support PVH mode\n");
+            rc = -EINVAL;
+            goto out;
+        }
     }
 
     if ( compat32 )
@@ -663,6 +827,13 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) )
+    {
+        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -903,6 +1074,13 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    /*
+     * pvh: we temporarily disable d->arch.paging.mode so that we can build cr3
+     * needed to run on dom0's page tables.
+     */
+    save_pvh_pg_mode = d->arch.paging.mode;
+    d->arch.paging.mode = 0;
+
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -969,6 +1147,22 @@ int __init construct_dom0(
                          nr_pages);
     }
 
+    if ( is_pvh_domain(d) )
+    {
+        unsigned long hap_pages, memkb = nr_pages * (PAGE_SIZE / 1024);
+
+        /* Copied from: libxl_get_required_shadow_memory() */
+        memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+        hap_pages = ( (memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
+        hap_set_alloc_for_pvh_dom0(d, hap_pages);
+    }
+
+    /*
+     * We enable paging mode again so guest_physmap_add_page will do the
+     * right thing for us.
+     */
+    d->arch.paging.mode = save_pvh_pg_mode;
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -985,11 +1179,7 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -1005,8 +1195,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -1026,11 +1216,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1054,6 +1240,15 @@ int __init construct_dom0(
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
 
+    /*
+     * PVH: We need to update si->shared_info while we are on dom0 page tables,
+     * but need to defer the p2m update until after we have fixed up the
+     * page tables for PVH so that the m2p for the si pte entry returns
+     * correct pfn.
+     */
+    if ( is_pvh_domain(d) )
+        si->shared_info = shared_info_paddr;
+
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
 
@@ -1087,8 +1282,15 @@ int __init construct_dom0(
     regs->eflags = X86_EFLAGS_IF;
 
     if ( opt_dom0_shadow )
+    {
+        if ( is_pvh_domain(d) )
+        {
+            printk("Unsupported option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
+    }
 
     if ( supervisor_mode_kernel )
     {
@@ -1179,6 +1381,19 @@ int __init construct_dom0(
         printk(" Xen warning: dom0 kernel broken ELF: %s\n",
                elf_check_broken(&elf));
 
+    if ( is_pvh_domain(d) )
+    {
+        /* finally, fixup the page table, replacing mfns with pfns */
+        pvh_fixup_page_tables_for_hap(v, v_start, v_end);
+
+        /* the pt has correct pfn for si, now update the mfn in the p2m */
+        mfn = virt_to_mfn(d->shared_info);
+        pfn = shared_info_paddr >> PAGE_SHIFT;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        pvh_map_all_iomem(d);
+    }
+
     if ( d->domain_id == hardware_domid )
         iommu_hwdom_init(d);
 
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 60806bb..8a90cd8 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -589,6 +589,18 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
+void __init hap_set_alloc_for_pvh_dom0(struct domain *d,
+                                       unsigned long hap_pages)
+{
+    int rc;
+
+    paging_lock(d);
+    rc = hap_set_allocation(d, hap_pages, NULL);
+    paging_unlock(d);
+
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..5161927 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_alloc_for_pvh_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign
  2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
  2014-04-30  1:06 ` [V10 PATCH 1/4] pvh dom0: construct_dom0 changes Mukesh Rathor
@ 2014-04-30  1:06 ` Mukesh Rathor
  2014-05-01 16:14   ` Tim Deegan
  2014-04-30  1:06 ` [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages Mukesh Rathor
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30  1:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

In this patch, we add some checks and restrictions in the relevant
p2m paths for p2m_is_foreign.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/mm/p2m-ept.c |  1 +
 xen/arch/x86/mm/p2m.c     | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 8d3051b..c0bfc50 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -776,6 +776,7 @@ static void ept_change_entry_type_global(struct p2m_domain *p2m,
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
     BUG_ON(p2m_is_mmio(ot) || p2m_is_mmio(nt));
+    BUG_ON(p2m_is_foreign(ot) || p2m_is_foreign(nt));
 
     ept_change_entry_type_page(_mfn(ept_get_asr(ept)),
                                ept_get_wl(ept), ot, nt);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 365e37a..6370306 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -559,6 +559,10 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
         return 0;
     }
 
+    /* foreign pages are added thru p2m_add_foreign */
+    if ( p2m_is_foreign(t) )
+        return -EINVAL;
+
     p2m_lock(p2m);
 
     P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
@@ -593,9 +597,9 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
             omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
             ASSERT(!p2m_is_shared(ot));
         }
-        if ( p2m_is_grant(ot) )
+        if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
         {
-            /* Really shouldn't be unmapping grant maps this way */
+            /* Really shouldn't be unmapping grant/foreign maps this way */
             domain_crash(d);
             p2m_unlock(p2m);
             
@@ -697,6 +701,7 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(p2m_is_foreign(ot) || p2m_is_foreign(nt));
 
     gfn_lock(p2m, gfn, 0);
 
@@ -721,6 +726,7 @@ void p2m_change_type_range(struct domain *d,
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(p2m_is_foreign(ot) || p2m_is_foreign(nt));
 
     p2m_lock(p2m);
     p2m->defer_nested_flush = 1;
@@ -771,7 +777,7 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 
     gfn_lock(p2m, gfn, 0);
     omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
-    if ( p2m_is_grant(ot) )
+    if ( p2m_is_grant(ot) || p2m_is_foreign(ot) )
     {
         p2m_unlock(p2m);
         domain_crash(d);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
  2014-04-30  1:06 ` [V10 PATCH 1/4] pvh dom0: construct_dom0 changes Mukesh Rathor
  2014-04-30  1:06 ` [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign Mukesh Rathor
@ 2014-04-30  1:06 ` Mukesh Rathor
  2014-05-01 16:19   ` Tim Deegan
  2014-04-30  1:06 ` [V10 PATCH 4/4] dom0: add opt_dom0pvh to setup.c Mukesh Rathor
  2014-04-30 14:11 ` [V10 PATCH 0/4] pvh dom0 patches Roger Pau Monné
  4 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30  1:06 UTC (permalink / raw)
  To: xen-devel
  Cc: JBeulich, George.Dunlap, tim, eddie.dong, keir.xen, jun.nakajima

In this patch, a new function, p2m_add_foreign(), is added
to map pages from a foreign guest into dom0 for various purposes
like domU creation, running xentrace, etc... Such pages are
typed p2m_map_foreign.  Note, it is the nature of such pages
that a refcnt is held during their stay in the p2m. The
refcnt is added and released in the low level ept function
atomic_write_ept_entry. That macro is converted to a function to allow
for such refcounting, which only applies to leaf entries in the ept.

Also, get_pg_owner is changed to allow pvh to map foreign mappings,
and is made public to be used by p2m_add_foreign().

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/mm.c         |   9 ++--
 xen/arch/x86/mm/p2m-ept.c |  67 +++++++++++++++++++++------
 xen/arch/x86/mm/p2m-pt.c  |   7 +++
 xen/arch/x86/mm/p2m.c     | 113 +++++++++++++++++++++++++++++++++++++++++++---
 xen/include/asm-x86/mm.h  |   2 +
 xen/include/asm-x86/p2m.h |   7 +++
 6 files changed, 181 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index b9a54a5..9527d8c 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2794,7 +2794,7 @@ int new_guest_cr3(unsigned long mfn)
     return rc;
 }
 
-static struct domain *get_pg_owner(domid_t domid)
+struct domain *get_pg_owner(domid_t domid)
 {
     struct domain *pg_owner = NULL, *curr = current->domain;
 
@@ -2810,7 +2810,7 @@ static struct domain *get_pg_owner(domid_t domid)
         goto out;
     }
 
-    if ( unlikely(paging_mode_translate(curr)) )
+    if ( !is_pvh_domain(curr) && unlikely(paging_mode_translate(curr)) )
     {
         MEM_LOG("Cannot mix foreign mappings with translated domains");
         goto out;
@@ -2837,7 +2837,7 @@ static struct domain *get_pg_owner(domid_t domid)
     return pg_owner;
 }
 
-static void put_pg_owner(struct domain *pg_owner)
+void put_pg_owner(struct domain *pg_owner)
 {
     rcu_unlock_domain(pg_owner);
 }
@@ -4583,6 +4583,9 @@ int xenmem_add_to_physmap_one(
             page = mfn_to_page(mfn);
             break;
         }
+        case XENMAPSPACE_gmfn_foreign:
+            rc = p2m_add_foreign(d, idx, gpfn, foreign_domid);
+            return rc;
         default:
             break;
     }
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index c0bfc50..11474e8 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -36,8 +36,6 @@
 
 #define atomic_read_ept_entry(__pepte)                              \
     ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
-#define atomic_write_ept_entry(__pepte, __epte)                     \
-    write_atomic(&(__pepte)->epte, (__epte).epte)
 
 #define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
 #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
@@ -46,6 +44,46 @@ static inline bool_t is_epte_valid(ept_entry_t *e)
     return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
 }
 
+/* returns : 0 for success, -errno otherwise */
+static int atomic_write_ept_entry(ept_entry_t *entryptr, ept_entry_t new,
+                                  int level)
+{
+    bool_t same_mfn = (new.mfn == entryptr->mfn);
+    unsigned long oldmfn = INVALID_MFN;
+
+    if ( level )
+    {
+        write_atomic(&entryptr->epte, new.epte);
+        return 0;
+    }
+
+    if ( unlikely(p2m_is_foreign(new.sa_p2mt)) && !same_mfn )
+    {
+        struct domain *fdom;
+
+        if ( !mfn_valid(new.mfn) )
+            return -EINVAL;
+
+        fdom = page_get_owner(mfn_to_page(new.mfn));
+        if ( fdom == NULL )
+            return -ESRCH;
+
+        /* get refcount on the page */
+        if ( !get_page(mfn_to_page(new.mfn), fdom) )
+            return -EBUSY;
+    }
+
+    if ( unlikely(p2m_is_foreign(entryptr->sa_p2mt)) && !same_mfn )
+        oldmfn = entryptr->mfn;
+
+    write_atomic(&entryptr->epte, new.epte);
+
+    if ( unlikely(oldmfn != INVALID_MFN) )
+        put_page(mfn_to_page(oldmfn));
+
+    return 0;
+}
+
 static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
 {
     /* First apply type permissions */
@@ -271,7 +309,7 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
     return GUEST_TABLE_NORMAL_PAGE;
 }
 
-static bool_t ept_invalidate_emt(mfn_t mfn)
+static bool_t ept_invalidate_emt(mfn_t mfn, int level)
 {
     ept_entry_t *epte = map_domain_page(mfn_x(mfn));
     unsigned int i;
@@ -286,7 +324,7 @@ static bool_t ept_invalidate_emt(mfn_t mfn)
             continue;
 
         e.emt = MTRR_NUM_TYPES;
-        atomic_write_ept_entry(&epte[i], e);
+        atomic_write_ept_entry(&epte[i], e, level);
         changed = 1;
     }
 
@@ -341,7 +379,7 @@ bool_t ept_handle_misconfig(uint64_t gpa)
                                                _mfn(e.mfn), 0, &ipat,
                                                e.sa_p2mt == p2m_mmio_direct);
                     e.ipat = ipat;
-                    atomic_write_ept_entry(&epte[i], e);
+                    atomic_write_ept_entry(&epte[i], e, level);
                 }
             }
             else
@@ -363,7 +401,7 @@ bool_t ept_handle_misconfig(uint64_t gpa)
                 }
                 e.emt = emt;
                 e.ipat = ipat;
-                atomic_write_ept_entry(&epte[i], e);
+                atomic_write_ept_entry(&epte[i], e, level);
             }
 
             okay = 1;
@@ -373,10 +411,10 @@ bool_t ept_handle_misconfig(uint64_t gpa)
         if ( e.emt == MTRR_NUM_TYPES )
         {
             ASSERT(is_epte_present(&e));
-            ept_invalidate_emt(_mfn(e.mfn));
+            ept_invalidate_emt(_mfn(e.mfn), level);
             smp_wmb();
             e.emt = 0;
-            atomic_write_ept_entry(&epte[i], e);
+            atomic_write_ept_entry(&epte[i], e, level);
             unmap_domain_page(epte);
             okay = 1;
         }
@@ -443,6 +481,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
     ASSERT((target == 2 && hvm_hap_has_1gb()) ||
            (target == 1 && hvm_hap_has_2mb()) ||
            (target == 0));
+    ASSERT(!p2m_is_foreign(p2mt) || target == 0);
 
     table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
@@ -506,7 +545,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 
         /* now install the newly split ept sub-tree */
         /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
-        atomic_write_ept_entry(ept_entry, split_ept_entry);
+        atomic_write_ept_entry(ept_entry, split_ept_entry, i);
 
         /* then move to the level we want to make real changes */
         for ( ; i > target; i-- )
@@ -545,10 +584,10 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
     }
 
-    atomic_write_ept_entry(ept_entry, new_entry);
+    rc = atomic_write_ept_entry(ept_entry, new_entry, i);
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( p2mt != p2m_invalid &&
+    if ( rc == 0 && p2mt != p2m_invalid &&
          (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
         p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
 
@@ -580,7 +619,7 @@ out:
        last thing we do, after the ept_sync_domain() and removal
        from the iommu tables, so as to avoid a potential
        use-after-free. */
-    if ( is_epte_present(&old_entry) )
+    if ( rc == 0 && is_epte_present(&old_entry) )
         ept_free_entry(p2m, &old_entry, target);
 
     return rc;
@@ -760,7 +799,7 @@ static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
 
             e.sa_p2mt = nt;
             ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
+            atomic_write_ept_entry(&epte[i], e, ept_page_level);
         }
     }
 
@@ -791,7 +830,7 @@ static void ept_memory_type_changed(struct p2m_domain *p2m)
     if ( !mfn )
         return;
 
-    if ( ept_invalidate_emt(_mfn(mfn)) )
+    if ( ept_invalidate_emt(_mfn(mfn), ept_get_wl(&p2m->ept)) )
         ept_sync_domain(p2m);
 }
 
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index b53db70..03d8d46 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -313,6 +313,13 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
     }
 
+    if ( p2m_is_foreign(p2mt) )
+    {
+        /* pvh fixme: foreign types are only supported on ept at present */
+        gdprintk(XENLOG_WARNING, "Unimplemented foreign p2m type.\n");
+        return -EINVAL;
+    }
+
     rc = p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table);
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 6370306..f52c7a9 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -36,6 +36,7 @@
 #include <xen/event.h>
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
+#include <xsm/xsm.h>
 
 #include "mm-locks.h"
 
@@ -287,14 +288,20 @@ struct page_info *get_page_from_gfn_p2m(
         /* Fast path: look up and get out */
         p2m_read_lock(p2m);
         mfn = __get_gfn_type_access(p2m, gfn, t, a, 0, NULL, 0);
-        if ( (p2m_is_ram(*t) || p2m_is_grant(*t))
-             && mfn_valid(mfn)
+        if ( p2m_is_any_ram(*t) && mfn_valid(mfn)
              && !((q & P2M_UNSHARE) && p2m_is_shared(*t)) )
         {
             page = mfn_to_page(mfn);
-            if ( !get_page(page, d)
-                 /* Page could be shared */
-                 && !get_page(page, dom_cow) )
+            if ( p2m_is_foreign(*t) )
+            {
+                struct domain *fdom = page_get_owner_and_reference(page);
+                ASSERT(fdom != d);
+                if ( fdom == NULL )
+                    page = NULL;
+            }
+            else if ( !get_page(page, d)
+                      /* Page could be shared */
+                      && !get_page(page, dom_cow) )
                 page = NULL;
         }
         p2m_read_unlock(p2m);
@@ -801,8 +808,8 @@ static int set_typed_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
 }
 
 /* Set foreign mfn in the given guest's p2m table. */
-static int __attribute__((unused))
-set_foreign_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+static int set_foreign_p2m_entry(struct domain *d, unsigned long gfn,
+                                 mfn_t mfn)
 {
     return set_typed_p2m_entry(d, gfn, mfn, p2m_map_foreign);
 }
@@ -1760,6 +1767,98 @@ out_p2m_audit:
 #endif /* P2M_AUDIT */
 
 /*
+ * Add frame from foreign domain to target domain's physmap. Similar to
+ * XENMAPSPACE_gmfn but the frame is foreign being mapped into current,
+ * and is not removed from foreign domain.
+ *
+ * Usage: - libxl on pvh dom0 creating a guest and doing privcmd_ioctl_mmap.
+ *        - xentrace running on dom0 mapping xenheap pages. foreigndom would
+ *          be DOMID_XEN in such a case.
+ *        etc..
+ *
+ * Side Effect: the mfn for fgfn will be refcounted in lower level routines
+ *              so it is not lost while mapped here. The refcnt is released
+ *              via the XENMEM_remove_from_physmap path.
+ *
+ * Returns: 0 ==> success
+ */
+int p2m_add_foreign(struct domain *tdom, unsigned long fgfn,
+                    unsigned long gpfn, domid_t foreigndom)
+{
+    p2m_type_t p2mt, p2mt_prev;
+    unsigned long prev_mfn, mfn;
+    struct page_info *page;
+    int rc = -EINVAL;
+    struct domain *fdom = NULL;
+
+    ASSERT(tdom);
+    if ( foreigndom == DOMID_SELF )
+        goto out;
+
+    rc = -ESRCH;
+    fdom = get_pg_owner(foreigndom);
+    if ( fdom == NULL )
+        goto out;
+
+    rc = -EINVAL;
+    if ( tdom == fdom || !is_pvh_domain(tdom) )
+        goto out;
+
+    rc = xsm_map_gmfn_foreign(XSM_TARGET, tdom, fdom);
+    if ( rc )
+        goto out;
+
+    /*
+     * Take a refcnt on the mfn. NB: following supported for foreign mapping:
+     *     ram_rw | ram_logdirty | ram_ro | paging_out.
+     */
+    page = get_page_from_gfn(fdom, fgfn, &p2mt, P2M_ALLOC);
+    if ( !page ||
+         !p2m_is_ram(p2mt) || p2m_is_shared(p2mt) || p2m_is_hole(p2mt) )
+    {
+        if ( page )
+            put_page(page);
+        goto out;
+    }
+    mfn = mfn_x(page_to_mfn(page));
+
+    /* Remove previously mapped page if it is present. */
+    prev_mfn = mfn_x(get_gfn(tdom, gpfn, &p2mt_prev));
+    if ( mfn_valid(_mfn(prev_mfn)) )
+    {
+        if ( is_xen_heap_mfn(prev_mfn) )
+            /* Xen heap frames are simply unhooked from this phys slot */
+            guest_physmap_remove_page(tdom, gpfn, prev_mfn, 0);
+        else
+            /* Normal domain memory is freed, to avoid leaking memory. */
+            guest_remove_page(tdom, gpfn);
+    }
+    /*
+     * Create the new mapping. Can't use guest_physmap_add_page() because it
+     * will update the m2p table which will result in  mfn -> gpfn of dom0
+     * and not fgfn of domU.
+     */
+    rc = set_foreign_p2m_entry(tdom, gpfn, _mfn(mfn));
+    if ( rc )
+        gdprintk(XENLOG_WARNING, "set_foreign_p2m_entry failed. "
+                 "gpfn:%lx mfn:%lx fgfn:%lx td:%d fd:%d\n",
+                 gpfn, mfn, fgfn, tdom->domain_id, fdom->domain_id);
+
+    put_page(page);
+
+    /*
+     * This put_gfn for the above get_gfn for prev_mfn.  We must do this
+     * after set_foreign_p2m_entry so another cpu doesn't populate the gpfn
+     * before us.
+     */
+    put_gfn(tdom, gpfn);
+
+out:
+    if ( fdom )
+        put_pg_owner(fdom);
+    return rc;
+}
+/*
  * Local variables:
  * mode: C
  * c-file-style: "BSD"
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index c835f76..afa6c48 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -358,6 +358,8 @@ int  put_old_guest_table(struct vcpu *);
 int  get_page_from_l1e(
     l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner);
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner);
+struct domain *get_pg_owner(domid_t domid);
+void put_pg_owner(struct domain *pg_owner);
 
 static inline void put_page_and_type(struct page_info *page)
 {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 0308b89..1682190 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -183,6 +183,10 @@ typedef unsigned int p2m_query_t;
 #define p2m_is_broken(_t)   (p2m_to_mask(_t) & P2M_BROKEN_TYPES)
 #define p2m_is_foreign(_t)  (p2m_to_mask(_t) & p2m_to_mask(p2m_map_foreign))
 
+#define p2m_is_any_ram(_t)  (p2m_to_mask(_t) &                   \
+                             (P2M_RAM_TYPES | P2M_GRANT_TYPES |  \
+                              p2m_to_mask(p2m_map_foreign)))
+
 /* Per-p2m-table state */
 struct p2m_domain {
     /* Lock that protects updates to the p2m */
@@ -516,6 +520,9 @@ void p2m_memory_type_changed(struct domain *d);
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);
 
+/* Add foreign mapping to the guest's p2m table. */
+int p2m_add_foreign(struct domain *tdom, unsigned long fgfn,
+                    unsigned long gpfn, domid_t foreign_domid);
 
 /* 
  * Populate-on-demand
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* [V10 PATCH 4/4] dom0: add opt_dom0pvh to setup.c
  2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
                   ` (2 preceding siblings ...)
  2014-04-30  1:06 ` [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages Mukesh Rathor
@ 2014-04-30  1:06 ` Mukesh Rathor
  2014-04-30 14:11 ` [V10 PATCH 0/4] pvh dom0 patches Roger Pau Monné
  4 siblings, 0 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30  1:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

Finally last patch in the series to enable creation of pvh dom0.
A pvh dom0 is created by adding dom0pvh to grub xen command line.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 docs/misc/pvh-readme.txt            |  2 ++
 docs/misc/xen-command-line.markdown |  7 +++++++
 xen/arch/x86/setup.c                | 11 +++++++++--
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/docs/misc/pvh-readme.txt b/docs/misc/pvh-readme.txt
index 9fea137..c5b3de4 100644
--- a/docs/misc/pvh-readme.txt
+++ b/docs/misc/pvh-readme.txt
@@ -37,6 +37,8 @@ supported. Phase I patches are broken into three parts:
    - tools changes for creating a PVH guest
    - boot of 64bit dom0 in PVH mode.
 
+To boot 64bit dom0 in PVH mode, add dom0pvh to grub xen command line.
+
 Following fixme's exist in the code:
   - arch/x86/time.c: support more tsc modes.
 
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index e8d23b4..8b8c459 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -494,6 +494,13 @@ Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory
 
 Pin dom0 vcpus to their respective pcpus
 
+### dom0pvh
+> `= <boolean>`
+
+> Default: `false`
+
+Flag that makes a 64bit dom0 boot in PVH mode. No 32bit support at present.
+
 ### e820-mtrr-clip
 > `= <boolean>`
 
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 2e30701..373e236 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -61,6 +61,10 @@ integer_param("maxcpus", max_cpus);
 static bool_t __initdata disable_smep;
 invbool_param("smep", disable_smep);
 
+/* Boot dom0 in pvh mode */
+static bool_t __initdata opt_dom0pvh;
+boolean_param("dom0pvh", opt_dom0pvh);
+
 /* **** Linux config option: propagated to domain0. */
 /* "acpi=off":    Sisables both ACPI table parsing and interpreter. */
 /* "acpi=force":  Override the disable blacklist.                   */
@@ -541,7 +545,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
@@ -1338,8 +1342,11 @@ void __init noreturn __start_xen(unsigned long mbi_p)
     if ( !tboot_protect_mem_regions() )
         panic("Could not protect TXT memory regions");
 
+    if ( opt_dom0pvh )
+        domcr_flags |= DOMCRF_pvh | DOMCRF_hap;
+
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+    dom0 = domain_create(0, domcr_flags, 0);
     if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) )
         panic("Error creating domain 0");
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
                   ` (3 preceding siblings ...)
  2014-04-30  1:06 ` [V10 PATCH 4/4] dom0: add opt_dom0pvh to setup.c Mukesh Rathor
@ 2014-04-30 14:11 ` Roger Pau Monné
  2014-04-30 18:12   ` Mukesh Rathor
  4 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-04-30 14:11 UTC (permalink / raw)
  To: Mukesh Rathor, xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

On 30/04/14 03:06, Mukesh Rathor wrote:
> Hi,
> 
> Inching closer to the finish line, please find v10 of the pvh dom0
> patches based on c/s: 8cfc8e5.
> 
> git tree: git://oss.oracle.com/git/mrathor/xen.git  branch: dom0pvh-v10
> 
> Changed in v10:
>   patch 1: trivial merge
> 
>   patch 2: redo it.
> 
>   patch 3: - redo atomic_write_ept_entry adding level parameter
>            - drop p2m_teardown foreign pages cleanup
>            - couple style related changes
>            - valid p2mt for foreign types in p2m_add_foreign are changed
> 
>   patch 4: - add change to xen-command-line.markdown
>            - domcr_flags redone.
> 
> thanks,
> Mukesh

Hello Mukesh,

Thanks for the new version, unfortunately when trying to boot FreeBSD 
Dom0 with this version I get the following hypervisor crash (it works
fine with previous versions):

PXELINUX 4.02 debian-20101014  Copyright (C) 1994-2010 H. Peter Anvin et al
boot:
Loading xen/xen_pvh2.gz... ok
Loading freebsd/freebsd6.pvh... ok
 Xen 4.5-unstable
(XEN) Xen version 4.5-unstable (root@) (gcc (FreeBSD Ports Collection) 4.8.3 20140220 (prerelease)) debug=y Wed Apr 30 15:59:03 CEST 2014
(XEN) Latest ChangeSet: Wed Apr 23 18:49:06 2014 -0700 git:ae88c7c
(XEN) Console output is synchronous.
(XEN) Bootloader: PXELINUX 4.02 debian-20101014
(XEN) Command line: ioapic_ack=old dom0_max_vcpus=2 sync_console=true dom0pvh=1 dom0_mem=1024M com1=115200,8n1 guest_loglvl=all loglvl=all console=com1
(XEN) Video information:
(XEN)  VGA is text mode 80x25, font 8x16
(XEN)  VBE/DDC methods: V2; EDID transfer time: 1 seconds
(XEN)  EDID info not retrieved because of reasons unknown
(XEN) Disc information:
(XEN)  Found 2 MBR signatures
(XEN)  Found 2 EDD information structures
(XEN) Xen-e820 RAM map:
(XEN)  0000000000000000 - 0000000000092400 (usable)
(XEN)  00000000000f0000 - 0000000000100000 (reserved)
(XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
(XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
(XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
(XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
(XEN)  00000000f8000000 - 00000000fd000000 (reserved)
(XEN)  00000000fe000000 - 00000000fed00400 (reserved)
(XEN)  00000000fee00000 - 00000000fef00000 (reserved)
(XEN)  00000000ffb00000 - 0000000100000000 (reserved)
(XEN)  0000000100000000 - 00000001a0000000 (usable)
(XEN) ACPI: RSDP 000FEC30, 0024 (r2 DELL  )
(XEN) ACPI: XSDT 000FCCC7, 007C (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: FACP 000FCDB7, 00F4 (r3 DELL    B10K          15 ASL        61)
(XEN) ACPI: DSDT FFE9E951, 4A74 (r1   DELL    dt_ex     1000 INTL 20050624)
(XEN) ACPI: FACS DFDF9C00, 0040
(XEN) ACPI: SSDT FFEA34D6, 009C (r1   DELL    st_ex     1000 INTL 20050624)
(XEN) ACPI: APIC 000FCEAB, 015E (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: BOOT 000FD009, 0028 (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: ASF! 000FD031, 0096 (r32 DELL    B10K          15 ASL        61)
(XEN) ACPI: MCFG 000FD0C7, 003C (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: HPET 000FD103, 0038 (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: TCPA 000FD35F, 0032 (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: DMAR 000FD391, 00C8 (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: SLIC 000FD13B, 0176 (r1 DELL    B10K          15 ASL        61)
(XEN) ACPI: SSDT DFE4DC00, 13C8 (r1  INTEL PPM RCM  80000001 INTL 20061109)
(XEN) System RAM: 6141MB (6288940kB)
(XEN) No NUMA configuration found
(XEN) Faking a node at 0000000000000000-00000001a0000000
(XEN) Domain heap initialised
(XEN) DMI 2.5 present.
(XEN) Using APIC driver default
(XEN) ACPI: PM-Timer IO Port: 0x808
(XEN) ACPI: SLEEP INFO: pm1x_cnt[1:804,1:0], pm1x_evt[1:800,1:0]
(XEN) ACPI:             wakeup_vec[dfdf9c0c], vec_size[20]
(XEN) ACPI: Local APIC address 0xfee00000
(XEN) ACPI: LAPIC (acpi_id[0x01] lapic_id[0x00] enabled)
(XEN) Processor #0 7:10 APIC version 21
(XEN) ACPI: LAPIC (acpi_id[0x02] lapic_id[0x02] enabled)
(XEN) Processor #2 7:10 APIC version 21
(XEN) ACPI: LAPIC (acpi_id[0x03] lapic_id[0x04] enabled)
(XEN) Processor #4 7:10 APIC version 21
(XEN) ACPI: LAPIC (acpi_id[0x04] lapic_id[0x06] enabled)
(XEN) Processor #6 7:10 APIC version 21
(XEN) ACPI: LAPIC (acpi_id[0x05] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x06] lapic_id[0x01] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x07] lapic_id[0x02] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x08] lapic_id[0x03] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x09] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0a] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0b] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0c] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0d] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0e] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x0f] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x10] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x11] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x12] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x13] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x14] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x15] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x16] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x17] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x18] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x19] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1a] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1b] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1c] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1d] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1e] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x1f] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC (acpi_id[0x20] lapic_id[0x00] disabled)
(XEN) ACPI: LAPIC_NMI (acpi_id[0xff] high level lint[0x1])
(XEN) ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
(XEN) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23
(XEN) ACPI: IOAPIC (id[0x09] address[0xfec80000] gsi_base[24])
(XEN) IOAPIC[1]: apic_id 9, version 32, address 0xfec80000, GSI 24-47
(XEN) ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
(XEN) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
(XEN) ACPI: IRQ0 used by override.
(XEN) ACPI: IRQ2 used by override.
(XEN) ACPI: IRQ9 used by override.
(XEN) Enabling APIC mode:  Flat.  Using 2 I/O APICs
(XEN) ACPI: HPET id: 0x8086a301 base: 0xfed00000
(XEN) ERST table was not found
(XEN) Using ACPI (MADT) for SMP configuration information
(XEN) SMP: Allowing 32 CPUs (28 hotplug CPUs)
(XEN) IRQ limits: 48 GSI, 736 MSI/MSI-X
(XEN) Using scheduler: SMP Credit Scheduler (credit)
(XEN) Detected 3066.852 MHz processor.
(XEN) Initing memory sharing.
(XEN) mce_intel.c:717: MCA Capability: BCAST 1 SER 0 CMCI 1 firstbank 0 extended MCE MSR 0
(XEN) Intel machine check reporting enabled
(XEN) PCI: MCFG configuration 0: base f8000000 segment 0000 buses 00 - 3f
(XEN) PCI: MCFG area at f8000000 reserved in E820
(XEN) PCI: Using MCFG for segment 0000 bus 00-3f
(XEN) Intel VT-d iommu 0 supported page sizes: 4kB.
(XEN) Intel VT-d Snoop Control enabled.
(XEN) Intel VT-d Dom0 DMA Passthrough not enabled.
(XEN) Intel VT-d Queued Invalidation enabled.
(XEN) Intel VT-d Interrupt Remapping enabled.
(XEN) Intel VT-d Shared EPT tables not enabled.
(XEN) I/O virtualisation enabled
(XEN)  - Dom0 mode: Relaxed
(XEN) Interrupt remapping enabled
(XEN) ENABLING IO-APIC IRQs
(XEN)  -> Using old ACK method
(XEN) ..TIMER: vector=0xF0 apic1=0 pin1=2 apic2=-1 pin2=-1
(XEN) Platform timer is 14.318MHz HPET
(XEN) Allocated console ring of 32 KiB.
(XEN) mwait-idle: MWAIT substates: 0x1120
(XEN) mwait-idle: v0.4 model 0x1a
(XEN) mwait-idle: lapic_timer_reliable_states 0x2
(XEN) HPET: 0 timers usable for broadcast (4 total)
(XEN) VMX: Supported advanced features:
(XEN)  - APIC MMIO access virtualisation
(XEN)  - APIC TPR shadow
(XEN)  - Extended Page Tables (EPT)
(XEN)  - Virtual-Processor Identifiers (VPID)
(XEN)  - Virtual NMI
(XEN)  - MSR direct-access bitmap
(XEN) HVM: ASIDs enabled.
(XEN) HVM: VMX enabled
(XEN) HVM: Hardware Assisted Paging (HAP) detected
(XEN) HVM: HAP page sizes: 4kB, 2MB
(XEN) Brought up 4 CPUs
(XEN) ACPI sleep modes: S3
(XEN) mcheck_poll: Machine check polling timer started.
(XEN) *** LOADING DOMAIN 0 ***
(XEN) elf_parse_binary: phdr: paddr=0xffffffff80200000 memsz=0xf56738
(XEN) elf_parse_binary: phdr: paddr=0xffffffff81356738 memsz=0x58a598
(XEN) elf_parse_binary: memory: 0xffffffff80200000 -> 0xffffffff818e0cd0
(XEN) elf_xen_parse_note: GUEST_OS = "FreeBSD"
(XEN) elf_xen_parse_note: GUEST_VERSION = "0x10c8f0"
(XEN) elf_xen_parse_note: XEN_VERSION = "xen-3.0"
(XEN) elf_xen_parse_note: VIRT_BASE = 0xffffffff80000000
(XEN) elf_xen_parse_note: PADDR_OFFSET = 0xffffffff80000000
(XEN) elf_xen_parse_note: ENTRY = 0xffffffff80cae000
(XEN) elf_xen_parse_note: HYPERCALL_PAGE = 0xffffffff80cad000
(XEN) elf_xen_parse_note: HV_START_LOW = 0xffff800000000000
(XEN) elf_xen_parse_note: FEATURES = "writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel|hvm_callback_vector"
(XEN) elf_xen_parse_note: PAE_MODE = "yes"
(XEN) elf_xen_parse_note: unknown xen elf note (0xd)
(XEN) elf_xen_parse_note: LOADER = "generic"
(XEN) elf_xen_parse_note: SUSPEND_CANCEL = 0x0
(XEN) elf_xen_parse_note: BSD_SYMTAB = "yes"
(XEN) elf_xen_parse: using notes from SHT_NOTE section
(XEN) elf_xen_addr_calc_check: addresses:
(XEN)     virt_base        = 0xffffffff80000000
(XEN)     elf_paddr_offset = 0xffffffff80000000
(XEN)     virt_offset      = 0x0
(XEN)     virt_kstart      = 0xffffffff80200000
(XEN)     virt_kend        = 0xffffffff81bda5b8
(XEN)     virt_entry       = 0xffffffff80cae000
(XEN)     p2m_base         = 0xffffffffffffffff
(XEN)  Xen  kernel: 64-bit, lsb, compat32
(XEN)  Dom0 kernel: 64-bit, PAE, lsb, paddr 0xffffffff80200000 -> 0xffffffff818e0cd0
(XEN)  Dom0 symbol map 0xffffffff818e0cd0 -> 0xffffffff81bda5b8
(XEN) PHYSICAL MEMORY ARRANGEMENT:
(XEN)  Dom0 alloc.:   0000000198000000->000000019a000000 (253952 pages to be allocated)
(XEN) VIRTUAL MEMORY ARRANGEMENT:
(XEN)  Loaded kernel: ffffffff80200000->ffffffff81bda5b8
(XEN)  Init. ramdisk: ffffffff81bdb000->ffffffff81bdb000
(XEN)  Phys-Mach map: ffffffff81bdb000->ffffffff81ddb000
(XEN)  Start info:    ffffffff81ddb000->ffffffff81ddc4b4
(XEN)  Page tables:   ffffffff81ddd000->ffffffff81df0000
(XEN)  Boot stack:    ffffffff81df0000->ffffffff81df1000
(XEN)  TOTAL:         ffffffff80000000->ffffffff82000000
(XEN)  ENTRY ADDRESS: ffffffff80cae000
(XEN) Dom0 has maximum 2 VCPUs
(XEN) elf_load_binary: phdr 2 at 0xffffffff80200000 -> 0xffffffff81156738
(XEN) elf_load_binary: phdr 3 at 0xffffffff81356738 -> 0xffffffff81474c30
(XEN) elf_load_bsdsyms: shdr 4 at 0xffff83019eca7ab0 -> 0xffffffff818e1818
(XEN) elf_load_bsdsyms: shdr 41 at 0xffff83019fd58c51 -> 0xffffffff819340a8
(XEN) elf_load_bsdsyms: shdr 42 at 0xffff83019fd59990 -> 0xffffffff819342e8
(XEN) elf_load_bsdsyms: shdr 43 at 0xffff83019fe9f7d8 -> 0xffffffff81a7a130
(XEN) Scrubbing Free RAM: ..................................................done.
(XEN) Initial low memory virq threshold set at 0x4000 pages.
(XEN) Std. Loglevel: All
(XEN) Guest Loglevel: All
(XEN) **********************************************
(XEN) ******* WARNING: CONSOLE OUTPUT IS SYNCHRONOUS
(XEN) ******* This option is intended to aid debugging of Xen by ensuring
(XEN) ******* that all output is synchronously delivered on the serial line.
(XEN) ******* However it can introduce SIGNIFICANT latencies and affect
(XEN) ******* timekeeping. It is NOT recommended for production use!
(XEN) **********************************************
(XEN) 3... 2... 1...
(XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch input to Xen)
(XEN) Freed 276kB init memory.
FreeBSD PVH running on xen-3.0-x86_64p
GDB: no debug ports present
KDB: debugger backends: ddb
KDB: current backend: ddb
SMAP type=01 base=0000000000000000 len=0000000000092400
SMAP type=02 base=00000000000f0000 len=0000000000010000
SMAP type=01 base=0000000000100000 len=000000003ff6e000
SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
SMAP type=02 base=00000000f8000000 len=0000000005000000
SMAP type=02 base=00000000fe000000 len=0000000000d00400
SMAP type=02 base=00000000fee00000 len=0000000000100000
SMAP type=02 base=00000000ffb00000 len=0000000000500000
SMAP type=02 base=0000000100000000 len=00000000a0000000
(XEN) ----[ Xen-4.5-unstable  x86_64  debug=y  Tainted:    C ]----
(XEN) CPU:    0
(XEN) RIP:    e008:[<ffff82d0801c282f>] vioapic_range+0xf/0x2e
(XEN) RFLAGS: 0000000000010297   CONTEXT: hypervisor
(XEN) rax: 0000000000000000   rbx: ffff82d0802d7560   rcx: 0000000040000000
(XEN) rdx: 0000000000000000   rsi: 0000000040000000   rdi: ffff8300dfb1a000
(XEN) rbp: ffff82d0802d74a8   rsp: ffff82d0802d74a8   r8:  0000000000000001
(XEN) r9:  0000000000000000   r10: 0000000080000011   r11: ffffffff81475871
(XEN) r12: ffff8300dfb1a000   r13: ffff82d080279e30   r14: ffff82d08027b1a0
(XEN) r15: ffff82d080279e48   cr0: 0000000080050033   cr4: 00000000000026f0
(XEN) cr3: 000000019ec87000   cr2: 0000000000000000
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008
(XEN) Xen stack trace from rsp=ffff82d0802d74a8:
(XEN)    ffff82d0802d7518 ffff82d0801bc190 ffff82d000000000 ffff82d0801e6cb4
(XEN)    ffff83019a581000 0000000200000000 ffff82d0802d7c04 00000007dfb1a000
(XEN)    ffff82d0802d7538 0000000000000000 ffff82e003300000 0000000000000020
(XEN)    ffff8300dfb1a000 ffff82d0802d7b00 ffff82d0802d75b8 ffff82d0801af408
(XEN)    ffff8201802d0000 ffff82d0802d7608 0000000100010004 0000000040000000
(XEN)    ffff82d0802d7560 0000000400000001 0000000000000000 0000000040000000
(XEN)    0000000000000000 0000000400000001 0120000000000000 0000000000000000
(XEN)    0000000000000004 0000000000000004 0000000000000004 0000000000000000
(XEN)    ffff82d0802d7608 ffff82d0802d7bb0 ffff82d0802d7648 ffff82d0801b0a40
(XEN)    ffff82d000000000 ffff82d0802d79a0 ffff82d0802d7658 ffff82d0802d7600
(XEN)    ffff830000000001 ffff82d0802d79a0 0000000000000000 0000000040000000
(XEN)    0000000000000001 ffffffff82000000 0000000000000000 ffff82d0802d7bb0
(XEN)    0000000000000000 0000000000000001 0000000000000008 ffff82d08027b0c0
(XEN)    ffff82d0802d7658 ffff82d0801b0b5b ffff82d0802d7668 ffff82d08018e1e3
(XEN)    ffff82d0802d7ac8 ffff82d0801900b3 ffff82d0802d7698 ffff82d08012975c
(XEN)    0000000000000007 ffff82d0802d7728 0000000139011830 ffffffff82000000
(XEN)    ffff82d0802d778b 0000000000000009 000000000019a500 0000000800000000
(XEN)    ffff82d0802d0000 ffff82d0801e9084 00ff8301000000dc 0000000000000004
(XEN)    0000000000000004 0000000000000000 ffff82d0802d7778 ffff82d0801e9921
(XEN)    0000000000000000 ffff82d0802d7804 ffff82d0802d78d4 0000000000000000
(XEN) Xen call trace:
(XEN)    [<ffff82d0801c282f>] vioapic_range+0xf/0x2e
(XEN)    [<ffff82d0801bc190>] hvm_mmio_intercept+0x40/0x320
(XEN)    [<ffff82d0801af408>] hvmemul_do_io+0x493/0x6b5
(XEN)    [<ffff82d0801b0a40>] __hvmemul_read+0x24b/0x2ca
(XEN)    [<ffff82d0801b0b5b>] hvmemul_read+0x12/0x14
(XEN)    [<ffff82d08018e1e3>] read_ulong+0xe/0x10
(XEN)    [<ffff82d0801900b3>] x86_emulate+0x1554/0xf4b1
(XEN)    [<ffff82d0801afc7a>] hvm_emulate_one+0x160/0x28b
(XEN)    [<ffff82d0801bcc12>] handle_mmio+0x43/0x1cf
(XEN)    [<ffff82d0801bb0dc>] hvm_hap_nested_page_fault+0x257/0x489
(XEN)    [<ffff82d0801da7ed>] vmx_vmexit_handler+0x1602/0x19a7
(XEN)    [<ffff82d0801e0cd1>] vmx_asm_vmexit_handler+0x41/0xc0
(XEN)
(XEN) Pagetable walk from 0000000000000000:
(XEN)  L4[0x000] = 0000000000000000 ffffffffffffffff
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 0:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0000]
(XEN) Faulting linear address: 0000000000000000
(XEN) ****************************************
(XEN)
(XEN) Reboot in five seconds...

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-04-30 14:11 ` [V10 PATCH 0/4] pvh dom0 patches Roger Pau Monné
@ 2014-04-30 18:12   ` Mukesh Rathor
  2014-05-01  1:19     ` Mukesh Rathor
  0 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-04-30 18:12 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On Wed, 30 Apr 2014 16:11:39 +0200
Roger Pau Monné <roger.pau@citrix.com> wrote:

> On 30/04/14 03:06, Mukesh Rathor wrote:
.....

> Hello Mukesh,
> 
> Thanks for the new version, unfortunately when trying to boot FreeBSD 
> Dom0 with this version I get the following hypervisor crash (it works
> fine with previous versions):

Aha, Jan, there's the vioapic crash!! Roger, see:

http://www.gossamer-threads.com/lists/xen/devel/325784

I had seen this few weeks ago, but could not reproduce last week 
despite several attempts. You are seeing this in V10 because I dropped
the vioapic patch from V9 (included below).

BTW, since I'm not able to reproduce this, can you kindly check
where the ept violation is coming from? Is that on an io space?
Also, our binaries don't match, so can you please confirm it's the 
call from:

hvm_hap_nested_page_fault():
    if ( (p2mt == p2m_mmio_dm) ||
         (access_w && (p2mt == p2m_ram_ro)) )
    {
        put_gfn(p2m->domain, gfn);
        if ( !handle_mmio() )   <==========
            hvm_inject_hw_exception(TRAP_gp_fault, 0);

In which case, what's the p2mt?

thanks,
Mukesh


diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index d3c681b..0b343bd 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -238,10 +238,11 @@ static int vioapic_write(
 
 static int vioapic_range(struct vcpu *v, unsigned long addr)
 {
-    struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain);
+    struct hvm_vioapic *vioapic = v->domain->arch.hvm_domain.vioapic;
+    struct hvm_hw_vioapic *hwapic = domain_vioapic(v->domain);
 
-    return ((addr >= vioapic->base_address &&
-             (addr < vioapic->base_address + VIOAPIC_MEM_LENGTH)));
+    return (vioapic && (addr >= hwapic->base_address &&
+                        (addr < hwapic->base_address + VIOAPIC_MEM_LENGTH)));
 }
 



> PXELINUX 4.02 debian-20101014  Copyright (C) 1994-2010 H. Peter Anvin
> et al boot:
> Loading xen/xen_pvh2.gz... ok
> Loading freebsd/freebsd6.pvh... ok
>  Xen 4.5-unstable
> (XEN) Xen version 4.5-unstable (root@) (gcc (FreeBSD Ports
> Collection) 4.8.3 20140220 (prerelease)) debug=y Wed Apr 30 15:59:03
> CEST 2014 (XEN) Latest ChangeSet: Wed Apr 23 18:49:06 2014 -0700
> git:ae88c7c (XEN) Console output is synchronous. (XEN) Bootloader:
> PXELINUX 4.02 debian-20101014 (XEN) Command line: ioapic_ack=old
> dom0_max_vcpus=2 sync_console=true dom0pvh=1 dom0_mem=1024M
> com1=115200,8n1 guest_loglvl=all loglvl=all console=com1 (XEN) Video
> information: (XEN)  VGA is text mode 80x25, font 8x16 (XEN)  VBE/DDC
> methods: V2; EDID transfer time: 1 seconds (XEN)  EDID info not
> retrieved because of reasons unknown (XEN) Disc information:
> (XEN)  Found 2 MBR signatures
> (XEN)  Found 2 EDD information structures
> (XEN) Xen-e820 RAM map:
> (XEN)  0000000000000000 - 0000000000092400 (usable)
> (XEN)  00000000000f0000 - 0000000000100000 (reserved)
> (XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
> (XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
> (XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
> (XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
> (XEN)  00000000f8000000 - 00000000fd000000 (reserved)
> (XEN)  00000000fe000000 - 00000000fed00400 (reserved)
> (XEN)  00000000fee00000 - 00000000fef00000 (reserved)
> (XEN)  00000000ffb00000 - 0000000100000000 (reserved)
> (XEN)  0000000100000000 - 00000001a0000000 (usable)
> (XEN) ACPI: RSDP 000FEC30, 0024 (r2 DELL  )
> (XEN) ACPI: XSDT 000FCCC7, 007C (r1 DELL    B10K          15
> ASL        61) (XEN) ACPI: FACP 000FCDB7, 00F4 (r3 DELL
> B10K          15 ASL        61) (XEN) ACPI: DSDT FFE9E951, 4A74 (r1
> DELL    dt_ex     1000 INTL 20050624) (XEN) ACPI: FACS DFDF9C00, 0040
> (XEN) ACPI: SSDT FFEA34D6, 009C (r1   DELL    st_ex     1000 INTL
> 20050624) (XEN) ACPI: APIC 000FCEAB, 015E (r1 DELL    B10K
> 15 ASL        61) (XEN) ACPI: BOOT 000FD009, 0028 (r1 DELL
> B10K          15 ASL        61) (XEN) ACPI: ASF! 000FD031, 0096 (r32
> DELL    B10K          15 ASL        61) (XEN) ACPI: MCFG 000FD0C7,
> 003C (r1 DELL    B10K          15 ASL        61) (XEN) ACPI: HPET
> 000FD103, 0038 (r1 DELL    B10K          15 ASL        61) (XEN)
> ACPI: TCPA 000FD35F, 0032 (r1 DELL    B10K          15 ASL        61)
> (XEN) ACPI: DMAR 000FD391, 00C8 (r1 DELL    B10K          15
> ASL        61) (XEN) ACPI: SLIC 000FD13B, 0176 (r1 DELL
> B10K          15 ASL        61) (XEN) ACPI: SSDT DFE4DC00, 13C8 (r1
> INTEL PPM RCM  80000001 INTL 20061109) (XEN) System RAM: 6141MB
> (6288940kB) (XEN) No NUMA configuration found (XEN) Faking a node at
> 0000000000000000-00000001a0000000 (XEN) Domain heap initialised
> (XEN) DMI 2.5 present.
> (XEN) Using APIC driver default
> (XEN) ACPI: PM-Timer IO Port: 0x808
> (XEN) ACPI: SLEEP INFO: pm1x_cnt[1:804,1:0], pm1x_evt[1:800,1:0]
> (XEN) ACPI:             wakeup_vec[dfdf9c0c], vec_size[20]
> (XEN) ACPI: Local APIC address 0xfee00000
> (XEN) ACPI: LAPIC (acpi_id[0x01] lapic_id[0x00] enabled)
> (XEN) Processor #0 7:10 APIC version 21
> (XEN) ACPI: LAPIC (acpi_id[0x02] lapic_id[0x02] enabled)
> (XEN) Processor #2 7:10 APIC version 21
> (XEN) ACPI: LAPIC (acpi_id[0x03] lapic_id[0x04] enabled)
> (XEN) Processor #4 7:10 APIC version 21
> (XEN) ACPI: LAPIC (acpi_id[0x04] lapic_id[0x06] enabled)
> (XEN) Processor #6 7:10 APIC version 21
> (XEN) ACPI: LAPIC (acpi_id[0x05] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x06] lapic_id[0x01] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x07] lapic_id[0x02] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x08] lapic_id[0x03] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x09] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0a] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0b] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0c] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0d] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0e] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x0f] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x10] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x11] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x12] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x13] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x14] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x15] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x16] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x17] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x18] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x19] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1a] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1b] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1c] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1d] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1e] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x1f] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC (acpi_id[0x20] lapic_id[0x00] disabled)
> (XEN) ACPI: LAPIC_NMI (acpi_id[0xff] high level lint[0x1])
> (XEN) ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
> (XEN) IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23
> (XEN) ACPI: IOAPIC (id[0x09] address[0xfec80000] gsi_base[24])
> (XEN) IOAPIC[1]: apic_id 9, version 32, address 0xfec80000, GSI 24-47
> (XEN) ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
> (XEN) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
> (XEN) ACPI: IRQ0 used by override.
> (XEN) ACPI: IRQ2 used by override.
> (XEN) ACPI: IRQ9 used by override.
> (XEN) Enabling APIC mode:  Flat.  Using 2 I/O APICs
> (XEN) ACPI: HPET id: 0x8086a301 base: 0xfed00000
> (XEN) ERST table was not found
> (XEN) Using ACPI (MADT) for SMP configuration information
> (XEN) SMP: Allowing 32 CPUs (28 hotplug CPUs)
> (XEN) IRQ limits: 48 GSI, 736 MSI/MSI-X
> (XEN) Using scheduler: SMP Credit Scheduler (credit)
> (XEN) Detected 3066.852 MHz processor.
> (XEN) Initing memory sharing.
> (XEN) mce_intel.c:717: MCA Capability: BCAST 1 SER 0 CMCI 1 firstbank
> 0 extended MCE MSR 0 (XEN) Intel machine check reporting enabled
> (XEN) PCI: MCFG configuration 0: base f8000000 segment 0000 buses 00
> - 3f (XEN) PCI: MCFG area at f8000000 reserved in E820
> (XEN) PCI: Using MCFG for segment 0000 bus 00-3f
> (XEN) Intel VT-d iommu 0 supported page sizes: 4kB.
> (XEN) Intel VT-d Snoop Control enabled.
> (XEN) Intel VT-d Dom0 DMA Passthrough not enabled.
> (XEN) Intel VT-d Queued Invalidation enabled.
> (XEN) Intel VT-d Interrupt Remapping enabled.
> (XEN) Intel VT-d Shared EPT tables not enabled.
> (XEN) I/O virtualisation enabled
> (XEN)  - Dom0 mode: Relaxed
> (XEN) Interrupt remapping enabled
> (XEN) ENABLING IO-APIC IRQs
> (XEN)  -> Using old ACK method
> (XEN) ..TIMER: vector=0xF0 apic1=0 pin1=2 apic2=-1 pin2=-1
> (XEN) Platform timer is 14.318MHz HPET
> (XEN) Allocated console ring of 32 KiB.
> (XEN) mwait-idle: MWAIT substates: 0x1120
> (XEN) mwait-idle: v0.4 model 0x1a
> (XEN) mwait-idle: lapic_timer_reliable_states 0x2
> (XEN) HPET: 0 timers usable for broadcast (4 total)
> (XEN) VMX: Supported advanced features:
> (XEN)  - APIC MMIO access virtualisation
> (XEN)  - APIC TPR shadow
> (XEN)  - Extended Page Tables (EPT)
> (XEN)  - Virtual-Processor Identifiers (VPID)
> (XEN)  - Virtual NMI
> (XEN)  - MSR direct-access bitmap
> (XEN) HVM: ASIDs enabled.
> (XEN) HVM: VMX enabled
> (XEN) HVM: Hardware Assisted Paging (HAP) detected
> (XEN) HVM: HAP page sizes: 4kB, 2MB
> (XEN) Brought up 4 CPUs
> (XEN) ACPI sleep modes: S3
> (XEN) mcheck_poll: Machine check polling timer started.
> (XEN) *** LOADING DOMAIN 0 ***
> (XEN) elf_parse_binary: phdr: paddr=0xffffffff80200000 memsz=0xf56738
> (XEN) elf_parse_binary: phdr: paddr=0xffffffff81356738 memsz=0x58a598
> (XEN) elf_parse_binary: memory: 0xffffffff80200000 ->
> 0xffffffff818e0cd0 (XEN) elf_xen_parse_note: GUEST_OS = "FreeBSD"
> (XEN) elf_xen_parse_note: GUEST_VERSION = "0x10c8f0"
> (XEN) elf_xen_parse_note: XEN_VERSION = "xen-3.0"
> (XEN) elf_xen_parse_note: VIRT_BASE = 0xffffffff80000000
> (XEN) elf_xen_parse_note: PADDR_OFFSET = 0xffffffff80000000
> (XEN) elf_xen_parse_note: ENTRY = 0xffffffff80cae000
> (XEN) elf_xen_parse_note: HYPERCALL_PAGE = 0xffffffff80cad000
> (XEN) elf_xen_parse_note: HV_START_LOW = 0xffff800000000000
> (XEN) elf_xen_parse_note: FEATURES =
> "writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel|hvm_callback_vector" (XEN)
> elf_xen_parse_note: PAE_MODE = "yes" (XEN) elf_xen_parse_note:
> unknown xen elf note (0xd) (XEN) elf_xen_parse_note: LOADER =
> "generic" (XEN) elf_xen_parse_note: SUSPEND_CANCEL = 0x0
> (XEN) elf_xen_parse_note: BSD_SYMTAB = "yes"
> (XEN) elf_xen_parse: using notes from SHT_NOTE section
> (XEN) elf_xen_addr_calc_check: addresses:
> (XEN)     virt_base        = 0xffffffff80000000
> (XEN)     elf_paddr_offset = 0xffffffff80000000
> (XEN)     virt_offset      = 0x0
> (XEN)     virt_kstart      = 0xffffffff80200000
> (XEN)     virt_kend        = 0xffffffff81bda5b8
> (XEN)     virt_entry       = 0xffffffff80cae000
> (XEN)     p2m_base         = 0xffffffffffffffff
> (XEN)  Xen  kernel: 64-bit, lsb, compat32
> (XEN)  Dom0 kernel: 64-bit, PAE, lsb, paddr 0xffffffff80200000 ->
> 0xffffffff818e0cd0 (XEN)  Dom0 symbol map 0xffffffff818e0cd0 ->
> 0xffffffff81bda5b8 (XEN) PHYSICAL MEMORY ARRANGEMENT:
> (XEN)  Dom0 alloc.:   0000000198000000->000000019a000000 (253952
> pages to be allocated) (XEN) VIRTUAL MEMORY ARRANGEMENT:
> (XEN)  Loaded kernel: ffffffff80200000->ffffffff81bda5b8
> (XEN)  Init. ramdisk: ffffffff81bdb000->ffffffff81bdb000
> (XEN)  Phys-Mach map: ffffffff81bdb000->ffffffff81ddb000
> (XEN)  Start info:    ffffffff81ddb000->ffffffff81ddc4b4
> (XEN)  Page tables:   ffffffff81ddd000->ffffffff81df0000
> (XEN)  Boot stack:    ffffffff81df0000->ffffffff81df1000
> (XEN)  TOTAL:         ffffffff80000000->ffffffff82000000
> (XEN)  ENTRY ADDRESS: ffffffff80cae000
> (XEN) Dom0 has maximum 2 VCPUs
> (XEN) elf_load_binary: phdr 2 at 0xffffffff80200000 ->
> 0xffffffff81156738 (XEN) elf_load_binary: phdr 3 at
> 0xffffffff81356738 -> 0xffffffff81474c30 (XEN) elf_load_bsdsyms: shdr
> 4 at 0xffff83019eca7ab0 -> 0xffffffff818e1818 (XEN) elf_load_bsdsyms:
> shdr 41 at 0xffff83019fd58c51 -> 0xffffffff819340a8 (XEN)
> elf_load_bsdsyms: shdr 42 at 0xffff83019fd59990 -> 0xffffffff819342e8
> (XEN) elf_load_bsdsyms: shdr 43 at 0xffff83019fe9f7d8 ->
> 0xffffffff81a7a130 (XEN) Scrubbing Free
> RAM: ..................................................done. (XEN)
> Initial low memory virq threshold set at 0x4000 pages. (XEN) Std.
> Loglevel: All (XEN) Guest Loglevel: All (XEN)
> ********************************************** (XEN) ******* WARNING:
> CONSOLE OUTPUT IS SYNCHRONOUS (XEN) ******* This option is intended
> to aid debugging of Xen by ensuring (XEN) ******* that all output is
> synchronously delivered on the serial line. (XEN) ******* However it
> can introduce SIGNIFICANT latencies and affect (XEN) *******
> timekeeping. It is NOT recommended for production use! (XEN)
> ********************************************** (XEN) 3... 2... 1...
> (XEN) *** Serial input -> DOM0 (type 'CTRL-a' three times to switch
> input to Xen) (XEN) Freed 276kB init memory.
> FreeBSD PVH running on xen-3.0-x86_64p
> GDB: no debug ports present
> KDB: debugger backends: ddb
> KDB: current backend: ddb
> SMAP type=01 base=0000000000000000 len=0000000000092400
> SMAP type=02 base=00000000000f0000 len=0000000000010000
> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
> SMAP type=02 base=00000000f8000000 len=0000000005000000
> SMAP type=02 base=00000000fe000000 len=0000000000d00400
> SMAP type=02 base=00000000fee00000 len=0000000000100000
> SMAP type=02 base=00000000ffb00000 len=0000000000500000
> SMAP type=02 base=0000000100000000 len=00000000a0000000
> (XEN) ----[ Xen-4.5-unstable  x86_64  debug=y  Tainted:    C ]----
> (XEN) CPU:    0
> (XEN) RIP:    e008:[<ffff82d0801c282f>] vioapic_range+0xf/0x2e
> (XEN) RFLAGS: 0000000000010297   CONTEXT: hypervisor
> (XEN) rax: 0000000000000000   rbx: ffff82d0802d7560   rcx:
> 0000000040000000 (XEN) rdx: 0000000000000000   rsi:
> 0000000040000000   rdi: ffff8300dfb1a000 (XEN) rbp:
> ffff82d0802d74a8   rsp: ffff82d0802d74a8   r8:  0000000000000001
> (XEN) r9:  0000000000000000   r10: 0000000080000011   r11:
> ffffffff81475871 (XEN) r12: ffff8300dfb1a000   r13:
> ffff82d080279e30   r14: ffff82d08027b1a0 (XEN) r15:
> ffff82d080279e48   cr0: 0000000080050033   cr4: 00000000000026f0
> (XEN) cr3: 000000019ec87000   cr2: 0000000000000000 (XEN) ds: 0000
> es: 0000   fs: 0000   gs: 0000   ss: 0000   cs: e008 (XEN) Xen stack
> trace from rsp=ffff82d0802d74a8: (XEN)    ffff82d0802d7518
> ffff82d0801bc190 ffff82d000000000 ffff82d0801e6cb4 (XEN)
> ffff83019a581000 0000000200000000 ffff82d0802d7c04 00000007dfb1a000
> (XEN)    ffff82d0802d7538 0000000000000000 ffff82e003300000
> 0000000000000020 (XEN)    ffff8300dfb1a000 ffff82d0802d7b00
> ffff82d0802d75b8 ffff82d0801af408 (XEN)    ffff8201802d0000
> ffff82d0802d7608 0000000100010004 0000000040000000 (XEN)
> ffff82d0802d7560 0000000400000001 0000000000000000 0000000040000000
> (XEN)    0000000000000000 0000000400000001 0120000000000000
> 0000000000000000 (XEN)    0000000000000004 0000000000000004
> 0000000000000004 0000000000000000 (XEN)    ffff82d0802d7608
> ffff82d0802d7bb0 ffff82d0802d7648 ffff82d0801b0a40 (XEN)
> ffff82d000000000 ffff82d0802d79a0 ffff82d0802d7658 ffff82d0802d7600
> (XEN)    ffff830000000001 ffff82d0802d79a0 0000000000000000
> 0000000040000000 (XEN)    0000000000000001 ffffffff82000000
> 0000000000000000 ffff82d0802d7bb0 (XEN)    0000000000000000
> 0000000000000001 0000000000000008 ffff82d08027b0c0 (XEN)
> ffff82d0802d7658 ffff82d0801b0b5b ffff82d0802d7668 ffff82d08018e1e3
> (XEN)    ffff82d0802d7ac8 ffff82d0801900b3 ffff82d0802d7698
> ffff82d08012975c (XEN)    0000000000000007 ffff82d0802d7728
> 0000000139011830 ffffffff82000000 (XEN)    ffff82d0802d778b
> 0000000000000009 000000000019a500 0000000800000000 (XEN)
> ffff82d0802d0000 ffff82d0801e9084 00ff8301000000dc 0000000000000004
> (XEN)    0000000000000004 0000000000000000 ffff82d0802d7778
> ffff82d0801e9921 (XEN)    0000000000000000 ffff82d0802d7804
> ffff82d0802d78d4 0000000000000000 (XEN) Xen call trace: (XEN)
> [<ffff82d0801c282f>] vioapic_range+0xf/0x2e (XEN)
> [<ffff82d0801bc190>] hvm_mmio_intercept+0x40/0x320 (XEN)
> [<ffff82d0801af408>] hvmemul_do_io+0x493/0x6b5 (XEN)
> [<ffff82d0801b0a40>] __hvmemul_read+0x24b/0x2ca (XEN)
> [<ffff82d0801b0b5b>] hvmemul_read+0x12/0x14 (XEN)
> [<ffff82d08018e1e3>] read_ulong+0xe/0x10 (XEN)
> [<ffff82d0801900b3>] x86_emulate+0x1554/0xf4b1 (XEN)
> [<ffff82d0801afc7a>] hvm_emulate_one+0x160/0x28b (XEN)
> [<ffff82d0801bcc12>] handle_mmio+0x43/0x1cf (XEN)
> [<ffff82d0801bb0dc>] hvm_hap_nested_page_fault+0x257/0x489 (XEN)
> [<ffff82d0801da7ed>] vmx_vmexit_handler+0x1602/0x19a7 (XEN)
> [<ffff82d0801e0cd1>] vmx_asm_vmexit_handler+0x41/0xc0 (XEN) (XEN)
> Pagetable walk from 0000000000000000: (XEN)  L4[0x000] =
> 0000000000000000 ffffffffffffffff (XEN) (XEN)
> **************************************** (XEN) Panic on CPU 0: (XEN)
> FATAL PAGE FAULT (XEN) [error_code=0000] (XEN) Faulting linear
> address: 0000000000000000 (XEN)
> **************************************** (XEN) (XEN) Reboot in five
> seconds...


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-04-30 18:12   ` Mukesh Rathor
@ 2014-05-01  1:19     ` Mukesh Rathor
  2014-05-02 11:05       ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-01  1:19 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On Wed, 30 Apr 2014 11:12:16 -0700
Mukesh Rathor <mukesh.rathor@oracle.com> wrote:

> On Wed, 30 Apr 2014 16:11:39 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> > On 30/04/14 03:06, Mukesh Rathor wrote:
> .....
> 
> > Hello Mukesh,
> > 
> > Thanks for the new version, unfortunately when trying to boot
> > FreeBSD Dom0 with this version I get the following hypervisor crash
> > (it works fine with previous versions):
> 
> Aha, Jan, there's the vioapic crash!! Roger, see:
> 
> http://www.gossamer-threads.com/lists/xen/devel/325784
> 
> I had seen this few weeks ago, but could not reproduce last week 
> despite several attempts. You are seeing this in V10 because I dropped
> the vioapic patch from V9 (included below).
> 
> BTW, since I'm not able to reproduce this, can you kindly check
> where the ept violation is coming from? Is that on an io space?
> Also, our binaries don't match, so can you please confirm it's the 
> call from:
> 
> hvm_hap_nested_page_fault():
>     if ( (p2mt == p2m_mmio_dm) ||
>          (access_w && (p2mt == p2m_ram_ro)) )
>     {
>         put_gfn(p2m->domain, gfn);
>         if ( !handle_mmio() )   <==========
>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> In which case, what's the p2mt?
> 

Hey Roger,

I tried few things, but still could not reproduce. I saw it few weeks
ago, and I think I misread the code thinking hvm_hap_nested_page_fault
was calling handle_mmio unconditionally, and quickly came up with
the vioapic patch for v9. 

So, can you please try with the vioapic patch. Then two things will
happen:

  1. The ept violation is genuine, in which case it will return back
     successfully to ept_handle_violation which will print the gfn/mfn
     info for further debug.
  2. the emulation will be handled, in which case we need to know what
     was it, mmio_dm or ram_ro, and where it came from in dom0? Both are
     unexpected.

thanks for the help,
mukesh


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign
  2014-04-30  1:06 ` [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign Mukesh Rathor
@ 2014-05-01 16:14   ` Tim Deegan
  0 siblings, 0 replies; 52+ messages in thread
From: Tim Deegan @ 2014-05-01 16:14 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, JBeulich

At 18:06 -0700 on 29 Apr (1398791206), Mukesh Rathor wrote:
> In this patch, we add some checks and restrictions in the relevant
> p2m paths for p2m_is_foreign.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>

Acked-by: Tim Deegan <tim@xen.org>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-04-30  1:06 ` [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages Mukesh Rathor
@ 2014-05-01 16:19   ` Tim Deegan
  2014-05-02  1:45     ` Mukesh Rathor
  0 siblings, 1 reply; 52+ messages in thread
From: Tim Deegan @ 2014-05-01 16:19 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

Hi,

Nearly there, except that the teardown code is now gone, with nothing
replacing it.  Ideally we'd have some logic to find active EPT L1
tables and undo the foreign refcounts.  If not that, then:
 - a hard-coded check, in p2m_add_foreign I suppose, that only allows
   foreign mappings to be added to a p2m for the hardware domain,
   with a pvh fixme comment explaining why; and
 - another pvh fixme comment in the p2m teardown code to say that
   once non-hardware domains can have foreign mappings there will
   have to be some cleanup code to handle them.

Also, in the inner refcounting logic:

At 18:06 -0700 on 29 Apr (1398791207), Mukesh Rathor wrote:
> diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> index c0bfc50..11474e8 100644
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -36,8 +36,6 @@
>  
>  #define atomic_read_ept_entry(__pepte)                              \
>      ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
> -#define atomic_write_ept_entry(__pepte, __epte)                     \
> -    write_atomic(&(__pepte)->epte, (__epte).epte)
>  
>  #define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
>  #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
> @@ -46,6 +44,46 @@ static inline bool_t is_epte_valid(ept_entry_t *e)
>      return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
>  }
>  
> +/* returns : 0 for success, -errno otherwise */
> +static int atomic_write_ept_entry(ept_entry_t *entryptr, ept_entry_t new,
> +                                  int level)
> +{
> +    bool_t same_mfn = (new.mfn == entryptr->mfn);
> +    unsigned long oldmfn = INVALID_MFN;
> +
> +    if ( level )
> +    {

ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?

> +        write_atomic(&entryptr->epte, new.epte);
> +        return 0;
> +    }
> +
> +    if ( unlikely(p2m_is_foreign(new.sa_p2mt)) && !same_mfn )
> +    {
> +        struct domain *fdom;
> +
> +        if ( !mfn_valid(new.mfn) )
> +            return -EINVAL;
> +
> +        fdom = page_get_owner(mfn_to_page(new.mfn));

This needs a matching put_pg_owner() once you're done with it.

Cheers,

Tim.

> +        if ( fdom == NULL )
> +            return -ESRCH;
> +
> +        /* get refcount on the page */
> +        if ( !get_page(mfn_to_page(new.mfn), fdom) )
> +            return -EBUSY;
> +    }
> +
> +    if ( unlikely(p2m_is_foreign(entryptr->sa_p2mt)) && !same_mfn )
> +        oldmfn = entryptr->mfn;
> +
> +    write_atomic(&entryptr->epte, new.epte);
> +
> +    if ( unlikely(oldmfn != INVALID_MFN) )
> +        put_page(mfn_to_page(oldmfn));
> +
> +    return 0;
> +}
> +

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-01 16:19   ` Tim Deegan
@ 2014-05-02  1:45     ` Mukesh Rathor
  2014-05-02  8:38       ` Jan Beulich
  2014-05-02  8:55       ` Tim Deegan
  0 siblings, 2 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-02  1:45 UTC (permalink / raw)
  To: Tim Deegan
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

On Thu, 1 May 2014 18:19:08 +0200
Tim Deegan <tim@xen.org> wrote:

> Hi,
> 
> Nearly there, except that the teardown code is now gone, with nothing
> replacing it.  Ideally we'd have some logic to find active EPT L1
> tables and undo the foreign refcounts.  If not that, then:
>  - a hard-coded check, in p2m_add_foreign I suppose, that only allows
>    foreign mappings to be added to a p2m for the hardware domain,
>    with a pvh fixme comment explaining why; and
>  - another pvh fixme comment in the p2m teardown code to say that
>    once non-hardware domains can have foreign mappings there will
>    have to be some cleanup code to handle them.

Added fixmes... 

> Also, in the inner refcounting logic:
> 
> At 18:06 -0700 on 29 Apr (1398791207), Mukesh Rathor wrote:
> > diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> > index c0bfc50..11474e8 100644
> > --- a/xen/arch/x86/mm/p2m-ept.c
> > +++ b/xen/arch/x86/mm/p2m-ept.c
> > @@ -36,8 +36,6 @@
> >  
> >  #define
> > atomic_read_ept_entry(__pepte)                              \
> > ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
> > -#define atomic_write_ept_entry(__pepte,
> > __epte)                     \
> > -    write_atomic(&(__pepte)->epte, (__epte).epte)
> >  
> >  #define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
> >  #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
> > @@ -46,6 +44,46 @@ static inline bool_t is_epte_valid(ept_entry_t
> > *e) return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
> >  }
> >  
> > +/* returns : 0 for success, -errno otherwise */
> > +static int atomic_write_ept_entry(ept_entry_t *entryptr,
> > ept_entry_t new,
> > +                                  int level)
> > +{
> > +    bool_t same_mfn = (new.mfn == entryptr->mfn);
> > +    unsigned long oldmfn = INVALID_MFN;
> > +
> > +    if ( level )
> > +    {
> 
> ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?

Yeah, I debated adding p2m_is_foreign ASSERT, but didn't because iirc 
Jan had said he wanted to use up the non-leaf bits for something else 
in future.  I suppose I can add it, it would be  easy
to find it anyways....  BTW, there is no path allowing 
superpage for foreign at present.

> > +        write_atomic(&entryptr->epte, new.epte);
> > +        return 0;
> > +    }
> > +
> > +    if ( unlikely(p2m_is_foreign(new.sa_p2mt)) && !same_mfn )
> > +    {
> > +        struct domain *fdom;
> > +
> > +        if ( !mfn_valid(new.mfn) )
> > +            return -EINVAL;
> > +
> > +        fdom = page_get_owner(mfn_to_page(new.mfn));
> 
> This needs a matching put_pg_owner() once you're done with it.

Doing page_get_owner, not get_pg_owner here, so don't think so, right?

thanks,
Mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-02  1:45     ` Mukesh Rathor
@ 2014-05-02  8:38       ` Jan Beulich
  2014-05-02  8:55       ` Tim Deegan
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-02  8:38 UTC (permalink / raw)
  To: Mukesh Rathor, Tim Deegan
  Cc: George.Dunlap, xen-devel, keir.xen, eddie.dong, jun.nakajima

>>> On 02.05.14 at 03:45, <mukesh.rathor@oracle.com> wrote:
> On Thu, 1 May 2014 18:19:08 +0200 Tim Deegan <tim@xen.org> wrote:
>> > @@ -46,6 +44,46 @@ static inline bool_t is_epte_valid(ept_entry_t
>> > *e) return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
>> >  }
>> >  
>> > +/* returns : 0 for success, -errno otherwise */
>> > +static int atomic_write_ept_entry(ept_entry_t *entryptr,
>> > ept_entry_t new,
>> > +                                  int level)
>> > +{
>> > +    bool_t same_mfn = (new.mfn == entryptr->mfn);
>> > +    unsigned long oldmfn = INVALID_MFN;
>> > +
>> > +    if ( level )
>> > +    {
>> 
>> ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?
> 
> Yeah, I debated adding p2m_is_foreign ASSERT, but didn't because iirc 
> Jan had said he wanted to use up the non-leaf bits for something else 
> in future.  I suppose I can add it, it would be  easy
> to find it anyways....  BTW, there is no path allowing 
> superpage for foreign at present.

But .sp set implies it being a leaf entry.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-02  1:45     ` Mukesh Rathor
  2014-05-02  8:38       ` Jan Beulich
@ 2014-05-02  8:55       ` Tim Deegan
  2014-05-02 23:35         ` Mukesh Rathor
  1 sibling, 1 reply; 52+ messages in thread
From: Tim Deegan @ 2014-05-02  8:55 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

Hi,

At 18:45 -0700 on 01 May (1398966318), Mukesh Rathor wrote:
> On Thu, 1 May 2014 18:19:08 +0200
> Tim Deegan <tim@xen.org> wrote:
> > Nearly there, except that the teardown code is now gone, with nothing
> > replacing it.  Ideally we'd have some logic to find active EPT L1
> > tables and undo the foreign refcounts.  If not that, then:
> >  - a hard-coded check, in p2m_add_foreign I suppose, that only allows
> >    foreign mappings to be added to a p2m for the hardware domain,
> >    with a pvh fixme comment explaining why; and
> >  - another pvh fixme comment in the p2m teardown code to say that
> >    once non-hardware domains can have foreign mappings there will
> >    have to be some cleanup code to handle them.
> 
> Added fixmes... 

Thanks.

> > Also, in the inner refcounting logic:
> > 
> > At 18:06 -0700 on 29 Apr (1398791207), Mukesh Rathor wrote:
> > > diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> > > index c0bfc50..11474e8 100644
> > > --- a/xen/arch/x86/mm/p2m-ept.c
> > > +++ b/xen/arch/x86/mm/p2m-ept.c
> > > @@ -36,8 +36,6 @@
> > >  
> > >  #define
> > > atomic_read_ept_entry(__pepte)                              \
> > > ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
> > > -#define atomic_write_ept_entry(__pepte,
> > > __epte)                     \
> > > -    write_atomic(&(__pepte)->epte, (__epte).epte)
> > >  
> > >  #define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
> > >  #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
> > > @@ -46,6 +44,46 @@ static inline bool_t is_epte_valid(ept_entry_t
> > > *e) return (e->epte != 0 && e->sa_p2mt != p2m_invalid);
> > >  }
> > >  
> > > +/* returns : 0 for success, -errno otherwise */
> > > +static int atomic_write_ept_entry(ept_entry_t *entryptr,
> > > ept_entry_t new,
> > > +                                  int level)
> > > +{
> > > +    bool_t same_mfn = (new.mfn == entryptr->mfn);
> > > +    unsigned long oldmfn = INVALID_MFN;
> > > +
> > > +    if ( level )
> > > +    {
> > 
> > ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?
> 
> Yeah, I debated adding p2m_is_foreign ASSERT, but didn't because iirc 
> Jan had said he wanted to use up the non-leaf bits for something else 
> in future. 

Well, if new.sp is set it's not non-leaf.  Though I suppose maybe the
test should be for _valid_ + superpage + foreign.

> I suppose I can add it, it would be  easy
> to find it anyways....  BTW, there is no path allowing 
> superpage for foreign at present.

Yes, otherwise the ASSERT would be bogus. :)  But it will be useful at
least as documentation if someone adds such a path.

> > > +        write_atomic(&entryptr->epte, new.epte);
> > > +        return 0;
> > > +    }
> > > +
> > > +    if ( unlikely(p2m_is_foreign(new.sa_p2mt)) && !same_mfn )
> > > +    {
> > > +        struct domain *fdom;
> > > +
> > > +        if ( !mfn_valid(new.mfn) )
> > > +            return -EINVAL;
> > > +
> > > +        fdom = page_get_owner(mfn_to_page(new.mfn));
> > 
> > This needs a matching put_pg_owner() once you're done with it.
> 
> Doing page_get_owner, not get_pg_owner here, so don't think so, right?

Oh yes -- sorry, I got confused between this and the case where you do
use get_pg_owner().  But actually on this subject ISTR objecting to
exporting get_pg_owner() from mm.c before.  Oh yes, here it is:

http://lists.xenproject.org/archives/html/xen-devel/2014-01/msg02612.html

So can you use rcu_lock_live_remote_domain_by_id() here instead?

Tim.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-01  1:19     ` Mukesh Rathor
@ 2014-05-02 11:05       ` Roger Pau Monné
  2014-05-02 12:31         ` Jan Beulich
  2014-05-03  0:01         ` Mukesh Rathor
  0 siblings, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-02 11:05 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On 01/05/14 03:19, Mukesh Rathor wrote:
> On Wed, 30 Apr 2014 11:12:16 -0700
> Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> 
>> On Wed, 30 Apr 2014 16:11:39 +0200
>> Roger Pau Monné <roger.pau@citrix.com> wrote:
>>
>>> On 30/04/14 03:06, Mukesh Rathor wrote:
>> .....
>>
>>> Hello Mukesh,
>>>
>>> Thanks for the new version, unfortunately when trying to boot
>>> FreeBSD Dom0 with this version I get the following hypervisor crash
>>> (it works fine with previous versions):
>>
>> Aha, Jan, there's the vioapic crash!! Roger, see:
>>
>> http://www.gossamer-threads.com/lists/xen/devel/325784
>>
>> I had seen this few weeks ago, but could not reproduce last week 
>> despite several attempts. You are seeing this in V10 because I dropped
>> the vioapic patch from V9 (included below).
>>
>> BTW, since I'm not able to reproduce this, can you kindly check
>> where the ept violation is coming from? Is that on an io space?
>> Also, our binaries don't match, so can you please confirm it's the 
>> call from:
>>
>> hvm_hap_nested_page_fault():
>>     if ( (p2mt == p2m_mmio_dm) ||
>>          (access_w && (p2mt == p2m_ram_ro)) )
>>     {
>>         put_gfn(p2m->domain, gfn);
>>         if ( !handle_mmio() )   <==========
>>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
>>
>> In which case, what's the p2mt?
>>
> 
> Hey Roger,
> 
> I tried few things, but still could not reproduce. I saw it few weeks
> ago, and I think I misread the code thinking hvm_hap_nested_page_fault
> was calling handle_mmio unconditionally, and quickly came up with
> the vioapic patch for v9. 
> 
> So, can you please try with the vioapic patch. Then two things will
> happen:
> 
>   1. The ept violation is genuine, in which case it will return back
>      successfully to ept_handle_violation which will print the gfn/mfn
>      info for further debug.
>   2. the emulation will be handled, in which case we need to know what
>      was it, mmio_dm or ram_ro, and where it came from in dom0? Both are
>      unexpected.

With the patch applied I can boot fine, no error messages at all. I've
printed the address that's causing the vioapic_range call, it's
0x1073741824, which according to the e820 map passed by Xen falls into a
region marked as valid memory:

SMAP type=01 base=0000000000100000 len=000000003ff6e000

The crash happens because FreeBSD scrubs all valid memory at early boot
when booted with hw.memtest.tests=1.

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 11:05       ` Roger Pau Monné
@ 2014-05-02 12:31         ` Jan Beulich
  2014-05-02 14:06           ` Roger Pau Monné
  2014-05-03  0:01         ` Mukesh Rathor
  1 sibling, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-02 12:31 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 02.05.14 at 13:05, <roger.pau@citrix.com> wrote:
> With the patch applied I can boot fine, no error messages at all. I've
> printed the address that's causing the vioapic_range call, it's
> 0x1073741824, which according to the e820 map passed by Xen falls into a
> region marked as valid memory:
> 
> SMAP type=01 base=0000000000100000 len=000000003ff6e000

If the number in the text above really was represented in hex, it very
clearly is outside the range you show. Assuming it was in fact decimal,
its hex representation being 0x40000000 also makes clear that this is
outside the shown range. But it's also rather strange an address for an
IO-APIC to live at. So in the end I'm only confused.

> The crash happens because FreeBSD scrubs all valid memory at early boot
> when booted with hw.memtest.tests=1.

If with "valid memory" you mean "valid RAM", then clearly the IO-APIC
page(s) shouldn't be touched by this.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 12:31         ` Jan Beulich
@ 2014-05-02 14:06           ` Roger Pau Monné
  2014-05-02 14:16             ` Jan Beulich
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-02 14:06 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 02/05/14 14:31, Jan Beulich wrote:
>>>> On 02.05.14 at 13:05, <roger.pau@citrix.com> wrote:
>> With the patch applied I can boot fine, no error messages at all. I've
>> printed the address that's causing the vioapic_range call, it's
>> 0x1073741824, which according to the e820 map passed by Xen falls into a
>> region marked as valid memory:
>>
>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> 
> If the number in the text above really was represented in hex, it very
> clearly is outside the range you show. Assuming it was in fact decimal,
> its hex representation being 0x40000000 also makes clear that this is
> outside the shown range. But it's also rather strange an address for an
> IO-APIC to live at. So in the end I'm only confused.

My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
following output is correct:

SMAP type=01 base=0000000000000000 len=0000000000092400
SMAP type=02 base=00000000000f0000 len=0000000000010000
SMAP type=01 base=0000000000100000 len=000000003ff6e000
SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
SMAP type=02 base=00000000f8000000 len=0000000005000000
SMAP type=02 base=00000000fe000000 len=0000000000d00400
SMAP type=02 base=00000000fee00000 len=0000000000100000
SMAP type=02 base=00000000ffb00000 len=0000000000500000
SMAP type=02 base=0000000100000000 len=00000000a0000000
(XEN) Trying to access 0x40000000 <- Printed from vioapic_range.

In this case 0x40000000 falls in range reported as usable RAM by Xen:

SMAP type=01 base=0000000000100000 len=000000003ff6e000

Which goes from

[0x100000, 0x4006e000]

Roger.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 14:06           ` Roger Pau Monné
@ 2014-05-02 14:16             ` Jan Beulich
  2014-05-02 14:35               ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-02 14:16 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 02.05.14 at 16:06, <roger.pau@citrix.com> wrote:
> My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
> following output is correct:
> 
> SMAP type=01 base=0000000000000000 len=0000000000092400
> SMAP type=02 base=00000000000f0000 len=0000000000010000
> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
> SMAP type=02 base=00000000f8000000 len=0000000005000000
> SMAP type=02 base=00000000fe000000 len=0000000000d00400
> SMAP type=02 base=00000000fee00000 len=0000000000100000
> SMAP type=02 base=00000000ffb00000 len=0000000000500000
> SMAP type=02 base=0000000100000000 len=00000000a0000000
> (XEN) Trying to access 0x40000000 <- Printed from vioapic_range.
> 
> In this case 0x40000000 falls in range reported as usable RAM by Xen:
> 
> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> 
> Which goes from
> 
> [0x100000, 0x4006e000]

Which is quite odd a range (I realize that I implied the original pair
to be a range, when it was a (start,length) tuple). I suppose the
above is being printed by your kernel - what does the hypervisor
print regarding the layout?

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 14:16             ` Jan Beulich
@ 2014-05-02 14:35               ` Roger Pau Monné
  2014-05-02 15:41                 ` Jan Beulich
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-02 14:35 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 02/05/14 16:16, Jan Beulich wrote:
>>>> On 02.05.14 at 16:06, <roger.pau@citrix.com> wrote:
>> My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
>> following output is correct:
>>
>> SMAP type=01 base=0000000000000000 len=0000000000092400
>> SMAP type=02 base=00000000000f0000 len=0000000000010000
>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
>> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
>> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
>> SMAP type=02 base=00000000f8000000 len=0000000005000000
>> SMAP type=02 base=00000000fe000000 len=0000000000d00400
>> SMAP type=02 base=00000000fee00000 len=0000000000100000
>> SMAP type=02 base=00000000ffb00000 len=0000000000500000
>> SMAP type=02 base=0000000100000000 len=00000000a0000000
>> (XEN) Trying to access 0x40000000 <- Printed from vioapic_range.
>>
>> In this case 0x40000000 falls in range reported as usable RAM by Xen:
>>
>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>>
>> Which goes from
>>
>> [0x100000, 0x4006e000]
> 
> Which is quite odd a range (I realize that I implied the original pair
> to be a range, when it was a (start,length) tuple). I suppose the
> above is being printed by your kernel - what does the hypervisor
> print regarding the layout?

Maybe the problem is on FreeBSD, and I'm not correctly clamping the e820
memory map returned by Xen. Right now I'm using start_info->nr_pages as
the number of valid RAM pages assigned to Dom0, but it is not clear if
start_info->nr_pages also takes into account the holes and invalid
regions in the e820 memory map.

This is the hw memory map reported by Xen:

(XEN) Xen-e820 RAM map:
(XEN)  0000000000000000 - 0000000000092400 (usable)
(XEN)  00000000000f0000 - 0000000000100000 (reserved)
(XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
(XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
(XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
(XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
(XEN)  00000000f8000000 - 00000000fd000000 (reserved)
(XEN)  00000000fe000000 - 00000000fed00400 (reserved)
(XEN)  00000000fee00000 - 00000000fef00000 (reserved)
(XEN)  00000000ffb00000 - 0000000100000000 (reserved)
(XEN)  0000000100000000 - 00000001a0000000 (usable)

And the Dom0 is assigned 1024M of RAM.

Roger.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 14:35               ` Roger Pau Monné
@ 2014-05-02 15:41                 ` Jan Beulich
  2014-05-02 16:13                   ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-02 15:41 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 02.05.14 at 16:35, <roger.pau@citrix.com> wrote:
> On 02/05/14 16:16, Jan Beulich wrote:
>>>>> On 02.05.14 at 16:06, <roger.pau@citrix.com> wrote:
>>> My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
>>> following output is correct:
>>>
>>> SMAP type=01 base=0000000000000000 len=0000000000092400
>>> SMAP type=02 base=00000000000f0000 len=0000000000010000
>>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>>> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
>>> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
>>> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
>>> SMAP type=02 base=00000000f8000000 len=0000000005000000
>>> SMAP type=02 base=00000000fe000000 len=0000000000d00400
>>> SMAP type=02 base=00000000fee00000 len=0000000000100000
>>> SMAP type=02 base=00000000ffb00000 len=0000000000500000
>>> SMAP type=02 base=0000000100000000 len=00000000a0000000

Considering the hypervisor view below, this range clearly is then
wrong here too, ...

> Maybe the problem is on FreeBSD, and I'm not correctly clamping the e820
> memory map returned by Xen. Right now I'm using start_info->nr_pages as
> the number of valid RAM pages assigned to Dom0, but it is not clear if
> start_info->nr_pages also takes into account the holes and invalid
> regions in the e820 memory map.

i.e. yes, there must be some kind of problem in your handling in any
case.

> This is the hw memory map reported by Xen:
> 
> (XEN) Xen-e820 RAM map:
> (XEN)  0000000000000000 - 0000000000092400 (usable)
> (XEN)  00000000000f0000 - 0000000000100000 (reserved)
> (XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
> (XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
> (XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
> (XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
> (XEN)  00000000f8000000 - 00000000fd000000 (reserved)
> (XEN)  00000000fe000000 - 00000000fed00400 (reserved)
> (XEN)  00000000fee00000 - 00000000fef00000 (reserved)
> (XEN)  00000000ffb00000 - 0000000100000000 (reserved)
> (XEN)  0000000100000000 - 00000001a0000000 (usable)
> 
> And the Dom0 is assigned 1024M of RAM.

I.e. it can have pages at or beyond 0x40000000 only if some other
region is unpopulated.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 15:41                 ` Jan Beulich
@ 2014-05-02 16:13                   ` Roger Pau Monné
  2014-05-02 19:35                     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-02 16:13 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 02/05/14 17:41, Jan Beulich wrote:
>>>> On 02.05.14 at 16:35, <roger.pau@citrix.com> wrote:
>> On 02/05/14 16:16, Jan Beulich wrote:
>>>>>> On 02.05.14 at 16:06, <roger.pau@citrix.com> wrote:
>>>> My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
>>>> following output is correct:
>>>>
>>>> SMAP type=01 base=0000000000000000 len=0000000000092400
>>>> SMAP type=02 base=00000000000f0000 len=0000000000010000
>>>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>>>> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
>>>> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
>>>> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
>>>> SMAP type=02 base=00000000f8000000 len=0000000005000000
>>>> SMAP type=02 base=00000000fe000000 len=0000000000d00400
>>>> SMAP type=02 base=00000000fee00000 len=0000000000100000
>>>> SMAP type=02 base=00000000ffb00000 len=0000000000500000
>>>> SMAP type=02 base=0000000100000000 len=00000000a0000000
> 
> Considering the hypervisor view below, this range clearly is then
> wrong here too, ...
> 
>> Maybe the problem is on FreeBSD, and I'm not correctly clamping the e820
>> memory map returned by Xen. Right now I'm using start_info->nr_pages as
>> the number of valid RAM pages assigned to Dom0, but it is not clear if
>> start_info->nr_pages also takes into account the holes and invalid
>> regions in the e820 memory map.
> 
> i.e. yes, there must be some kind of problem in your handling in any
> case.
> 
>> This is the hw memory map reported by Xen:
>>
>> (XEN) Xen-e820 RAM map:
>> (XEN)  0000000000000000 - 0000000000092400 (usable)
>> (XEN)  00000000000f0000 - 0000000000100000 (reserved)
>> (XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
>> (XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
>> (XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
>> (XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
>> (XEN)  00000000f8000000 - 00000000fd000000 (reserved)
>> (XEN)  00000000fe000000 - 00000000fed00400 (reserved)
>> (XEN)  00000000fee00000 - 00000000fef00000 (reserved)
>> (XEN)  00000000ffb00000 - 0000000100000000 (reserved)
>> (XEN)  0000000100000000 - 00000001a0000000 (usable)
>>
>> And the Dom0 is assigned 1024M of RAM.
> 
> I.e. it can have pages at or beyond 0x40000000 only if some other
> region is unpopulated.

If I got this right, it means that the maximum populated gpfn on the
domain is (start_info->nr_pages << PAGE_SHIFT), which is kind of odd,
because on PVH Dom0 all the holes in the memory map are already set to
p2m_mmio_direct (see pvh_map_all_iomem in patch 1), so I don't think
there's anything I can unpopulate, and it means that the dom0_mem param
passed in the command line is not properly handled, because the actual
usable RAM assigned to the Dom0 will vary depending on the underlying
hardware e820 map.

Roger.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 16:13                   ` Roger Pau Monné
@ 2014-05-02 19:35                     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 52+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-05-02 19:35 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, Jan Beulich

On Fri, May 02, 2014 at 06:13:51PM +0200, Roger Pau Monné wrote:
> On 02/05/14 17:41, Jan Beulich wrote:
> >>>> On 02.05.14 at 16:35, <roger.pau@citrix.com> wrote:
> >> On 02/05/14 16:16, Jan Beulich wrote:
> >>>>>> On 02.05.14 at 16:06, <roger.pau@citrix.com> wrote:
> >>>> My bad, I've incorrectly printed this as 0x%lu instead of %lx, the
> >>>> following output is correct:
> >>>>
> >>>> SMAP type=01 base=0000000000000000 len=0000000000092400
> >>>> SMAP type=02 base=00000000000f0000 len=0000000000010000
> >>>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> >>>> SMAP type=04 base=00000000dfdf9c00 len=0000000000052000
> >>>> SMAP type=03 base=00000000dfe4bc00 len=0000000000002000
> >>>> SMAP type=02 base=00000000dfe4dc00 len=00000000001b2400
> >>>> SMAP type=02 base=00000000f8000000 len=0000000005000000
> >>>> SMAP type=02 base=00000000fe000000 len=0000000000d00400
> >>>> SMAP type=02 base=00000000fee00000 len=0000000000100000
> >>>> SMAP type=02 base=00000000ffb00000 len=0000000000500000
> >>>> SMAP type=02 base=0000000100000000 len=00000000a0000000
> > 
> > Considering the hypervisor view below, this range clearly is then
> > wrong here too, ...
> > 
> >> Maybe the problem is on FreeBSD, and I'm not correctly clamping the e820
> >> memory map returned by Xen. Right now I'm using start_info->nr_pages as
> >> the number of valid RAM pages assigned to Dom0, but it is not clear if
> >> start_info->nr_pages also takes into account the holes and invalid
> >> regions in the e820 memory map.
> > 
> > i.e. yes, there must be some kind of problem in your handling in any
> > case.
> > 
> >> This is the hw memory map reported by Xen:
> >>
> >> (XEN) Xen-e820 RAM map:
> >> (XEN)  0000000000000000 - 0000000000092400 (usable)
> >> (XEN)  00000000000f0000 - 0000000000100000 (reserved)
> >> (XEN)  0000000000100000 - 00000000dfdf9c00 (usable)
> >> (XEN)  00000000dfdf9c00 - 00000000dfe4bc00 (ACPI NVS)
> >> (XEN)  00000000dfe4bc00 - 00000000dfe4dc00 (ACPI data)
> >> (XEN)  00000000dfe4dc00 - 00000000e0000000 (reserved)
> >> (XEN)  00000000f8000000 - 00000000fd000000 (reserved)
> >> (XEN)  00000000fe000000 - 00000000fed00400 (reserved)
> >> (XEN)  00000000fee00000 - 00000000fef00000 (reserved)
> >> (XEN)  00000000ffb00000 - 0000000100000000 (reserved)
> >> (XEN)  0000000100000000 - 00000001a0000000 (usable)
> >>
> >> And the Dom0 is assigned 1024M of RAM.
> > 
> > I.e. it can have pages at or beyond 0x40000000 only if some other
> > region is unpopulated.
> 
> If I got this right, it means that the maximum populated gpfn on the
> domain is (start_info->nr_pages << PAGE_SHIFT), which is kind of odd,

So it is -1ULL without any dom0_mem=max:X arguments but if you
use dom0_mem=max:X it has a sensible value?

If you use 'dom0_mem=max:1GB" the nr_pages should be 262144.

> because on PVH Dom0 all the holes in the memory map are already set to
> p2m_mmio_direct (see pvh_map_all_iomem in patch 1), so I don't think
> there's anything I can unpopulate, and it means that the dom0_mem param
> passed in the command line is not properly handled, because the actual
> usable RAM assigned to the Dom0 will vary depending on the underlying
> hardware e820 map.

Right. The nr_pages will only correspond to the E820_RAM regions.

> 
> Roger.
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-02  8:55       ` Tim Deegan
@ 2014-05-02 23:35         ` Mukesh Rathor
  2014-05-05  7:46           ` Jan Beulich
  2014-05-08 12:16           ` Tim Deegan
  0 siblings, 2 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-02 23:35 UTC (permalink / raw)
  To: Tim Deegan
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

On Fri, 2 May 2014 10:55:55 +0200
Tim Deegan <tim@xen.org> wrote:

> Hi,
> 
> At 18:45 -0700 on 01 May (1398966318), Mukesh Rathor wrote:
> > On Thu, 1 May 2014 18:19:08 +0200
> > Tim Deegan <tim@xen.org> wrote:
.....
> > > 
> > > At 18:06 -0700 on 29 Apr (1398791207), Mukesh Rathor wrote:
> > > > diff --git a/xen/arch/x86/mm/p2m-ept.c
> > > > b/xen/arch/x86/mm/p2m-ept.c index c0bfc50..11474e8 100644
> > > > --- a/xen/arch/x86/mm/p2m-ept.c
> > > > +++ b/xen/arch/x86/mm/p2m-ept.c
> > > > @@ -36,8 +36,6 @@
> > > >  
> > > >  #define
> > > > atomic_read_ept_entry(__pepte)                              \
> > > > ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
> > > > -#define atomic_write_ept_entry(__pepte,
> > > > __epte)                     \
> > > > -    write_atomic(&(__pepte)->epte, (__epte).epte)
> > > >  
> > > >  #define is_epte_present(ept_entry)      ((ept_entry)->epte &
> > > > 0x7) #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
> > > > @@ -46,6 +44,46 @@ static inline bool_t
> > > > is_epte_valid(ept_entry_t *e) return (e->epte != 0 &&
> > > > e->sa_p2mt != p2m_invalid); }
> > > >  
> > > > +/* returns : 0 for success, -errno otherwise */
> > > > +static int atomic_write_ept_entry(ept_entry_t *entryptr,
> > > > ept_entry_t new,
> > > > +                                  int level)
> > > > +{
> > > > +    bool_t same_mfn = (new.mfn == entryptr->mfn);
> > > > +    unsigned long oldmfn = INVALID_MFN;
> > > > +
> > > > +    if ( level )
> > > > +    {
> > > 
> > > ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?
> > 
> > Yeah, I debated adding p2m_is_foreign ASSERT, but didn't because
> > iirc Jan had said he wanted to use up the non-leaf bits for
> > something else in future. 
> 
> Well, if new.sp is set it's not non-leaf.  Though I suppose maybe the
> test should be for _valid_ + superpage + foreign.

Ok, so looks like:

    ASSERT(new.sa_p2mt == p2m_invalid || 
           !(new.sp && p2m_is_foreign(new.sa_p2mt)));

should do it?

...

> > > > +        write_atomic(&entryptr->epte, new.epte);
> > > > +        return 0;
> > > > +    }
> > > > +
> > > > +    if ( unlikely(p2m_is_foreign(new.sa_p2mt)) && !same_mfn )
> > > > +    {
> > > > +        struct domain *fdom;
> > > > +
> > > > +        if ( !mfn_valid(new.mfn) )
> > > > +            return -EINVAL;
> > > > +
> > > > +        fdom = page_get_owner(mfn_to_page(new.mfn));
> > > 
> > > This needs a matching put_pg_owner() once you're done with it.
> > 
> > Doing page_get_owner, not get_pg_owner here, so don't think so,
> > right?
> 
> Oh yes -- sorry, I got confused between this and the case where you do
> use get_pg_owner().  But actually on this subject ISTR objecting to
> exporting get_pg_owner() from mm.c before.  Oh yes, here it is:
> 
> http://lists.xenproject.org/archives/html/xen-devel/2014-01/msg02612.html
> 
> So can you use rcu_lock_live_remote_domain_by_id() here instead?


In p2m_add_foreign() we already get the rcu lock, still do it here? 

And, btw, that function, p2m_add_foreign(), went from 

+    if ( (fdid == DOMID_XEN && (fdom = rcu_lock_domain(dom_xen)) == NULL) ||
+         (fdid != DOMID_XEN && (fdom = rcu_lock_domain_by_id(fdid)) == NULL) )
+        goto out;

to 

+    fdom = get_pg_owner(foreigndom);

(causing get_pg_owner to be exported) at Jan's comment/suggestion in v8.

thanks,
mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-02 11:05       ` Roger Pau Monné
  2014-05-02 12:31         ` Jan Beulich
@ 2014-05-03  0:01         ` Mukesh Rathor
  2014-05-05  8:52           ` Roger Pau Monné
  1 sibling, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-03  0:01 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On Fri, 2 May 2014 13:05:23 +0200
Roger Pau Monné <roger.pau@citrix.com> wrote:

> On 01/05/14 03:19, Mukesh Rathor wrote:
> > On Wed, 30 Apr 2014 11:12:16 -0700
> > Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > 
> >> On Wed, 30 Apr 2014 16:11:39 +0200
> >> Roger Pau Monné <roger.pau@citrix.com> wrote:
> >>
> >>> On 30/04/14 03:06, Mukesh Rathor wrote:
> >> .....
> >>
> >>> Hello Mukesh,
> >>>
> >>> Thanks for the new version, unfortunately when trying to boot
> >>> FreeBSD Dom0 with this version I get the following hypervisor
> >>> crash (it works fine with previous versions):
> >>
> >> Aha, Jan, there's the vioapic crash!! Roger, see:
> >>
> >> http://www.gossamer-threads.com/lists/xen/devel/325784
> >>
> >> I had seen this few weeks ago, but could not reproduce last week 
> >> despite several attempts. You are seeing this in V10 because I
> >> dropped the vioapic patch from V9 (included below).
> >>
> >> BTW, since I'm not able to reproduce this, can you kindly check
> >> where the ept violation is coming from? Is that on an io space?
> >> Also, our binaries don't match, so can you please confirm it's the 
> >> call from:
> >>
> >> hvm_hap_nested_page_fault():
> >>     if ( (p2mt == p2m_mmio_dm) ||
> >>          (access_w && (p2mt == p2m_ram_ro)) )
> >>     {
> >>         put_gfn(p2m->domain, gfn);
> >>         if ( !handle_mmio() )   <==========
> >>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
> >>
> >> In which case, what's the p2mt?
> >>
> > 
> > Hey Roger,
> > 
> > I tried few things, but still could not reproduce. I saw it few
> > weeks ago, and I think I misread the code thinking
> > hvm_hap_nested_page_fault was calling handle_mmio unconditionally,
> > and quickly came up with the vioapic patch for v9. 
> > 
> > So, can you please try with the vioapic patch. Then two things will
> > happen:
> > 
> >   1. The ept violation is genuine, in which case it will return back
> >      successfully to ept_handle_violation which will print the
> > gfn/mfn info for further debug.
> >   2. the emulation will be handled, in which case we need to know
> > what was it, mmio_dm or ram_ro, and where it came from in dom0?
> > Both are unexpected.
> 
> With the patch applied I can boot fine, no error messages at all. I've
> printed the address that's causing the vioapic_range call, it's
> 0x1073741824, which according to the e820 map passed by Xen falls
> into a region marked as valid memory:
> 
> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> 
> The crash happens because FreeBSD scrubs all valid memory at early
> boot when booted with hw.memtest.tests=1.

Hi Roger,

I think something else is going on here. 
The vioapic address check is fenced by is_hvm check, 

    if ( !nestedhvm_vcpu_in_guestmode(v)
         && is_hvm_vcpu(v)    <====
         && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) )
    {

so the call should be coming from the place I mentioned above.
The p2mt combined with the pfn would hopefully tell whats going on.

Can you kindly remove the vioapic patch, and apply below patch and post
the output from both hvm_hap_nested_page_fault and ept_violation.

thanks
mukesh


index ac05160..dcffc6d 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1667,6 +1667,15 @@ int hvm_hap_nested_page_fault(paddr_t gpa,
          (access_w && (p2mt == p2m_ram_ro)) )
     {
         put_gfn(p2m->domain, gfn);
+
+        if ( is_pvh_vcpu(v) )
+        {
+            printk("hvm_hap_nested_page_fault: gfn:%lx gla:%lx p2mt:%d\n",
+                   gfn, gla, p2mt);
+            rc = 0;
+            goto out;
+        }
+
         if ( !handle_mmio() )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         rc = 1;


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-02 23:35         ` Mukesh Rathor
@ 2014-05-05  7:46           ` Jan Beulich
  2014-05-08 12:16           ` Tim Deegan
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-05  7:46 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: George.Dunlap, Tim Deegan, eddie.dong, keir.xen, jun.nakajima, xen-devel

>>> On 03.05.14 at 01:35, <mukesh.rathor@oracle.com> wrote:
> On Fri, 2 May 2014 10:55:55 +0200
> Tim Deegan <tim@xen.org> wrote:
>> At 18:45 -0700 on 01 May (1398966318), Mukesh Rathor wrote:
>> > On Thu, 1 May 2014 18:19:08 +0200
>> > Tim Deegan <tim@xen.org> wrote:
>> > > At 18:06 -0700 on 29 Apr (1398791207), Mukesh Rathor wrote:
>> > > > diff --git a/xen/arch/x86/mm/p2m-ept.c
>> > > > b/xen/arch/x86/mm/p2m-ept.c index c0bfc50..11474e8 100644
>> > > > --- a/xen/arch/x86/mm/p2m-ept.c
>> > > > +++ b/xen/arch/x86/mm/p2m-ept.c
>> > > > @@ -36,8 +36,6 @@
>> > > >  
>> > > >  #define
>> > > > atomic_read_ept_entry(__pepte)                              \
>> > > > ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
>> > > > -#define atomic_write_ept_entry(__pepte,
>> > > > __epte)                     \
>> > > > -    write_atomic(&(__pepte)->epte, (__epte).epte)
>> > > >  
>> > > >  #define is_epte_present(ept_entry)      ((ept_entry)->epte &
>> > > > 0x7) #define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
>> > > > @@ -46,6 +44,46 @@ static inline bool_t
>> > > > is_epte_valid(ept_entry_t *e) return (e->epte != 0 &&
>> > > > e->sa_p2mt != p2m_invalid); }
>> > > >  
>> > > > +/* returns : 0 for success, -errno otherwise */
>> > > > +static int atomic_write_ept_entry(ept_entry_t *entryptr,
>> > > > ept_entry_t new,
>> > > > +                                  int level)
>> > > > +{
>> > > > +    bool_t same_mfn = (new.mfn == entryptr->mfn);
>> > > > +    unsigned long oldmfn = INVALID_MFN;
>> > > > +
>> > > > +    if ( level )
>> > > > +    {
>> > > 
>> > > ASSERT(!(new.sp && p2m_is_foreign(new.sa_p2mt))) here?
>> > 
>> > Yeah, I debated adding p2m_is_foreign ASSERT, but didn't because
>> > iirc Jan had said he wanted to use up the non-leaf bits for
>> > something else in future. 
>> 
>> Well, if new.sp is set it's not non-leaf.  Though I suppose maybe the
>> test should be for _valid_ + superpage + foreign.
> 
> Ok, so looks like:
> 
>     ASSERT(new.sa_p2mt == p2m_invalid || 
>            !(new.sp && p2m_is_foreign(new.sa_p2mt)));
> 
> should do it?

Decoding this to

    ASSERT(new.sa_p2mt == p2m_invalid || !new.sp || !p2m_is_foreign(new.sa_p2mt));

makes already clear that the first check is redundant with the last one.
I.e. just

    ASSERT(!new.sp || !p2m_is_foreign(new.sa_p2mt));

would seem to suffice, but then again wouldn't cover intermediate
levels (which right not appears to not be a problem).

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-03  0:01         ` Mukesh Rathor
@ 2014-05-05  8:52           ` Roger Pau Monné
  2014-05-06  0:28             ` Mukesh Rathor
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-05  8:52 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On 03/05/14 02:01, Mukesh Rathor wrote:
> On Fri, 2 May 2014 13:05:23 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
>> On 01/05/14 03:19, Mukesh Rathor wrote:
>>> On Wed, 30 Apr 2014 11:12:16 -0700
>>> Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>>>
>>>> On Wed, 30 Apr 2014 16:11:39 +0200
>>>> Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>>
>>>>> On 30/04/14 03:06, Mukesh Rathor wrote:
>>>> .....
>>>>
>>>>> Hello Mukesh,
>>>>>
>>>>> Thanks for the new version, unfortunately when trying to boot
>>>>> FreeBSD Dom0 with this version I get the following hypervisor
>>>>> crash (it works fine with previous versions):
>>>>
>>>> Aha, Jan, there's the vioapic crash!! Roger, see:
>>>>
>>>> http://www.gossamer-threads.com/lists/xen/devel/325784
>>>>
>>>> I had seen this few weeks ago, but could not reproduce last week 
>>>> despite several attempts. You are seeing this in V10 because I
>>>> dropped the vioapic patch from V9 (included below).
>>>>
>>>> BTW, since I'm not able to reproduce this, can you kindly check
>>>> where the ept violation is coming from? Is that on an io space?
>>>> Also, our binaries don't match, so can you please confirm it's the 
>>>> call from:
>>>>
>>>> hvm_hap_nested_page_fault():
>>>>     if ( (p2mt == p2m_mmio_dm) ||
>>>>          (access_w && (p2mt == p2m_ram_ro)) )
>>>>     {
>>>>         put_gfn(p2m->domain, gfn);
>>>>         if ( !handle_mmio() )   <==========
>>>>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
>>>>
>>>> In which case, what's the p2mt?
>>>>
>>>
>>> Hey Roger,
>>>
>>> I tried few things, but still could not reproduce. I saw it few
>>> weeks ago, and I think I misread the code thinking
>>> hvm_hap_nested_page_fault was calling handle_mmio unconditionally,
>>> and quickly came up with the vioapic patch for v9. 
>>>
>>> So, can you please try with the vioapic patch. Then two things will
>>> happen:
>>>
>>>   1. The ept violation is genuine, in which case it will return back
>>>      successfully to ept_handle_violation which will print the
>>> gfn/mfn info for further debug.
>>>   2. the emulation will be handled, in which case we need to know
>>> what was it, mmio_dm or ram_ro, and where it came from in dom0?
>>> Both are unexpected.
>>
>> With the patch applied I can boot fine, no error messages at all. I've
>> printed the address that's causing the vioapic_range call, it's
>> 0x1073741824, which according to the e820 map passed by Xen falls
>> into a region marked as valid memory:
>>
>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>>
>> The crash happens because FreeBSD scrubs all valid memory at early
>> boot when booted with hw.memtest.tests=1.
> 
> Hi Roger,

Hello Mukesh, thanks for the help.

> I think something else is going on here. 
> The vioapic address check is fenced by is_hvm check, 
> 
>     if ( !nestedhvm_vcpu_in_guestmode(v)
>          && is_hvm_vcpu(v)    <====
>          && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) )
>     {

AFAIK this is not the path that's causing the fault, the fault comes from:

    if ( (p2mt == p2m_mmio_dm) ||
         (access_w && (p2mt == p2m_ram_ro)) )
    {
        put_gfn(p2m->domain, gfn);
        if ( !handle_mmio() ) <=====
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
        rc = 1;
        goto out;
    }

This was happening because I was trying to access a gpfn from outside of
the p2m map, which didn't have a valid mfn. The type of the page was
p2m_mmio_dm, the access p2m_access_n and the mfn was not valid (I've
done a p2m->get_entry on the faulting address).

This was because I was using start_info->nr_pages as the number of
usable RAM pages, but AFAICT from the code in domain_build.c,
pvh_map_all_iomem is making holes in the p2m, but it is not adding those
freed pages back to the end of the memory map, so the value in nr_pages
is not the number of usable RAM pages, but the number of pages in the
p2m map (taking into account both usable RAM pages and p2m_mmio_direct
regions).

I'm not sure if this logic is correct, shouldn't the freed pages by
pvh_map_all_iomem be added to the end of the memory map?

I will try with the attached patch, but this means I will have to revert
to my previous broken logic in FreeBSD.

Roger.


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-05  8:52           ` Roger Pau Monné
@ 2014-05-06  0:28             ` Mukesh Rathor
  2014-05-06  7:13               ` Roger Pau Monné
  2014-05-06 19:38               ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-06  0:28 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On Mon, 5 May 2014 10:52:54 +0200
Roger Pau Monné <roger.pau@citrix.com> wrote:

> On 03/05/14 02:01, Mukesh Rathor wrote:
> > On Fri, 2 May 2014 13:05:23 +0200
> > Roger Pau Monné <roger.pau@citrix.com> wrote:
> > 
> >> On 01/05/14 03:19, Mukesh Rathor wrote:
> >>> On Wed, 30 Apr 2014 11:12:16 -0700
> >>> Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> >>>
> >>>> On Wed, 30 Apr 2014 16:11:39 +0200
> >>>> Roger Pau Monné <roger.pau@citrix.com> wrote:
> >>>>
> >>>>> On 30/04/14 03:06, Mukesh Rathor wrote:
> >>>> .....
> >>>>
> >>>>> Hello Mukesh,
> >>>>>
.......
> >> With the patch applied I can boot fine, no error messages at all.
> >> I've printed the address that's causing the vioapic_range call,
> >> it's 0x1073741824, which according to the e820 map passed by Xen
> >> falls into a region marked as valid memory:
> >>
> >> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> >>
> >> The crash happens because FreeBSD scrubs all valid memory at early
> >> boot when booted with hw.memtest.tests=1.
> > 
> > Hi Roger,
> 
> Hello Mukesh, thanks for the help.
> 
> > I think something else is going on here. 
> > The vioapic address check is fenced by is_hvm check, 
> > 
> >     if ( !nestedhvm_vcpu_in_guestmode(v)
> >          && is_hvm_vcpu(v)    <====
> >          && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) )
> >     {
> 
> AFAIK this is not the path that's causing the fault, the fault comes
> from:
> 
>     if ( (p2mt == p2m_mmio_dm) ||
>          (access_w && (p2mt == p2m_ram_ro)) )
>     {
>         put_gfn(p2m->domain, gfn);
>         if ( !handle_mmio() ) <=====
>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
>         rc = 1;
>         goto out;
>     }
> 
> This was happening because I was trying to access a gpfn from outside
> of the p2m map, which didn't have a valid mfn. The type of the page
> was p2m_mmio_dm, the access p2m_access_n and the mfn was not valid
> (I've done a p2m->get_entry on the faulting address).

Ok, I know what's going on. By default, the p2m type returned is
p2m_mmio_dm. I'll resubmit my vioapic patch. But, your real issue
here is pages released from punched holes not added back. See below
for that. Once you fix that, you'll not see this, unless some other
kernel bug causing ept violation.

> This was because I was using start_info->nr_pages as the number of
> usable RAM pages, but AFAICT from the code in domain_build.c,
> pvh_map_all_iomem is making holes in the p2m, but it is not adding
> those freed pages back to the end of the memory map, so the value in
> nr_pages is not the number of usable RAM pages, but the number of
> pages in the p2m map (taking into account both usable RAM pages and
> p2m_mmio_direct regions).
> 
> I'm not sure if this logic is correct, shouldn't the freed pages by
> pvh_map_all_iomem be added to the end of the memory map?

Yeah, it's confusing a bit. Let me talk in terms of linux.

In case of PV dom0, linux parses the e820 and punches holes in
the p2m itself. See xen_set_identity_and_release(). For pvh, we
can skip some of that (since it already happened in xen), but we still 
use the "released" variable to keep track of that. Then later, 
xen_populate_chunk() adds those "released" pages back via 
XENMEM_populate_physmap. This happens for both PV and PVH, so the 
pages are added back. 

xen_memory_setup():
        /*
         * Populate back the non-RAM pages and E820 gaps that had been
         * released. */
        populated = xen_populate_chunk(map, memmap.nr_entries,
                        max_pfn, &last_pfn, xen_released_pages);


Perhaps your logic that does similar for PV needs to make sure it is 
populating the pages back for pvh also?  You just don't need to 
punch holes in the p2m as you do for PV, for PVH you can skip that 
part.  Hope that makes sense.

FWIW, my very first patch didn't do that in xen, but was done in linux
same as for PV. This required a new hypercall. But several maintainers 
felt we should map all iomem in xen upfront.

thanks
mukesh


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-06  0:28             ` Mukesh Rathor
@ 2014-05-06  7:13               ` Roger Pau Monné
  2014-05-06  8:09                 ` Jan Beulich
  2014-05-07  1:00                 ` Mukesh Rathor
  2014-05-06 19:38               ` Konrad Rzeszutek Wilk
  1 sibling, 2 replies; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-06  7:13 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On 06/05/14 02:28, Mukesh Rathor wrote:
> On Mon, 5 May 2014 10:52:54 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
>> On 03/05/14 02:01, Mukesh Rathor wrote:
>>> On Fri, 2 May 2014 13:05:23 +0200
>>> Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>
>>>> On 01/05/14 03:19, Mukesh Rathor wrote:
>>>>> On Wed, 30 Apr 2014 11:12:16 -0700
>>>>> Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>>>>>
>>>>>> On Wed, 30 Apr 2014 16:11:39 +0200
>>>>>> Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>>>>
>>>>>>> On 30/04/14 03:06, Mukesh Rathor wrote:
>>>>>> .....
>>>>>>
>>>>>>> Hello Mukesh,
>>>>>>>
> .......
>>>> With the patch applied I can boot fine, no error messages at all.
>>>> I've printed the address that's causing the vioapic_range call,
>>>> it's 0x1073741824, which according to the e820 map passed by Xen
>>>> falls into a region marked as valid memory:
>>>>
>>>> SMAP type=01 base=0000000000100000 len=000000003ff6e000
>>>>
>>>> The crash happens because FreeBSD scrubs all valid memory at early
>>>> boot when booted with hw.memtest.tests=1.
>>>
>>> Hi Roger,
>>
>> Hello Mukesh, thanks for the help.
>>
>>> I think something else is going on here. 
>>> The vioapic address check is fenced by is_hvm check, 
>>>
>>>     if ( !nestedhvm_vcpu_in_guestmode(v)
>>>          && is_hvm_vcpu(v)    <====
>>>          && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) )
>>>     {
>>
>> AFAIK this is not the path that's causing the fault, the fault comes
>> from:
>>
>>     if ( (p2mt == p2m_mmio_dm) ||
>>          (access_w && (p2mt == p2m_ram_ro)) )
>>     {
>>         put_gfn(p2m->domain, gfn);
>>         if ( !handle_mmio() ) <=====
>>             hvm_inject_hw_exception(TRAP_gp_fault, 0);
>>         rc = 1;
>>         goto out;
>>     }
>>
>> This was happening because I was trying to access a gpfn from outside
>> of the p2m map, which didn't have a valid mfn. The type of the page
>> was p2m_mmio_dm, the access p2m_access_n and the mfn was not valid
>> (I've done a p2m->get_entry on the faulting address).
> 
> Ok, I know what's going on. By default, the p2m type returned is
> p2m_mmio_dm. I'll resubmit my vioapic patch. But, your real issue
> here is pages released from punched holes not added back. See below
> for that. Once you fix that, you'll not see this, unless some other
> kernel bug causing ept violation.

Yes, that's the problem.

> 
>> This was because I was using start_info->nr_pages as the number of
>> usable RAM pages, but AFAICT from the code in domain_build.c,
>> pvh_map_all_iomem is making holes in the p2m, but it is not adding
>> those freed pages back to the end of the memory map, so the value in
>> nr_pages is not the number of usable RAM pages, but the number of
>> pages in the p2m map (taking into account both usable RAM pages and
>> p2m_mmio_direct regions).
>>
>> I'm not sure if this logic is correct, shouldn't the freed pages by
>> pvh_map_all_iomem be added to the end of the memory map?
> 
> Yeah, it's confusing a bit. Let me talk in terms of linux.
> 
> In case of PV dom0, linux parses the e820 and punches holes in
> the p2m itself. See xen_set_identity_and_release(). For pvh, we
> can skip some of that (since it already happened in xen), but we still 
> use the "released" variable to keep track of that. Then later, 
> xen_populate_chunk() adds those "released" pages back via 
> XENMEM_populate_physmap. This happens for both PV and PVH, so the 
> pages are added back. 
> 
> xen_memory_setup():
>         /*
>          * Populate back the non-RAM pages and E820 gaps that had been
>          * released. */
>         populated = xen_populate_chunk(map, memmap.nr_entries,
>                         max_pfn, &last_pfn, xen_released_pages);
> 
> 
> Perhaps your logic that does similar for PV needs to make sure it is 
> populating the pages back for pvh also?  You just don't need to 
> punch holes in the p2m as you do for PV, for PVH you can skip that 
> part.  Hope that makes sense.

Sure, that makes a lot of sense. What I was wondering is why don't we
move the logic of "xen_populate_chunk" into Xen code itself, like we do
for the holes part. If Xen is in charge of doing the holes in the memory
map, it should also take care of adding those pages back to the end of
the memory map, what we do right now is very awkward IMHO.

Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-06  7:13               ` Roger Pau Monné
@ 2014-05-06  8:09                 ` Jan Beulich
  2014-05-07  1:00                 ` Mukesh Rathor
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-06  8:09 UTC (permalink / raw)
  To: Roger Pau Monné, Mukesh Rathor
  Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 06.05.14 at 09:13, <roger.pau@citrix.com> wrote:
> Sure, that makes a lot of sense. What I was wondering is why don't we
> move the logic of "xen_populate_chunk" into Xen code itself, like we do
> for the holes part. If Xen is in charge of doing the holes in the memory
> map, it should also take care of adding those pages back to the end of
> the memory map, what we do right now is very awkward IMHO.

Indeed.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 1/4] pvh dom0: construct_dom0 changes
  2014-04-30  1:06 ` [V10 PATCH 1/4] pvh dom0: construct_dom0 changes Mukesh Rathor
@ 2014-05-06 15:18   ` Roger Pau Monné
  0 siblings, 0 replies; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-06 15:18 UTC (permalink / raw)
  To: Mukesh Rathor, xen-devel; +Cc: George.Dunlap, keir.xen, tim, JBeulich

On 30/04/14 03:06, Mukesh Rathor wrote:
> This patch changes construct_dom0() to boot in pvh mode:
>   - Make sure dom0 elf supports pvh mode.
>   - Call guest_physmap_add_page for pvh rather than simple p2m setting
>   - Map all non-RAM regions 1:1 upto the end region in e820 or 4GB which
>     ever is higher.
>   - Allocate p2m, copying calculation from toolstack.
>   - Allocate shared info page from the virtual space so that dom0 PT
>     can be updated. Then update p2m for it with the actual mfn.
>   - Since we build the page tables for pvh same as for pv, in
>     pvh_fixup_page_tables_for_hap we replace the mfns with pfns.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

I have the following patch on top of yours, in order to add the memory 
removed by the holes to the end of the memory map:

---
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 38ed9f6..6f5ba22 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -327,11 +327,13 @@ static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
  * pvh fixme: The following doesn't map MMIO ranges when they sit above the
  *            highest E820 covered address.
  */
-static __init void pvh_map_all_iomem(struct domain *d)
+static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
 {
     unsigned long start_pfn, end_pfn, end = 0, start = 0;
     const struct e820entry *entry;
-    unsigned int i, nump;
+    unsigned int i, j, nump, navail, nmap, nr_holes = 0;
+    struct page_info *page;
+    int rc;
 
     for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
     {
@@ -353,6 +355,9 @@ static __init void pvh_map_all_iomem(struct domain *d)
                 nump = end_pfn - start_pfn;
                 /* Add pages to the mapping */
                 pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
+                if (start_pfn <= nr_pages)
+                    nr_holes += (end_pfn < nr_pages) ?
+                                    nump : (nr_pages - start_pfn);
             }
             start = end;
         }
@@ -369,6 +374,42 @@ static __init void pvh_map_all_iomem(struct domain *d)
         nump = end_pfn - start_pfn;
         pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
     }
+
+    /*
+     * Add the memory removed by the holes at the end of the
+     * memory map.
+     */
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+            continue;
+
+        end_pfn = PFN_UP(entry->addr + entry->size);
+        if ( end_pfn <= nr_pages )
+            continue;
+
+        navail = end_pfn - nr_pages;
+        nmap = navail > nr_holes ? nr_holes : navail;
+        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
+                        nr_pages : PFN_DOWN(entry->addr);
+        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
+        if ( !page )
+            panic("Not enough RAM for domain 0");
+        for ( j = 0; j < nmap; j++ )
+        {
+            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);
+            if (rc != 0)
+                panic("Unable to add gpfn %#lx mfn %#lx to Dom0 physmap",
+                      start_pfn + j, page_to_mfn(page));
+            page++;
+
+        }
+        nr_holes -= nmap;
+        if (nr_holes == 0)
+            break;
+    }
+
+    ASSERT(nr_holes == 0);
 }
 
 static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
@@ -1391,7 +1432,7 @@ int __init construct_dom0(
         pfn = shared_info_paddr >> PAGE_SHIFT;
         dom0_update_physmap(d, pfn, mfn, 0);
 
-        pvh_map_all_iomem(d);
+        pvh_map_all_iomem(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-06  0:28             ` Mukesh Rathor
  2014-05-06  7:13               ` Roger Pau Monné
@ 2014-05-06 19:38               ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 52+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-05-06 19:38 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: George.Dunlap, tim, keir.xen, JBeulich, xen-devel, Roger Pau Monné

On Mon, May 05, 2014 at 05:28:58PM -0700, Mukesh Rathor wrote:
> On Mon, 5 May 2014 10:52:54 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> > On 03/05/14 02:01, Mukesh Rathor wrote:
> > > On Fri, 2 May 2014 13:05:23 +0200
> > > Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > 
> > >> On 01/05/14 03:19, Mukesh Rathor wrote:
> > >>> On Wed, 30 Apr 2014 11:12:16 -0700
> > >>> Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > >>>
> > >>>> On Wed, 30 Apr 2014 16:11:39 +0200
> > >>>> Roger Pau Monné <roger.pau@citrix.com> wrote:
> > >>>>
> > >>>>> On 30/04/14 03:06, Mukesh Rathor wrote:
> > >>>> .....
> > >>>>
> > >>>>> Hello Mukesh,
> > >>>>>
> .......
> > >> With the patch applied I can boot fine, no error messages at all.
> > >> I've printed the address that's causing the vioapic_range call,
> > >> it's 0x1073741824, which according to the e820 map passed by Xen
> > >> falls into a region marked as valid memory:
> > >>
> > >> SMAP type=01 base=0000000000100000 len=000000003ff6e000
> > >>
> > >> The crash happens because FreeBSD scrubs all valid memory at early
> > >> boot when booted with hw.memtest.tests=1.
> > > 
> > > Hi Roger,
> > 
> > Hello Mukesh, thanks for the help.
> > 
> > > I think something else is going on here. 
> > > The vioapic address check is fenced by is_hvm check, 
> > > 
> > >     if ( !nestedhvm_vcpu_in_guestmode(v)
> > >          && is_hvm_vcpu(v)    <====
> > >          && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) )
> > >     {
> > 
> > AFAIK this is not the path that's causing the fault, the fault comes
> > from:
> > 
> >     if ( (p2mt == p2m_mmio_dm) ||
> >          (access_w && (p2mt == p2m_ram_ro)) )
> >     {
> >         put_gfn(p2m->domain, gfn);
> >         if ( !handle_mmio() ) <=====
> >             hvm_inject_hw_exception(TRAP_gp_fault, 0);
> >         rc = 1;
> >         goto out;
> >     }
> > 
> > This was happening because I was trying to access a gpfn from outside
> > of the p2m map, which didn't have a valid mfn. The type of the page
> > was p2m_mmio_dm, the access p2m_access_n and the mfn was not valid
> > (I've done a p2m->get_entry on the faulting address).
> 
> Ok, I know what's going on. By default, the p2m type returned is
> p2m_mmio_dm. I'll resubmit my vioapic patch. But, your real issue
> here is pages released from punched holes not added back. See below
> for that. Once you fix that, you'll not see this, unless some other
> kernel bug causing ept violation.
> 
> > This was because I was using start_info->nr_pages as the number of
> > usable RAM pages, but AFAICT from the code in domain_build.c,
> > pvh_map_all_iomem is making holes in the p2m, but it is not adding
> > those freed pages back to the end of the memory map, so the value in
> > nr_pages is not the number of usable RAM pages, but the number of
> > pages in the p2m map (taking into account both usable RAM pages and
> > p2m_mmio_direct regions).
> > 
> > I'm not sure if this logic is correct, shouldn't the freed pages by
> > pvh_map_all_iomem be added to the end of the memory map?
> 
> Yeah, it's confusing a bit. Let me talk in terms of linux.
> 
> In case of PV dom0, linux parses the e820 and punches holes in
> the p2m itself. See xen_set_identity_and_release(). For pvh, we
> can skip some of that (since it already happened in xen), but we still 
> use the "released" variable to keep track of that. Then later, 
> xen_populate_chunk() adds those "released" pages back via 
> XENMEM_populate_physmap. This happens for both PV and PVH, so the 
> pages are added back. 

I wrote that and .. I am not too thrilled about it as I found out
that it can actually cause performance issues. See 
http://lists.xen.org/archives/html/xen-devel/2014-04/msg00001.html

A bit of brainstorming came up with an idea of actually
not doing this punching holes and adding them later but instead just
manipulating the P2M (inside Linux) and M2P (via MMU hypercalls).

I think hoisting creation of the E820 and the P2M (and M2P) to
be done all in the toolstack or hypervisor would be most
beneficial (I think David Vrabel mentioned this idea first so
the credit goes to him).

> 
> xen_memory_setup():
>         /*
>          * Populate back the non-RAM pages and E820 gaps that had been
>          * released. */
>         populated = xen_populate_chunk(map, memmap.nr_entries,
>                         max_pfn, &last_pfn, xen_released_pages);
> 
> 
> Perhaps your logic that does similar for PV needs to make sure it is 
> populating the pages back for pvh also?  You just don't need to 
> punch holes in the p2m as you do for PV, for PVH you can skip that 
> part.  Hope that makes sense.
> 
> FWIW, my very first patch didn't do that in xen, but was done in linux
> same as for PV. This required a new hypercall. But several maintainers 
> felt we should map all iomem in xen upfront.
> 
> thanks
> mukesh
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-06  7:13               ` Roger Pau Monné
  2014-05-06  8:09                 ` Jan Beulich
@ 2014-05-07  1:00                 ` Mukesh Rathor
  2014-05-07  7:50                   ` Jan Beulich
  2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
  1 sibling, 2 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-07  1:00 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On Tue, 6 May 2014 09:13:08 +0200
Roger Pau Monné <roger.pau@citrix.com> wrote:

> On 06/05/14 02:28, Mukesh Rathor wrote:
> > On Mon, 5 May 2014 10:52:54 +0200
> > Roger Pau Monné <roger.pau@citrix.com> wrote:
....
> >> This was because I was using start_info->nr_pages as the number of
> >> usable RAM pages, but AFAICT from the code in domain_build.c,
> >> pvh_map_all_iomem is making holes in the p2m, but it is not adding
> >> those freed pages back to the end of the memory map, so the value
> >> in nr_pages is not the number of usable RAM pages, but the number
> >> of pages in the p2m map (taking into account both usable RAM pages
> >> and p2m_mmio_direct regions).
> >>
> >> I'm not sure if this logic is correct, shouldn't the freed pages by
> >> pvh_map_all_iomem be added to the end of the memory map?
> > 
> > Yeah, it's confusing a bit. Let me talk in terms of linux.
> > 
> > In case of PV dom0, linux parses the e820 and punches holes in
> > the p2m itself. See xen_set_identity_and_release(). For pvh, we
> > can skip some of that (since it already happened in xen), but we
> > still use the "released" variable to keep track of that. Then
> > later, xen_populate_chunk() adds those "released" pages back via 
> > XENMEM_populate_physmap. This happens for both PV and PVH, so the 
> > pages are added back. 
> > 
> > xen_memory_setup():
> >         /*
> >          * Populate back the non-RAM pages and E820 gaps that had
> > been
> >          * released. */
> >         populated = xen_populate_chunk(map, memmap.nr_entries,
> >                         max_pfn, &last_pfn, xen_released_pages);
> > 
> > 
> > Perhaps your logic that does similar for PV needs to make sure it
> > is populating the pages back for pvh also?  You just don't need to 
> > punch holes in the p2m as you do for PV, for PVH you can skip that 
> > part.  Hope that makes sense.
> 
> Sure, that makes a lot of sense. What I was wondering is why don't we
> move the logic of "xen_populate_chunk" into Xen code itself, like we
> do for the holes part. If Xen is in charge of doing the holes in the
> memory map, it should also take care of adding those pages back to
> the end of the memory map, what we do right now is very awkward IMHO.

We don't because the guest needs to know the last pfn mapped. PV guest
starts with nr_pages as max pfn mapped and then walks the e820 punching
holes, keeping track of last_pfn etc... That code in linux is a bit
fragile, there is max_pfn, lastpfn, max_pages, xen_start_info->nr_pages, 
extra_pages, to name a few! 

IOW, even if we did this in xen, guest would still need to have the
exact same code just to adjust it's last_pfn. That would be dangerous
if they go out of sync.

Dont' you already have all that logic for PV dom0? You would just need
to skip the p2m update.

Jan, 

As Konrad points out, there are other issues with that part of the code.
So, IMO, we should wait and address that altogether for both PV and PVH,
and for now leave this as is.

If you still feel we should do this in xen (as in required for this
patch series to go in), then we'd need to communicate last pfn to guest.
However, note that guest would still have all that code for PV, for pvh
we'd just skip it. Obvious way coming to mind is:

unsigned long last_pfn_mapped; 

added to struct start_info.  Alternative would be to add a 
new sub call to perhaps getdomaininfo hcall or something similar. 
Please lmk. 

thanks,
mukesh


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  1:00                 ` Mukesh Rathor
@ 2014-05-07  7:50                   ` Jan Beulich
  2014-05-07  9:48                     ` Roger Pau Monné
                                       ` (2 more replies)
  2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
  1 sibling, 3 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-07  7:50 UTC (permalink / raw)
  To: roger.pau, Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
> As Konrad points out, there are other issues with that part of the code.
> So, IMO, we should wait and address that altogether for both PV and PVH,
> and for now leave this as is.

I'm fine either way, as long as you and Roger can reach agreement.
I still tend towards considering Roger's position the right one as a
mid/long term route.

> If you still feel we should do this in xen (as in required for this
> patch series to go in), then we'd need to communicate last pfn to guest.
> However, note that guest would still have all that code for PV, for pvh
> we'd just skip it. Obvious way coming to mind is:
> 
> unsigned long last_pfn_mapped; 
> 
> added to struct start_info.  Alternative would be to add a 
> new sub call to perhaps getdomaininfo hcall or something similar. 
> Please lmk. 

We already have the max_pfn field in the shared info, which for
PV is under the sole control of the guest kernel. That could
certainly be set to other than zero to communicate what the
highest mapped PFN is. The problem I'm seeing with this (and
with everything you suggest above) is that this doesn't tell the
guest where the holes are - while Dom0 can get at the machine
memory map to tell (or really: guess) where they are, DomU can't
yet will need to as soon as you want to be able to support device
pass-through. Hence I guess we will need XENMEM_memory_map to
reflect reality for PVH guests.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  7:50                   ` Jan Beulich
@ 2014-05-07  9:48                     ` Roger Pau Monné
  2014-05-07 11:34                       ` Jan Beulich
  2014-05-07 13:25                     ` Konrad Rzeszutek Wilk
  2014-05-08  0:04                     ` Mukesh Rathor
  2 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-07  9:48 UTC (permalink / raw)
  To: Jan Beulich, Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 07/05/14 09:50, Jan Beulich wrote:
>>>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
>> As Konrad points out, there are other issues with that part of the code.
>> So, IMO, we should wait and address that altogether for both PV and PVH,
>> and for now leave this as is.
> 
> I'm fine either way, as long as you and Roger can reach agreement.
> I still tend towards considering Roger's position the right one as a
> mid/long term route.
> 
>> If you still feel we should do this in xen (as in required for this
>> patch series to go in), then we'd need to communicate last pfn to guest.
>> However, note that guest would still have all that code for PV, for pvh
>> we'd just skip it. Obvious way coming to mind is:
>>
>> unsigned long last_pfn_mapped; 
>>
>> added to struct start_info.  Alternative would be to add a 
>> new sub call to perhaps getdomaininfo hcall or something similar. 
>> Please lmk. 
> 
> We already have the max_pfn field in the shared info, which for
> PV is under the sole control of the guest kernel. That could
> certainly be set to other than zero to communicate what the
> highest mapped PFN is. The problem I'm seeing with this (and
> with everything you suggest above) is that this doesn't tell the
> guest where the holes are - while Dom0 can get at the machine
> memory map to tell (or really: guess) where they are, DomU can't
> yet will need to as soon as you want to be able to support device
> pass-through. Hence I guess we will need XENMEM_memory_map to
> reflect reality for PVH guests.

I've expanded my patch that added the memory removed form the holes to 
the end of the memory map to also create an e820 map for Dom0 that 
reflects the reality of the underlying p2m. This way PVH guests (either 
DomU or Dom0) should only use XENMEM_memory_map to get the correct e820 
map (and no need for clamping in the Dom0 case).

---
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 38ed9f6..03e8008 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -327,11 +327,13 @@ static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
  * pvh fixme: The following doesn't map MMIO ranges when they sit above the
  *            highest E820 covered address.
  */
-static __init void pvh_map_all_iomem(struct domain *d)
+static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
 {
     unsigned long start_pfn, end_pfn, end = 0, start = 0;
     const struct e820entry *entry;
-    unsigned int i, nump;
+    unsigned int i, j, nump, navail, nmap, nr_holes = 0;
+    struct page_info *page;
+    int rc;
 
     for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
     {
@@ -353,6 +355,9 @@ static __init void pvh_map_all_iomem(struct domain *d)
                 nump = end_pfn - start_pfn;
                 /* Add pages to the mapping */
                 pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
+                if ( start_pfn <= nr_pages )
+                    nr_holes += (end_pfn < nr_pages) ?
+                                    nump : (nr_pages - start_pfn);
             }
             start = end;
         }
@@ -369,6 +374,89 @@ static __init void pvh_map_all_iomem(struct domain *d)
         nump = end_pfn - start_pfn;
         pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
     }
+
+    /*
+     * Add the memory removed by the holes at the end of the
+     * memory map.
+     */
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+            continue;
+
+        end_pfn = PFN_UP(entry->addr + entry->size);
+        if ( end_pfn <= nr_pages )
+            continue;
+
+        navail = end_pfn - nr_pages;
+        nmap = navail > nr_holes ? nr_holes : navail;
+        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
+                        nr_pages : PFN_DOWN(entry->addr);
+        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
+        if ( !page )
+            panic("Not enough RAM for domain 0");
+        for ( j = 0; j < nmap; j++ )
+        {
+            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);
+            if ( rc != 0 )
+                panic("Unable to add gpfn %#lx mfn %#lx to Dom0 physmap",
+                      start_pfn + j, page_to_mfn(page));
+            page++;
+
+        }
+        nr_holes -= nmap;
+        if ( nr_holes == 0 )
+            break;
+    }
+
+    ASSERT(nr_holes == 0);
+}
+
+static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+{
+    struct e820entry *entry;
+    unsigned int i;
+    unsigned long pages, cur_pages = 0;
+
+    /*
+     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
+     */
+    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
+    if ( !d->arch.e820 )
+        panic("Unable to allocate memory for Dom0 e820 map");
+
+    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
+    d->arch.nr_e820 = e820.nr_map;
+
+    /* Clamp e820 memory map to match the memory assigned to Dom0 */
+    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+            continue;
+
+        if ( nr_pages == cur_pages )
+        {
+            /*
+             * We already have all the assigned memory,
+             * mark this region as reserved.
+             */
+            entry->type = E820_RESERVED;
+            continue;
+        }
+
+        pages = entry->size >> PAGE_SHIFT;
+        if ( (cur_pages + pages) > nr_pages )
+        {
+            /* Truncate region */
+            entry->size = (nr_pages - cur_pages) << PAGE_SHIFT;
+            cur_pages = nr_pages;
+        }
+        else
+        {
+            cur_pages += pages;
+        }
+    }
+    ASSERT(cur_pages == nr_pages);
 }
 
 static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
@@ -1391,7 +1479,8 @@ int __init construct_dom0(
         pfn = shared_info_paddr >> PAGE_SHIFT;
         dom0_update_physmap(d, pfn, mfn, 0);
 
-        pvh_map_all_iomem(d);
+        pvh_map_all_iomem(d, nr_pages);
+        pvh_setup_e820(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  9:48                     ` Roger Pau Monné
@ 2014-05-07 11:34                       ` Jan Beulich
  2014-05-08 10:27                         ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-07 11:34 UTC (permalink / raw)
  To: Roger Pau Monné, Mukesh Rathor
  Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 07.05.14 at 11:48, <roger.pau@citrix.com> wrote:
> I've expanded my patch that added the memory removed form the holes to 
> the end of the memory map to also create an e820 map for Dom0 that 
> reflects the reality of the underlying p2m. This way PVH guests (either 
> DomU or Dom0) should only use XENMEM_memory_map to get the correct e820 
> map (and no need for clamping in the Dom0 case).

Looks generally good (except that it doesn't seem to deal with the
DomU case yet), but there are a couple of possible issues:

> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -327,11 +327,13 @@ static __init void pvh_add_mem_mapping(struct domain 
> *d, unsigned long gfn,
>   * pvh fixme: The following doesn't map MMIO ranges when they sit above the
>   *            highest E820 covered address.
>   */
> -static __init void pvh_map_all_iomem(struct domain *d)
> +static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
>  {
>      unsigned long start_pfn, end_pfn, end = 0, start = 0;
>      const struct e820entry *entry;
> -    unsigned int i, nump;
> +    unsigned int i, j, nump, navail, nmap, nr_holes = 0;

With nr_pages above being unsigned long, I think some of these
variables should be too.

> @@ -369,6 +374,89 @@ static __init void pvh_map_all_iomem(struct domain *d)
>          nump = end_pfn - start_pfn;
>          pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
>      }
> +
> +    /*
> +     * Add the memory removed by the holes at the end of the
> +     * memory map.
> +     */
> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
> +    {
> +        if ( entry->type != E820_RAM )
> +            continue;
> +
> +        end_pfn = PFN_UP(entry->addr + entry->size);
> +        if ( end_pfn <= nr_pages )
> +            continue;
> +
> +        navail = end_pfn - nr_pages;
> +        nmap = navail > nr_holes ? nr_holes : navail;
> +        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
> +                        nr_pages : PFN_DOWN(entry->addr);
> +        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
> +        if ( !page )
> +            panic("Not enough RAM for domain 0");
> +        for ( j = 0; j < nmap; j++ )
> +        {
> +            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);

I realize that this interface isn't the most flexible one in terms of what
page orders it allows to be passed in, but could you at least use order
9 here when the allocation order above is 9 or higher?

> +static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
> +{
> +    struct e820entry *entry;
> +    unsigned int i;
> +    unsigned long pages, cur_pages = 0;
> +
> +    /*
> +     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
> +     */
> +    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
> +    if ( !d->arch.e820 )
> +        panic("Unable to allocate memory for Dom0 e820 map");
> +
> +    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
> +    d->arch.nr_e820 = e820.nr_map;
> +
> +    /* Clamp e820 memory map to match the memory assigned to Dom0 */
> +    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
> +    {
> +        if ( entry->type != E820_RAM )
> +            continue;
> +
> +        if ( nr_pages == cur_pages )
> +        {
> +            /*
> +             * We already have all the assigned memory,
> +             * mark this region as reserved.
> +             */
> +            entry->type = E820_RESERVED;

That seems undesirable, as it will prevent Linux from using that space
for MMIO purposes. Plus keeping the region here is consistent with
simply shrinking the range below (yielding the remainder uncovered
by any E820 entry). I think you ought to zap the entry here.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  1:00                 ` Mukesh Rathor
  2014-05-07  7:50                   ` Jan Beulich
@ 2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
  2014-05-07 13:38                     ` Roger Pau Monné
  2014-05-08  0:07                     ` Mukesh Rathor
  1 sibling, 2 replies; 52+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-05-07 13:20 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: George.Dunlap, tim, keir.xen, JBeulich, xen-devel, Roger Pau Monné

On Tue, May 06, 2014 at 06:00:53PM -0700, Mukesh Rathor wrote:
> On Tue, 6 May 2014 09:13:08 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> > On 06/05/14 02:28, Mukesh Rathor wrote:
> > > On Mon, 5 May 2014 10:52:54 +0200
> > > Roger Pau Monné <roger.pau@citrix.com> wrote:
> ....
> > >> This was because I was using start_info->nr_pages as the number of
> > >> usable RAM pages, but AFAICT from the code in domain_build.c,
> > >> pvh_map_all_iomem is making holes in the p2m, but it is not adding
> > >> those freed pages back to the end of the memory map, so the value
> > >> in nr_pages is not the number of usable RAM pages, but the number
> > >> of pages in the p2m map (taking into account both usable RAM pages
> > >> and p2m_mmio_direct regions).
> > >>
> > >> I'm not sure if this logic is correct, shouldn't the freed pages by
> > >> pvh_map_all_iomem be added to the end of the memory map?
> > > 
> > > Yeah, it's confusing a bit. Let me talk in terms of linux.
> > > 
> > > In case of PV dom0, linux parses the e820 and punches holes in
> > > the p2m itself. See xen_set_identity_and_release(). For pvh, we
> > > can skip some of that (since it already happened in xen), but we
> > > still use the "released" variable to keep track of that. Then
> > > later, xen_populate_chunk() adds those "released" pages back via 
> > > XENMEM_populate_physmap. This happens for both PV and PVH, so the 
> > > pages are added back. 
> > > 
> > > xen_memory_setup():
> > >         /*
> > >          * Populate back the non-RAM pages and E820 gaps that had
> > > been
> > >          * released. */
> > >         populated = xen_populate_chunk(map, memmap.nr_entries,
> > >                         max_pfn, &last_pfn, xen_released_pages);
> > > 
> > > 
> > > Perhaps your logic that does similar for PV needs to make sure it
> > > is populating the pages back for pvh also?  You just don't need to 
> > > punch holes in the p2m as you do for PV, for PVH you can skip that 
> > > part.  Hope that makes sense.
> > 
> > Sure, that makes a lot of sense. What I was wondering is why don't we
> > move the logic of "xen_populate_chunk" into Xen code itself, like we
> > do for the holes part. If Xen is in charge of doing the holes in the
> > memory map, it should also take care of adding those pages back to
> > the end of the memory map, what we do right now is very awkward IMHO.
> 
> We don't because the guest needs to know the last pfn mapped. PV guest
> starts with nr_pages as max pfn mapped and then walks the e820 punching
> holes, keeping track of last_pfn etc... That code in linux is a bit
> fragile, there is max_pfn, lastpfn, max_pages, xen_start_info->nr_pages, 
> extra_pages, to name a few! 
> 
> IOW, even if we did this in xen, guest would still need to have the
> exact same code just to adjust it's last_pfn. That would be dangerous
> if they go out of sync.

The Linux code already has that.
> 
> Dont' you already have all that logic for PV dom0? You would just need
> to skip the p2m update.
> 
> Jan, 
> 
> As Konrad points out, there are other issues with that part of the code.
> So, IMO, we should wait and address that altogether for both PV and PVH,
> and for now leave this as is.
> 
> If you still feel we should do this in xen (as in required for this
> patch series to go in), then we'd need to communicate last pfn to guest.
> However, note that guest would still have all that code for PV, for pvh
> we'd just skip it. Obvious way coming to mind is:
> 
> unsigned long last_pfn_mapped; 
> 
> added to struct start_info.  Alternative would be to add a 
> new sub call to perhaps getdomaininfo hcall or something similar. 
> Please lmk. 

Uh, why? The Linux generic code figures out the last_pfn by
enumerating the E820. We only need 'max_pfn_mapped' in the P2M code
to figure the top of the P2M array provided by us - so that we
can use the extend_brk for anything above that- but that is
flexible enough to deal with holes in the P2M and such.
We also skip all of that when we run in the PVH mode.

For example, right now the Linux PV code can run with an E820 that
looks like swiss cheese - aka like the hosts' one. That is easily
seen if you boot an PV guest with PCI passthrough devices - as the
libxl 'e820_host' option gets turned on which creates an E820
that looks like the hosts. Granted it does not populate the P2M
as such (it is all linear). But the point is that the Linux
code is capable of dealing with this and bring the P2M to sync.

Hoisting this up in the hypervisor would be a plus - as the
Linux code wouldn't have to do this anymore.

> 
> thanks,
> mukesh
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  7:50                   ` Jan Beulich
  2014-05-07  9:48                     ` Roger Pau Monné
@ 2014-05-07 13:25                     ` Konrad Rzeszutek Wilk
  2014-05-08  0:04                     ` Mukesh Rathor
  2 siblings, 0 replies; 52+ messages in thread
From: Konrad Rzeszutek Wilk @ 2014-05-07 13:25 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, tim, keir.xen, xen-devel, roger.pau

On Wed, May 07, 2014 at 08:50:16AM +0100, Jan Beulich wrote:
> >>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
> > As Konrad points out, there are other issues with that part of the code.
> > So, IMO, we should wait and address that altogether for both PV and PVH,
> > and for now leave this as is.
> 
> I'm fine either way, as long as you and Roger can reach agreement.
> I still tend towards considering Roger's position the right one as a
> mid/long term route.
> 
> > If you still feel we should do this in xen (as in required for this
> > patch series to go in), then we'd need to communicate last pfn to guest.
> > However, note that guest would still have all that code for PV, for pvh
> > we'd just skip it. Obvious way coming to mind is:
> > 
> > unsigned long last_pfn_mapped; 
> > 
> > added to struct start_info.  Alternative would be to add a 
> > new sub call to perhaps getdomaininfo hcall or something similar. 
> > Please lmk. 
> 
> We already have the max_pfn field in the shared info, which for
> PV is under the sole control of the guest kernel. That could
> certainly be set to other than zero to communicate what the
> highest mapped PFN is. The problem I'm seeing with this (and
> with everything you suggest above) is that this doesn't tell the
> guest where the holes are - while Dom0 can get at the machine
> memory map to tell (or really: guess) where they are, DomU can't
> yet will need to as soon as you want to be able to support device
> pass-through. Hence I guess we will need XENMEM_memory_map to
> reflect reality for PVH guests.

<nods> Right. And we can make it easier for the PVH guest if the M2P
and the Xen's P2M match the E820. For dom0 - meshing Roger's patch
in Mukesh should do it.

For the toolstack, well, that is a bit - there is code to create
E820's, but the code to setup the pagetables and such would need
to be written.

Or we can let the Linux guest do it as it has done for PV guests.
(Parse the E820, do the hypercalls to punch holes and add pages
back in).  For that thought the hypercalls it uses aren't correct
(I think - they didn't work for me in Xen 4.4) and that would need
some changes.

<sigh> Either way code has to be written somewhere to deal with this
for domU guests.

> 
> Jan
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
@ 2014-05-07 13:38                     ` Roger Pau Monné
  2014-05-08  0:12                       ` Mukesh Rathor
  2014-05-08  0:07                     ` Mukesh Rathor
  1 sibling, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-07 13:38 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Mukesh Rathor
  Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On 07/05/14 15:20, Konrad Rzeszutek Wilk wrote:
> For example, right now the Linux PV code can run with an E820 that
> looks like swiss cheese - aka like the hosts' one. That is easily
> seen if you boot an PV guest with PCI passthrough devices - as the
> libxl 'e820_host' option gets turned on which creates an E820
> that looks like the hosts. Granted it does not populate the P2M
> as such (it is all linear). But the point is that the Linux
> code is capable of dealing with this and bring the P2M to sync.
> 
> Hoisting this up in the hypervisor would be a plus - as the
> Linux code wouldn't have to do this anymore.

IMHO doing it in the hypervisor is clearly the right solution, forcing
the guest OS to do all this on it's own just promotes code duplication
across the several OSes with PV(H) support.

Roger.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07  7:50                   ` Jan Beulich
  2014-05-07  9:48                     ` Roger Pau Monné
  2014-05-07 13:25                     ` Konrad Rzeszutek Wilk
@ 2014-05-08  0:04                     ` Mukesh Rathor
  2014-05-08  6:37                       ` Jan Beulich
  2 siblings, 1 reply; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08  0:04 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim, roger.pau

On Wed, 07 May 2014 08:50:16 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
> > As Konrad points out, there are other issues with that part of the
> > code. So, IMO, we should wait and address that altogether for both
> > PV and PVH, and for now leave this as is.
> 
> I'm fine either way, as long as you and Roger can reach agreement.
> I still tend towards considering Roger's position the right one as a
> mid/long term route.

Ok, will let Roger drive that on xen, and I'll look at linux side. But
since his patch is incremental on top of mine, I find request reasonable
that my next version V12 be checked in if no issues, or at least 
construct_dom0 patch which is sitting reviewed for over 6 months now. 

thanks,
Mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
  2014-05-07 13:38                     ` Roger Pau Monné
@ 2014-05-08  0:07                     ` Mukesh Rathor
  1 sibling, 0 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08  0:07 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: George.Dunlap, tim, keir.xen, JBeulich, xen-devel, Roger Pau Monné

On Wed, 7 May 2014 09:20:35 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Tue, May 06, 2014 at 06:00:53PM -0700, Mukesh Rathor wrote:
> > On Tue, 6 May 2014 09:13:08 +0200
> > Roger Pau Monné <roger.pau@citrix.com> wrote:
> > 
> > > On 06/05/14 02:28, Mukesh Rathor wrote:
> > > > On Mon, 5 May 2014 10:52:54 +0200
> > > > Roger Pau Monné <roger.pau@citrix.com> wrote:
> > ....
...
> > > Sure, that makes a lot of sense. What I was wondering is why
> > > don't we move the logic of "xen_populate_chunk" into Xen code
> > > itself, like we do for the holes part. If Xen is in charge of
> > > doing the holes in the memory map, it should also take care of
> > > adding those pages back to the end of the memory map, what we do
> > > right now is very awkward IMHO.
> > 
> > We don't because the guest needs to know the last pfn mapped. PV
> > guest starts with nr_pages as max pfn mapped and then walks the
> > e820 punching holes, keeping track of last_pfn etc... That code in
> > linux is a bit fragile, there is max_pfn, lastpfn, max_pages,
> > xen_start_info->nr_pages, extra_pages, to name a few! 
> > 
> > IOW, even if we did this in xen, guest would still need to have the
> > exact same code just to adjust it's last_pfn. That would be
> > dangerous if they go out of sync.
> 
> The Linux code already has that.
> > 
> > Dont' you already have all that logic for PV dom0? You would just
> > need to skip the p2m update.
> > 
> > Jan, 
> > 
> > As Konrad points out, there are other issues with that part of the
> > code. So, IMO, we should wait and address that altogether for both
> > PV and PVH, and for now leave this as is.
> > 
> > If you still feel we should do this in xen (as in required for this
> > patch series to go in), then we'd need to communicate last pfn to
> > guest. However, note that guest would still have all that code for
> > PV, for pvh we'd just skip it. Obvious way coming to mind is:
> > 
> > unsigned long last_pfn_mapped; 
> > 
> > added to struct start_info.  Alternative would be to add a 
> > new sub call to perhaps getdomaininfo hcall or something similar. 
> > Please lmk. 
> 
> Uh, why? The Linux generic code figures out the last_pfn by
> enumerating the E820. We only need 'max_pfn_mapped' in the P2M code
> to figure the top of the P2M array provided by us - so that we
> can use the extend_brk for anything above that- but that is
> flexible enough to deal with holes in the P2M and such.
> We also skip all of that when we run in the PVH mode.

Correct, if we do this in xen,  without Roger's second patch,
then enumerating e820 won't tell that. His second patch fixes that.

Mukesh

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07 13:38                     ` Roger Pau Monné
@ 2014-05-08  0:12                       ` Mukesh Rathor
  2014-05-08 10:52                         ` George Dunlap
  2014-05-08 13:15                         ` David Vrabel
  0 siblings, 2 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08  0:12 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, tim, keir.xen, JBeulich, xen-devel

On Wed, 7 May 2014 15:38:26 +0200
Roger Pau Monné <roger.pau@citrix.com> wrote:

> On 07/05/14 15:20, Konrad Rzeszutek Wilk wrote:
> > For example, right now the Linux PV code can run with an E820 that
> > looks like swiss cheese - aka like the hosts' one. That is easily
> > seen if you boot an PV guest with PCI passthrough devices - as the
> > libxl 'e820_host' option gets turned on which creates an E820
> > that looks like the hosts. Granted it does not populate the P2M
> > as such (it is all linear). But the point is that the Linux
> > code is capable of dealing with this and bring the P2M to sync.
> > 
> > Hoisting this up in the hypervisor would be a plus - as the
> > Linux code wouldn't have to do this anymore.
> 
> IMHO doing it in the hypervisor is clearly the right solution, forcing
> the guest OS to do all this on it's own just promotes code duplication
> across the several OSes with PV(H) support.

I am in agreement with you, but my point was code exists for PV already,
so if you address pvh only, then that code still has to exist in 
the guest until PV reaches EOL. 

FWIW, my agreement with previous maintainers was to keep PVH as close to 
PV as possible.

thanks,
mukesh


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08  0:04                     ` Mukesh Rathor
@ 2014-05-08  6:37                       ` Jan Beulich
  2014-05-08 19:15                         ` Mukesh Rathor
  0 siblings, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-08  6:37 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim, roger.pau

>>> On 08.05.14 at 02:04, <mukesh.rathor@oracle.com> wrote:
> On Wed, 07 May 2014 08:50:16 +0100
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>> >>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
>> > As Konrad points out, there are other issues with that part of the
>> > code. So, IMO, we should wait and address that altogether for both
>> > PV and PVH, and for now leave this as is.
>> 
>> I'm fine either way, as long as you and Roger can reach agreement.
>> I still tend towards considering Roger's position the right one as a
>> mid/long term route.
> 
> Ok, will let Roger drive that on xen, and I'll look at linux side. But
> since his patch is incremental on top of mine, I find request reasonable
> that my next version V12 be checked in if no issues, or at least 
> construct_dom0 patch which is sitting reviewed for over 6 months now. 

I would have long committed it if you had chased up Tim's ack on it.
I'm not going to do that for you, there's enough other patches that
need taking care of.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-07 11:34                       ` Jan Beulich
@ 2014-05-08 10:27                         ` Roger Pau Monné
  2014-05-08 10:44                           ` Jan Beulich
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-08 10:27 UTC (permalink / raw)
  To: Jan Beulich, Mukesh Rathor; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 07/05/14 13:34, Jan Beulich wrote:
>>>> On 07.05.14 at 11:48, <roger.pau@citrix.com> wrote:
>> I've expanded my patch that added the memory removed form the holes to 
>> the end of the memory map to also create an e820 map for Dom0 that 
>> reflects the reality of the underlying p2m. This way PVH guests (either 
>> DomU or Dom0) should only use XENMEM_memory_map to get the correct e820 
>> map (and no need for clamping in the Dom0 case).
> 
> Looks generally good (except that it doesn't seem to deal with the
> DomU case yet), but there are a couple of possible issues:
> 
>> --- a/xen/arch/x86/domain_build.c
>> +++ b/xen/arch/x86/domain_build.c
>> @@ -327,11 +327,13 @@ static __init void pvh_add_mem_mapping(struct domain 
>> *d, unsigned long gfn,
>>   * pvh fixme: The following doesn't map MMIO ranges when they sit above the
>>   *            highest E820 covered address.
>>   */
>> -static __init void pvh_map_all_iomem(struct domain *d)
>> +static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
>>  {
>>      unsigned long start_pfn, end_pfn, end = 0, start = 0;
>>      const struct e820entry *entry;
>> -    unsigned int i, nump;
>> +    unsigned int i, j, nump, navail, nmap, nr_holes = 0;
> 
> With nr_pages above being unsigned long, I think some of these
> variables should be too.
> 
>> @@ -369,6 +374,89 @@ static __init void pvh_map_all_iomem(struct domain *d)
>>          nump = end_pfn - start_pfn;
>>          pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
>>      }
>> +
>> +    /*
>> +     * Add the memory removed by the holes at the end of the
>> +     * memory map.
>> +     */
>> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
>> +    {
>> +        if ( entry->type != E820_RAM )
>> +            continue;
>> +
>> +        end_pfn = PFN_UP(entry->addr + entry->size);
>> +        if ( end_pfn <= nr_pages )
>> +            continue;
>> +
>> +        navail = end_pfn - nr_pages;
>> +        nmap = navail > nr_holes ? nr_holes : navail;
>> +        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
>> +                        nr_pages : PFN_DOWN(entry->addr);
>> +        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
>> +        if ( !page )
>> +            panic("Not enough RAM for domain 0");
>> +        for ( j = 0; j < nmap; j++ )
>> +        {
>> +            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);
> 
> I realize that this interface isn't the most flexible one in terms of what
> page orders it allows to be passed in, but could you at least use order
> 9 here when the allocation order above is 9 or higher?

I've changed it to be:

guest_physmap_add_page(d, start_pfn, page_to_mfn(page), order);

and removed the loop.

> 
>> +static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>> +{
>> +    struct e820entry *entry;
>> +    unsigned int i;
>> +    unsigned long pages, cur_pages = 0;
>> +
>> +    /*
>> +     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
>> +     */
>> +    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
>> +    if ( !d->arch.e820 )
>> +        panic("Unable to allocate memory for Dom0 e820 map");
>> +
>> +    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
>> +    d->arch.nr_e820 = e820.nr_map;
>> +
>> +    /* Clamp e820 memory map to match the memory assigned to Dom0 */
>> +    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
>> +    {
>> +        if ( entry->type != E820_RAM )
>> +            continue;
>> +
>> +        if ( nr_pages == cur_pages )
>> +        {
>> +            /*
>> +             * We already have all the assigned memory,
>> +             * mark this region as reserved.
>> +             */
>> +            entry->type = E820_RESERVED;
> 
> That seems undesirable, as it will prevent Linux from using that space
> for MMIO purposes. Plus keeping the region here is consistent with
> simply shrinking the range below (yielding the remainder uncovered
> by any E820 entry). I think you ought to zap the entry here.

I don't think this regions can be used for MMIO, the gpfns in the guest
p2m are not set as p2m_mmio_direct.

Here is the updated patch:
---
commit b84f2ae87de562a3df69599c4b3734262e106445
Author: Roger Pau Monne <roger.pau@citrix.com>
Date:   Thu May 8 09:21:49 2014 +0200

    xen: fix setup of PVH Dom0 memory map
    
    This patch adds the holes removed by MMIO regions to the end of the
    memory map for PVH Dom0, so the guest OS doesn't have to manually
    populate this memory.
    
    Also, provide a suitable e820 memory map for PVH Dom0, that matches
    the underlying p2m map. This means that PVH guests should always use
    XENMEM_memory_map in order to obtain the e820, even when running as
    Dom0.
    
    Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 38ed9f6..3cbd43c 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -327,11 +327,14 @@ static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
  * pvh fixme: The following doesn't map MMIO ranges when they sit above the
  *            highest E820 covered address.
  */
-static __init void pvh_map_all_iomem(struct domain *d)
+static __init void pvh_map_all_iomem(struct domain *d, unsigned long nr_pages)
 {
     unsigned long start_pfn, end_pfn, end = 0, start = 0;
     const struct e820entry *entry;
-    unsigned int i, nump;
+    unsigned long nump, nmap, navail, nr_holes = 0;
+    unsigned int i, order;
+    struct page_info *page;
+    int rc;
 
     for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
     {
@@ -353,6 +356,9 @@ static __init void pvh_map_all_iomem(struct domain *d)
                 nump = end_pfn - start_pfn;
                 /* Add pages to the mapping */
                 pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
+                if ( start_pfn <= nr_pages )
+                    nr_holes += (end_pfn < nr_pages) ?
+                                    nump : (nr_pages - start_pfn);
             }
             start = end;
         }
@@ -369,6 +375,84 @@ static __init void pvh_map_all_iomem(struct domain *d)
         nump = end_pfn - start_pfn;
         pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
     }
+
+    /*
+     * Add the memory removed by the holes at the end of the
+     * memory map.
+     */
+    for ( i = 0, entry = e820.map; i < e820.nr_map && nr_holes > 0;
+          i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+            continue;
+
+        end_pfn = PFN_UP(entry->addr + entry->size);
+        if ( end_pfn <= nr_pages )
+            continue;
+
+        navail = end_pfn - nr_pages;
+        nmap = navail > nr_holes ? nr_holes : navail;
+        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
+                        nr_pages : PFN_DOWN(entry->addr);
+        order = get_order_from_pages(nmap);
+        page = alloc_domheap_pages(d, order, 0);
+        if ( !page )
+            panic("Not enough RAM for domain 0");
+        rc = guest_physmap_add_page(d, start_pfn, page_to_mfn(page), order);
+        if ( rc != 0 )
+            panic("Unable to add gpfn %#lx mfn %#lx order: %u to Dom0 physmap",
+                  start_pfn, page_to_mfn(page), order);
+        nr_holes -= nmap;
+    }
+
+    ASSERT(nr_holes == 0);
+}
+
+static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
+{
+    struct e820entry *entry;
+    unsigned int i;
+    unsigned long pages, cur_pages = 0;
+
+    /*
+     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
+     */
+    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
+    if ( !d->arch.e820 )
+        panic("Unable to allocate memory for Dom0 e820 map");
+
+    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
+    d->arch.nr_e820 = e820.nr_map;
+
+    /* Clamp e820 memory map to match the memory assigned to Dom0 */
+    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
+    {
+        if ( entry->type != E820_RAM )
+            continue;
+
+        if ( nr_pages == cur_pages )
+        {
+            /*
+             * We already have all the assigned memory,
+             * set this region length to 0.
+             */
+            entry->size = 0;
+            continue;
+        }
+
+        pages = entry->size >> PAGE_SHIFT;
+        if ( (cur_pages + pages) > nr_pages )
+        {
+            /* Truncate region */
+            entry->size = (nr_pages - cur_pages) << PAGE_SHIFT;
+            cur_pages = nr_pages;
+        }
+        else
+        {
+            cur_pages += pages;
+        }
+    }
+    ASSERT(cur_pages == nr_pages);
 }
 
 static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
@@ -1391,7 +1475,8 @@ int __init construct_dom0(
         pfn = shared_info_paddr >> PAGE_SHIFT;
         dom0_update_physmap(d, pfn, mfn, 0);
 
-        pvh_map_all_iomem(d);
+        pvh_map_all_iomem(d, nr_pages);
+        pvh_setup_e820(d, nr_pages);
     }
 
     if ( d->domain_id == hardware_domid )

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08 10:27                         ` Roger Pau Monné
@ 2014-05-08 10:44                           ` Jan Beulich
  2014-05-08 15:00                             ` Roger Pau Monné
  0 siblings, 1 reply; 52+ messages in thread
From: Jan Beulich @ 2014-05-08 10:44 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 08.05.14 at 12:27, <roger.pau@citrix.com> wrote:
> On 07/05/14 13:34, Jan Beulich wrote:
>>>>> On 07.05.14 at 11:48, <roger.pau@citrix.com> wrote:
>>> @@ -369,6 +374,89 @@ static __init void pvh_map_all_iomem(struct domain *d)
>>>          nump = end_pfn - start_pfn;
>>>          pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
>>>      }
>>> +
>>> +    /*
>>> +     * Add the memory removed by the holes at the end of the
>>> +     * memory map.
>>> +     */
>>> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
>>> +    {
>>> +        if ( entry->type != E820_RAM )
>>> +            continue;
>>> +
>>> +        end_pfn = PFN_UP(entry->addr + entry->size);
>>> +        if ( end_pfn <= nr_pages )
>>> +            continue;
>>> +
>>> +        navail = end_pfn - nr_pages;
>>> +        nmap = navail > nr_holes ? nr_holes : navail;
>>> +        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
>>> +                        nr_pages : PFN_DOWN(entry->addr);
>>> +        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
>>> +        if ( !page )
>>> +            panic("Not enough RAM for domain 0");
>>> +        for ( j = 0; j < nmap; j++ )
>>> +        {
>>> +            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);
>> 
>> I realize that this interface isn't the most flexible one in terms of what
>> page orders it allows to be passed in, but could you at least use order
>> 9 here when the allocation order above is 9 or higher?
> 
> I've changed it to be:
> 
> guest_physmap_add_page(d, start_pfn, page_to_mfn(page), order);
> 
> and removed the loop.

Honestly I'd be very surprised if this worked - the P2M backend
functions generally accept only orders matching up with what the
hardware supports. I.e. I specifically didn't suggest to remove the
loop altogether.

>>> +static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>>> +{
>>> +    struct e820entry *entry;
>>> +    unsigned int i;
>>> +    unsigned long pages, cur_pages = 0;
>>> +
>>> +    /*
>>> +     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
>>> +     */
>>> +    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
>>> +    if ( !d->arch.e820 )
>>> +        panic("Unable to allocate memory for Dom0 e820 map");
>>> +
>>> +    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
>>> +    d->arch.nr_e820 = e820.nr_map;
>>> +
>>> +    /* Clamp e820 memory map to match the memory assigned to Dom0 */
>>> +    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
>>> +    {
>>> +        if ( entry->type != E820_RAM )
>>> +            continue;
>>> +
>>> +        if ( nr_pages == cur_pages )
>>> +        {
>>> +            /*
>>> +             * We already have all the assigned memory,
>>> +             * mark this region as reserved.
>>> +             */
>>> +            entry->type = E820_RESERVED;
>> 
>> That seems undesirable, as it will prevent Linux from using that space
>> for MMIO purposes. Plus keeping the region here is consistent with
>> simply shrinking the range below (yielding the remainder uncovered
>> by any E820 entry). I think you ought to zap the entry here.
> 
> I don't think this regions can be used for MMIO, the gpfns in the guest
> p2m are not set as p2m_mmio_direct.

Not at the point you build the domain, but such a use can't and
shouldn't be precluded ones the domain is up.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08  0:12                       ` Mukesh Rathor
@ 2014-05-08 10:52                         ` George Dunlap
  2014-05-08 13:15                         ` David Vrabel
  1 sibling, 0 replies; 52+ messages in thread
From: George Dunlap @ 2014-05-08 10:52 UTC (permalink / raw)
  To: Mukesh Rathor, Roger Pau Monné; +Cc: xen-devel, keir.xen, tim, JBeulich

On 05/08/2014 01:12 AM, Mukesh Rathor wrote:
> On Wed, 7 May 2014 15:38:26 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
>
>> On 07/05/14 15:20, Konrad Rzeszutek Wilk wrote:
>>> For example, right now the Linux PV code can run with an E820 that
>>> looks like swiss cheese - aka like the hosts' one. That is easily
>>> seen if you boot an PV guest with PCI passthrough devices - as the
>>> libxl 'e820_host' option gets turned on which creates an E820
>>> that looks like the hosts. Granted it does not populate the P2M
>>> as such (it is all linear). But the point is that the Linux
>>> code is capable of dealing with this and bring the P2M to sync.
>>>
>>> Hoisting this up in the hypervisor would be a plus - as the
>>> Linux code wouldn't have to do this anymore.
>> IMHO doing it in the hypervisor is clearly the right solution, forcing
>> the guest OS to do all this on it's own just promotes code duplication
>> across the several OSes with PV(H) support.
> I am in agreement with you, but my point was code exists for PV already,
> so if you address pvh only, then that code still has to exist in
> the guest until PV reaches EOL.
>
> FWIW, my agreement with previous maintainers was to keep PVH as close to
> PV as possible.

Wait, which maintainers are you talking about?

One of the big long-term wins of PVH is to get rid of a lot of the cruft 
in Linux resulting from the PV interface.  "As close as possible" should 
mean that if there are several equally supportable / low-cruft ways to 
do something, do it the way PV does.  But the "as possible" escape hatch 
means that when we can reduce cruft / code required in the Linux kernel, 
we should try.

  -George

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-02 23:35         ` Mukesh Rathor
  2014-05-05  7:46           ` Jan Beulich
@ 2014-05-08 12:16           ` Tim Deegan
  2014-05-08 13:25             ` Jan Beulich
  2014-05-08 22:58             ` Mukesh Rathor
  1 sibling, 2 replies; 52+ messages in thread
From: Tim Deegan @ 2014-05-08 12:16 UTC (permalink / raw)
  To: Mukesh Rathor
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

At 16:35 -0700 on 02 May (1399044956), Mukesh Rathor wrote:
> > Oh yes -- sorry, I got confused between this and the case where you do
> > use get_pg_owner().  But actually on this subject ISTR objecting to
> > exporting get_pg_owner() from mm.c before.  Oh yes, here it is:
> > 
> > http://lists.xenproject.org/archives/html/xen-devel/2014-01/msg02612.html
> > 
> > So can you use rcu_lock_live_remote_domain_by_id() here instead?
> 
> 
> In p2m_add_foreign() we already get the rcu lock, still do it here? 

I don't know what you mean by that.  p2m_add_foreign() currently
implicitly takes the rcu lock by calling get_pg_owner(); I was
suggecting that you use rcu_lock_live_remote_domain_by_id() instead.

> And, btw, that function, p2m_add_foreign(), went from 
> 
> +    if ( (fdid == DOMID_XEN && (fdom = rcu_lock_domain(dom_xen)) == NULL) ||
> +         (fdid != DOMID_XEN && (fdom = rcu_lock_domain_by_id(fdid)) == NULL) )
> +        goto out;
> 
> to 
> 
> +    fdom = get_pg_owner(foreigndom);
> 
> (causing get_pg_owner to be exported) at Jan's comment/suggestion in v8.

Unless Jan particularly wants to defend that, please don't export
get_pg_owner().  As I said in the link above it doesn't do what you want
anyway (e.g around DOMID_IO).

Tim.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08  0:12                       ` Mukesh Rathor
  2014-05-08 10:52                         ` George Dunlap
@ 2014-05-08 13:15                         ` David Vrabel
  2014-05-08 22:29                           ` Mukesh Rathor
  1 sibling, 1 reply; 52+ messages in thread
From: David Vrabel @ 2014-05-08 13:15 UTC (permalink / raw)
  To: Mukesh Rathor, Roger Pau Monné
  Cc: George.Dunlap, xen-devel, keir.xen, tim, JBeulich

On 08/05/14 01:12, Mukesh Rathor wrote:
> On Wed, 7 May 2014 15:38:26 +0200
> Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
>> On 07/05/14 15:20, Konrad Rzeszutek Wilk wrote:
>>> For example, right now the Linux PV code can run with an E820 that
>>> looks like swiss cheese - aka like the hosts' one. That is easily
>>> seen if you boot an PV guest with PCI passthrough devices - as the
>>> libxl 'e820_host' option gets turned on which creates an E820
>>> that looks like the hosts. Granted it does not populate the P2M
>>> as such (it is all linear). But the point is that the Linux
>>> code is capable of dealing with this and bring the P2M to sync.
>>>
>>> Hoisting this up in the hypervisor would be a plus - as the
>>> Linux code wouldn't have to do this anymore.
>>
>> IMHO doing it in the hypervisor is clearly the right solution, forcing
>> the guest OS to do all this on it's own just promotes code duplication
>> across the several OSes with PV(H) support.
> 
> I am in agreement with you, but my point was code exists for PV already,
> so if you address pvh only, then that code still has to exist in 
> the guest until PV reaches EOL. 
> 
> FWIW, my agreement with previous maintainers was to keep PVH as close to 
> PV as possible.

PVH support in Linux should be in order of preference:

1. Like native.
2. Like HVM guests.
3. Like PV guests.
4. PVH specific.

In this specific case of the memory map of a PVH guest at boot it should
match the p2m (like it would for a HVM guest).

For Linux, it would be acceptable for it to be like PV as an interim
solution, but for a new guest without PV support it really doesn't want
to have to reimplement the games that Linux PV plays (it's complex and
has been historically buggy and broken in various less common corner cases).

Fixing it in Xen would be simpler too, as it wouldn't have to do the
depopulate/repopulate dance.

David

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-08 12:16           ` Tim Deegan
@ 2014-05-08 13:25             ` Jan Beulich
  2014-05-08 22:58             ` Mukesh Rathor
  1 sibling, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-08 13:25 UTC (permalink / raw)
  To: Mukesh Rathor, Tim Deegan
  Cc: George.Dunlap, xen-devel, keir.xen, eddie.dong, jun.nakajima

>>> On 08.05.14 at 14:16, <tim@xen.org> wrote:
> At 16:35 -0700 on 02 May (1399044956), Mukesh Rathor wrote:
>> And, btw, that function, p2m_add_foreign(), went from 
>> 
>> +    if ( (fdid == DOMID_XEN && (fdom = rcu_lock_domain(dom_xen)) == NULL) ||
>> +         (fdid != DOMID_XEN && (fdom = rcu_lock_domain_by_id(fdid)) == NULL) 
> )
>> +        goto out;
>> 
>> to 
>> 
>> +    fdom = get_pg_owner(foreigndom);
>> 
>> (causing get_pg_owner to be exported) at Jan's comment/suggestion in v8.
> 
> Unless Jan particularly wants to defend that, please don't export
> get_pg_owner().  As I said in the link above it doesn't do what you want
> anyway (e.g around DOMID_IO).

The only reason I suggested using the function was because some other
helper function introduced by the patches mostly re-implemented that
existing one. If it doesn't do what is needed, there's obviously no point
in making it globally accessible.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08 10:44                           ` Jan Beulich
@ 2014-05-08 15:00                             ` Roger Pau Monné
  2014-05-08 15:20                               ` Jan Beulich
  0 siblings, 1 reply; 52+ messages in thread
From: Roger Pau Monné @ 2014-05-08 15:00 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim

On 08/05/14 12:44, Jan Beulich wrote:
>>>> On 08.05.14 at 12:27, <roger.pau@citrix.com> wrote:
>> On 07/05/14 13:34, Jan Beulich wrote:
>>>>>> On 07.05.14 at 11:48, <roger.pau@citrix.com> wrote:
>>>> @@ -369,6 +374,89 @@ static __init void pvh_map_all_iomem(struct domain *d)
>>>>          nump = end_pfn - start_pfn;
>>>>          pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
>>>>      }
>>>> +
>>>> +    /*
>>>> +     * Add the memory removed by the holes at the end of the
>>>> +     * memory map.
>>>> +     */
>>>> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
>>>> +    {
>>>> +        if ( entry->type != E820_RAM )
>>>> +            continue;
>>>> +
>>>> +        end_pfn = PFN_UP(entry->addr + entry->size);
>>>> +        if ( end_pfn <= nr_pages )
>>>> +            continue;
>>>> +
>>>> +        navail = end_pfn - nr_pages;
>>>> +        nmap = navail > nr_holes ? nr_holes : navail;
>>>> +        start_pfn = PFN_DOWN(entry->addr) < nr_pages ?
>>>> +                        nr_pages : PFN_DOWN(entry->addr);
>>>> +        page = alloc_domheap_pages(d, get_order_from_pages(nmap), 0);
>>>> +        if ( !page )
>>>> +            panic("Not enough RAM for domain 0");
>>>> +        for ( j = 0; j < nmap; j++ )
>>>> +        {
>>>> +            rc = guest_physmap_add_page(d, start_pfn + j, page_to_mfn(page), 0);
>>>
>>> I realize that this interface isn't the most flexible one in terms of what
>>> page orders it allows to be passed in, but could you at least use order
>>> 9 here when the allocation order above is 9 or higher?
>>
>> I've changed it to be:
>>
>> guest_physmap_add_page(d, start_pfn, page_to_mfn(page), order);
>>
>> and removed the loop.
> 
> Honestly I'd be very surprised if this worked - the P2M backend
> functions generally accept only orders matching up with what the
> hardware supports. I.e. I specifically didn't suggest to remove the
> loop altogether.

Sure, just checked now and EPT doesn't support orders > 9
(EPT_TABLE_ORDER). It was working in my case because the order of the
allocated pages is 7.

> 
>>>> +static __init void pvh_setup_e820(struct domain *d, unsigned long nr_pages)
>>>> +{
>>>> +    struct e820entry *entry;
>>>> +    unsigned int i;
>>>> +    unsigned long pages, cur_pages = 0;
>>>> +
>>>> +    /*
>>>> +     * Craft the e820 memory map for Dom0 based on the hardware e820 map.
>>>> +     */
>>>> +    d->arch.e820 = xzalloc_array(struct e820entry, e820.nr_map);
>>>> +    if ( !d->arch.e820 )
>>>> +        panic("Unable to allocate memory for Dom0 e820 map");
>>>> +
>>>> +    memcpy(d->arch.e820, e820.map, sizeof(struct e820entry) * e820.nr_map);
>>>> +    d->arch.nr_e820 = e820.nr_map;
>>>> +
>>>> +    /* Clamp e820 memory map to match the memory assigned to Dom0 */
>>>> +    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
>>>> +    {
>>>> +        if ( entry->type != E820_RAM )
>>>> +            continue;
>>>> +
>>>> +        if ( nr_pages == cur_pages )
>>>> +        {
>>>> +            /*
>>>> +             * We already have all the assigned memory,
>>>> +             * mark this region as reserved.
>>>> +             */
>>>> +            entry->type = E820_RESERVED;
>>>
>>> That seems undesirable, as it will prevent Linux from using that space
>>> for MMIO purposes. Plus keeping the region here is consistent with
>>> simply shrinking the range below (yielding the remainder uncovered
>>> by any E820 entry). I think you ought to zap the entry here.
>>
>> I don't think this regions can be used for MMIO, the gpfns in the guest
>> p2m are not set as p2m_mmio_direct.
> 
> Not at the point you build the domain, but such a use can't and
> shouldn't be precluded ones the domain is up.

OK, setting entry.size = 0 should work.

Roger.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08 15:00                             ` Roger Pau Monné
@ 2014-05-08 15:20                               ` Jan Beulich
  0 siblings, 0 replies; 52+ messages in thread
From: Jan Beulich @ 2014-05-08 15:20 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: George.Dunlap, xen-devel, keir.xen, tim

>>> On 08.05.14 at 17:00, <roger.pau@citrix.com> wrote:
> On 08/05/14 12:44, Jan Beulich wrote:
>>>>> On 08.05.14 at 12:27, <roger.pau@citrix.com> wrote:
>>> On 07/05/14 13:34, Jan Beulich wrote:
>>>>>>> On 07.05.14 at 11:48, <roger.pau@citrix.com> wrote:
>>>>> +    /* Clamp e820 memory map to match the memory assigned to Dom0 */
>>>>> +    for ( i = 0, entry = d->arch.e820; i < d->arch.nr_e820; i++, entry++ )
>>>>> +    {
>>>>> +        if ( entry->type != E820_RAM )
>>>>> +            continue;
>>>>> +
>>>>> +        if ( nr_pages == cur_pages )
>>>>> +        {
>>>>> +            /*
>>>>> +             * We already have all the assigned memory,
>>>>> +             * mark this region as reserved.
>>>>> +             */
>>>>> +            entry->type = E820_RESERVED;
>>>>
>>>> That seems undesirable, as it will prevent Linux from using that space
>>>> for MMIO purposes. Plus keeping the region here is consistent with
>>>> simply shrinking the range below (yielding the remainder uncovered
>>>> by any E820 entry). I think you ought to zap the entry here.
>>>
>>> I don't think this regions can be used for MMIO, the gpfns in the guest
>>> p2m are not set as p2m_mmio_direct.
>> 
>> Not at the point you build the domain, but such a use can't and
>> shouldn't be precluded ones the domain is up.
> 
> OK, setting entry.size = 0 should work.

Please just remove unused entries altogether. You never know
whether some OS can't deal with zero-size ones. And keeping the
number of table entries low is generally also desirable.

Jan

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08  6:37                       ` Jan Beulich
@ 2014-05-08 19:15                         ` Mukesh Rathor
  0 siblings, 0 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08 19:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George.Dunlap, xen-devel, keir.xen, tim, roger.pau

On Thu, 08 May 2014 07:37:21 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 08.05.14 at 02:04, <mukesh.rathor@oracle.com> wrote:
> > On Wed, 07 May 2014 08:50:16 +0100
> > "Jan Beulich" <JBeulich@suse.com> wrote:
> > 
> >> >>> On 07.05.14 at 03:00, <mukesh.rathor@oracle.com> wrote:
> >> > As Konrad points out, there are other issues with that part of
> >> > the code. So, IMO, we should wait and address that altogether
> >> > for both PV and PVH, and for now leave this as is.
> >> 
> >> I'm fine either way, as long as you and Roger can reach agreement.
> >> I still tend towards considering Roger's position the right one as
> >> a mid/long term route.
> > 
> > Ok, will let Roger drive that on xen, and I'll look at linux side.
> > But since his patch is incremental on top of mine, I find request
> > reasonable that my next version V12 be checked in if no issues, or
> > at least construct_dom0 patch which is sitting reviewed for over 6
> > months now. 
> 
> I would have long committed it if you had chased up Tim's ack on it.
> I'm not going to do that for you, there's enough other patches that
> need taking care of.

Oh, I didn't know thats what it was waiting for. anyways, looks like
he just acked it. So... new version on it's way (hopefully no regressions
on testing after refresh).

thanks for all your help,
mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 0/4] pvh dom0 patches...
  2014-05-08 13:15                         ` David Vrabel
@ 2014-05-08 22:29                           ` Mukesh Rathor
  0 siblings, 0 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08 22:29 UTC (permalink / raw)
  To: David Vrabel
  Cc: George.Dunlap, tim, keir.xen, JBeulich, xen-devel, Roger Pau Monné

On Thu, 8 May 2014 14:15:36 +0100
David Vrabel <david.vrabel@citrix.com> wrote:

> On 08/05/14 01:12, Mukesh Rathor wrote:
> > On Wed, 7 May 2014 15:38:26 +0200
> > Roger Pau Monné <roger.pau@citrix.com> wrote:
> > 
> >> On 07/05/14 15:20, Konrad Rzeszutek Wilk wrote:
> >>> For example, right now the Linux PV code can run with an E820 that
> >>> looks like swiss cheese - aka like the hosts' one. That is easily
> >>> seen if you boot an PV guest with PCI passthrough devices - as the
> >>> libxl 'e820_host' option gets turned on which creates an E820
> >>> that looks like the hosts. Granted it does not populate the P2M
> >>> as such (it is all linear). But the point is that the Linux
> >>> code is capable of dealing with this and bring the P2M to sync.
> >>>
> >>> Hoisting this up in the hypervisor would be a plus - as the
> >>> Linux code wouldn't have to do this anymore.
> >>
> >> IMHO doing it in the hypervisor is clearly the right solution,
> >> forcing the guest OS to do all this on it's own just promotes code
> >> duplication across the several OSes with PV(H) support.
> > 
> > I am in agreement with you, but my point was code exists for PV
> > already, so if you address pvh only, then that code still has to
> > exist in the guest until PV reaches EOL. 
> > 
> > FWIW, my agreement with previous maintainers was to keep PVH as
> > close to PV as possible.
> 
> PVH support in Linux should be in order of preference:
> 
> 1. Like native.
> 2. Like HVM guests.
> 3. Like PV guests.
> 4. PVH specific.

I've been thinking:

  PVH == PV guest in HVM container with HAP. (Actually, my initial 
         UP cut didn't require hap, but smp got very complicated. Besides
         eliminating pv mmu became seondary goal).

Now, for prefrence (where choice exist):

   For (each path)
       if (HVM performs better)
           pvh == hvm
       else
           pvh == pv

IOW, HVM good, PV better, PVH best!  Altho, now i believe 
  hvm && pv == better

Historical background jfyi, when I talked about pvh to Keir, Ian P,
and can't remember others on the sidelines of xen summit years ago, 
pv ops didn't exist, and my main motivation was performance. I didn't 
think it would take this long, but I am euphoric to see it finally be
done, well, almost :)...

Mukesh


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages
  2014-05-08 12:16           ` Tim Deegan
  2014-05-08 13:25             ` Jan Beulich
@ 2014-05-08 22:58             ` Mukesh Rathor
  1 sibling, 0 replies; 52+ messages in thread
From: Mukesh Rathor @ 2014-05-08 22:58 UTC (permalink / raw)
  To: Tim Deegan
  Cc: JBeulich, George.Dunlap, eddie.dong, keir.xen, jun.nakajima, xen-devel

On Thu, 8 May 2014 14:16:50 +0200
Tim Deegan <tim@xen.org> wrote:

> At 16:35 -0700 on 02 May (1399044956), Mukesh Rathor wrote:
> > > Oh yes -- sorry, I got confused between this and the case where
> > > you do use get_pg_owner().  But actually on this subject ISTR
> > > objecting to exporting get_pg_owner() from mm.c before.  Oh yes,
> > > here it is:
> > > 
> > > http://lists.xenproject.org/archives/html/xen-devel/2014-01/msg02612.html
> > > 
> > > So can you use rcu_lock_live_remote_domain_by_id() here instead?
> > 
> > 
> > In p2m_add_foreign() we already get the rcu lock, still do it here? 
> 
> I don't know what you mean by that.  p2m_add_foreign() currently
> implicitly takes the rcu lock by calling get_pg_owner(); I was
> suggecting that you use rcu_lock_live_remote_domain_by_id() instead.

Ok, I can revert p2m_add_foreign() back to calling 
rcu_lock_live_remote_domain_by_id instead of get_pg_owner. I thought
you were referring to page_get_owner in atomic_write_ept_entry().

thanks
mukesh

^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2014-05-08 22:58 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-30  1:06 [V10 PATCH 0/4] pvh dom0 patches Mukesh Rathor
2014-04-30  1:06 ` [V10 PATCH 1/4] pvh dom0: construct_dom0 changes Mukesh Rathor
2014-05-06 15:18   ` Roger Pau Monné
2014-04-30  1:06 ` [V10 PATCH 2/4] pvh dom0: Add checks and restrictions for p2m_is_foreign Mukesh Rathor
2014-05-01 16:14   ` Tim Deegan
2014-04-30  1:06 ` [V10 PATCH 3/4] pvh dom0: Add and remove foreign pages Mukesh Rathor
2014-05-01 16:19   ` Tim Deegan
2014-05-02  1:45     ` Mukesh Rathor
2014-05-02  8:38       ` Jan Beulich
2014-05-02  8:55       ` Tim Deegan
2014-05-02 23:35         ` Mukesh Rathor
2014-05-05  7:46           ` Jan Beulich
2014-05-08 12:16           ` Tim Deegan
2014-05-08 13:25             ` Jan Beulich
2014-05-08 22:58             ` Mukesh Rathor
2014-04-30  1:06 ` [V10 PATCH 4/4] dom0: add opt_dom0pvh to setup.c Mukesh Rathor
2014-04-30 14:11 ` [V10 PATCH 0/4] pvh dom0 patches Roger Pau Monné
2014-04-30 18:12   ` Mukesh Rathor
2014-05-01  1:19     ` Mukesh Rathor
2014-05-02 11:05       ` Roger Pau Monné
2014-05-02 12:31         ` Jan Beulich
2014-05-02 14:06           ` Roger Pau Monné
2014-05-02 14:16             ` Jan Beulich
2014-05-02 14:35               ` Roger Pau Monné
2014-05-02 15:41                 ` Jan Beulich
2014-05-02 16:13                   ` Roger Pau Monné
2014-05-02 19:35                     ` Konrad Rzeszutek Wilk
2014-05-03  0:01         ` Mukesh Rathor
2014-05-05  8:52           ` Roger Pau Monné
2014-05-06  0:28             ` Mukesh Rathor
2014-05-06  7:13               ` Roger Pau Monné
2014-05-06  8:09                 ` Jan Beulich
2014-05-07  1:00                 ` Mukesh Rathor
2014-05-07  7:50                   ` Jan Beulich
2014-05-07  9:48                     ` Roger Pau Monné
2014-05-07 11:34                       ` Jan Beulich
2014-05-08 10:27                         ` Roger Pau Monné
2014-05-08 10:44                           ` Jan Beulich
2014-05-08 15:00                             ` Roger Pau Monné
2014-05-08 15:20                               ` Jan Beulich
2014-05-07 13:25                     ` Konrad Rzeszutek Wilk
2014-05-08  0:04                     ` Mukesh Rathor
2014-05-08  6:37                       ` Jan Beulich
2014-05-08 19:15                         ` Mukesh Rathor
2014-05-07 13:20                   ` Konrad Rzeszutek Wilk
2014-05-07 13:38                     ` Roger Pau Monné
2014-05-08  0:12                       ` Mukesh Rathor
2014-05-08 10:52                         ` George Dunlap
2014-05-08 13:15                         ` David Vrabel
2014-05-08 22:29                           ` Mukesh Rathor
2014-05-08  0:07                     ` Mukesh Rathor
2014-05-06 19:38               ` Konrad Rzeszutek Wilk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.