All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC 0 PATCH 0/3]: PVH dom0 construction....
@ 2013-09-25 21:03 Mukesh Rathor
  2013-09-25 21:03 ` [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function Mukesh Rathor
                   ` (2 more replies)
  0 siblings, 3 replies; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-25 21:03 UTC (permalink / raw)
  To: Xen-devel; +Cc: keir.xen, JBeulich

Hi Jan,

I redid construct_dom0 to how you had wanted, temporarily running 
PVH also on dom0 page tables, and removing the PVH special copy
function. Kindly take a look. Other surrouding patches to boot PVH
in dom0 mode are mostly unchanged, so just leaving them out.

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function
  2013-09-25 21:03 [RFC 0 PATCH 0/3]: PVH dom0 construction Mukesh Rathor
@ 2013-09-25 21:03 ` Mukesh Rathor
  2013-09-26  7:03   ` Jan Beulich
  2013-09-25 21:03 ` [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions Mukesh Rathor
  2013-09-25 21:03 ` [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes Mukesh Rathor
  2 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-25 21:03 UTC (permalink / raw)
  To: Xen-devel; +Cc: keir.xen, JBeulich

In this preparatory patch XEN_DOMCTL_memory_mapping code is
put into a function so it can be called later for PVH from
construct_dom0. There is no change in it's functionality.
The function is made non-static in the construct_dom0 patch.

Note, iomem_access_permitted is not moved to the function
because current doesn't point to dom0 in construct_dom0.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domctl.c |  122 ++++++++++++++++++++++++++----------------------
 1 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index e75918a..fe7ca00 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -46,6 +46,71 @@ static int gdbsx_guest_mem_io(
     return (iop->remain ? -EFAULT : 0);
 }
 
+static long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+                           unsigned long mfn, unsigned long nr_mfns,
+                           bool_t add)
+{
+    unsigned long i;
+    long ret;
+
+    if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
+         ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) ||
+         (gfn + nr_mfns - 1) < gfn ) /* wrap? */
+        return -EINVAL;
+
+    ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add);
+    if ( ret )
+        return ret;
+
+    if ( add )
+    {
+        printk(XENLOG_G_INFO
+               "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n",
+               d->domain_id, gfn, mfn, nr_mfns);
+
+        ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
+        if ( !ret && paging_mode_translate(d) )
+        {
+            for ( i = 0; !ret && i < nr_mfns; i++ )
+                if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
+                    ret = -EIO;
+            if ( ret )
+            {
+                printk(XENLOG_G_WARNING
+                       "memory_map:fail: dom%d gfn=%lx mfn=%lx\n",
+                       d->domain_id, gfn + i, mfn + i);
+                while ( i-- )
+                    clear_mmio_p2m_entry(d, gfn + i);
+                if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) &&
+                     is_hardware_domain(current->domain) )
+                    printk(XENLOG_ERR
+                           "memory_map: failed to deny dom%d access to [%lx,%lx]\n",
+                           d->domain_id, mfn, mfn + nr_mfns - 1);
+            }
+        }
+    }
+    else
+    {
+        printk(XENLOG_G_INFO
+               "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n",
+               d->domain_id, gfn, mfn, nr_mfns);
+
+        if ( paging_mode_translate(d) )
+            for ( i = 0; i < nr_mfns; i++ )
+                add |= !clear_mmio_p2m_entry(d, gfn + i);
+        ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+        if ( !ret && add )
+            ret = -EIO;
+        if ( ret && is_hardware_domain(current->domain) )
+            printk(XENLOG_ERR
+                   "memory_map: error %ld %s dom%d access to [%lx,%lx]\n",
+                   ret, add ? "removing" : "denying", d->domain_id,
+                   mfn, mfn + nr_mfns - 1);
+    }
+
+    return ret;
+}
+
 long arch_do_domctl(
     struct xen_domctl *domctl, struct domain *d,
     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
@@ -625,67 +690,12 @@ long arch_do_domctl(
         unsigned long mfn = domctl->u.memory_mapping.first_mfn;
         unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
         int add = domctl->u.memory_mapping.add_mapping;
-        unsigned long i;
-
-        ret = -EINVAL;
-        if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
-             ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) ||
-             (gfn + nr_mfns - 1) < gfn ) /* wrap? */
-            break;
 
         ret = -EPERM;
         if ( !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) )
             break;
 
-        ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add);
-        if ( ret )
-            break;
-
-        if ( add )
-        {
-            printk(XENLOG_G_INFO
-                   "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n",
-                   d->domain_id, gfn, mfn, nr_mfns);
-
-            ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
-            if ( !ret && paging_mode_translate(d) )
-            {
-                for ( i = 0; !ret && i < nr_mfns; i++ )
-                    if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
-                        ret = -EIO;
-                if ( ret )
-                {
-                    printk(XENLOG_G_WARNING
-                           "memory_map:fail: dom%d gfn=%lx mfn=%lx\n",
-                           d->domain_id, gfn + i, mfn + i);
-                    while ( i-- )
-                        clear_mmio_p2m_entry(d, gfn + i);
-                    if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) &&
-                         is_hardware_domain(current->domain) )
-                        printk(XENLOG_ERR
-                               "memory_map: failed to deny dom%d access to [%lx,%lx]\n",
-                               d->domain_id, mfn, mfn + nr_mfns - 1);
-                }
-            }
-        }
-        else
-        {
-            printk(XENLOG_G_INFO
-                   "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n",
-                   d->domain_id, gfn, mfn, nr_mfns);
-
-            if ( paging_mode_translate(d) )
-                for ( i = 0; i < nr_mfns; i++ )
-                    add |= !clear_mmio_p2m_entry(d, gfn + i);
-            ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
-            if ( !ret && add )
-                ret = -EIO;
-            if ( ret && is_hardware_domain(current->domain) )
-                printk(XENLOG_ERR
-                       "memory_map: error %ld %s dom%d access to [%lx,%lx]\n",
-                       ret, add ? "removing" : "denying", d->domain_id,
-                       mfn, mfn + nr_mfns - 1);
-        }
+        ret = domctl_memory_mapping(d, gfn, mfn, nr_mfns, add);
     }
     break;
 
-- 
1.7.2.3

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
  2013-09-25 21:03 [RFC 0 PATCH 0/3]: PVH dom0 construction Mukesh Rathor
  2013-09-25 21:03 ` [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function Mukesh Rathor
@ 2013-09-25 21:03 ` Mukesh Rathor
  2013-09-26  7:21   ` Jan Beulich
  2013-09-25 21:03 ` [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes Mukesh Rathor
  2 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-25 21:03 UTC (permalink / raw)
  To: Xen-devel; +Cc: keir.xen, JBeulich

In this preparatory patch also, some pv specific code is
carved out into static functions. No functionality change.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domain_build.c |  358 +++++++++++++++++++++++-------------------
 1 files changed, 196 insertions(+), 162 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 232adf8..5125aa2 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -307,6 +307,197 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+static __init void mark_pv_pt_pages_rdonly(struct domain *d,
+                                           l4_pgentry_t *l4start,
+                                           unsigned long vpt_start,
+                                           unsigned long nr_pt_pages)
+{
+    unsigned long count;
+    struct page_info *page;
+    l4_pgentry_t *l4tab;
+    l3_pgentry_t *l3tab, *l3start;
+    l2_pgentry_t *l2tab, *l2start;
+    l1_pgentry_t *l1tab, *l1start;
+
+    /* Pages that are part of page tables must be read only. */
+    l4tab = l4start + l4_table_offset(vpt_start);
+    l3start = l3tab = l4e_to_l3e(*l4tab);
+    l3tab += l3_table_offset(vpt_start);
+    l2start = l2tab = l3e_to_l2e(*l3tab);
+    l2tab += l2_table_offset(vpt_start);
+    l1start = l1tab = l2e_to_l1e(*l2tab);
+    l1tab += l1_table_offset(vpt_start);
+    for ( count = 0; count < nr_pt_pages; count++ )
+    {
+        l1e_remove_flags(*l1tab, _PAGE_RW);
+        page = mfn_to_page(l1e_get_pfn(*l1tab));
+
+        /* Read-only mapping + PGC_allocated + page-table page. */
+        page->count_info         = PGC_allocated | 3;
+        page->u.inuse.type_info |= PGT_validated | 1;
+
+        /* Top-level p.t. is pinned. */
+        if ( (page->u.inuse.type_info & PGT_type_mask) ==
+             (!is_pv_32on64_domain(d) ?
+              PGT_l4_page_table : PGT_l3_page_table) )
+        {
+            page->count_info        += 1;
+            page->u.inuse.type_info += 1 | PGT_pinned;
+        }
+
+        /* Iterate. */
+        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
+        {
+            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
+            {
+                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
+                    l3start = l3tab = l4e_to_l3e(*++l4tab);
+                l2start = l2tab = l3e_to_l2e(*l3tab);
+            }
+            l1start = l1tab = l2e_to_l1e(*l2tab);
+        }
+    }
+}
+
+static __init void setup_pv_p2m_table(
+    struct domain *d, struct vcpu *v, struct elf_dom_parms *parms,
+    unsigned long v_start, unsigned long vphysmap_start,
+    unsigned long vphysmap_end, unsigned long v_end, unsigned long nr_pages)
+{
+    struct page_info *page = NULL;
+    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+    l3_pgentry_t *l3tab = NULL;
+    l2_pgentry_t *l2tab = NULL;
+    l1_pgentry_t *l1tab = NULL;
+
+    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+    l3tab = NULL;
+    l2tab = NULL;
+    l1tab = NULL;
+
+    /* Set up the phys->machine table if not part of the initial mapping. */
+    if ( parms->p2m_base != UNSET_ADDR )
+    {
+        unsigned long va = vphysmap_start;
+
+        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
+            panic("DOM0 P->M table overlaps initial mapping");
+
+        while ( va < vphysmap_end )
+        {
+            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
+                                 >> PAGE_SHIFT) + 3 > nr_pages )
+                panic("Dom0 allocation too small for initial P->M table.\n");
+
+            if ( l1tab )
+            {
+                unmap_domain_page(l1tab);
+                l1tab = NULL;
+            }
+            if ( l2tab )
+            {
+                unmap_domain_page(l2tab);
+                l2tab = NULL;
+            }
+            if ( l3tab )
+            {
+                unmap_domain_page(l3tab);
+                l3tab = NULL;
+            }
+            l4tab = l4start + l4_table_offset(va);
+            if ( !l4e_get_intpte(*l4tab) )
+            {
+                page = alloc_domheap_page(d, 0);
+                if ( !page )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l3_page_table | PGT_validated | 1;
+                l3tab = __map_domain_page(page);
+                clear_page(l3tab);
+                *l4tab = l4e_from_page(page, L4_PROT);
+            } else
+                l3tab = map_domain_page(l4e_get_pfn(*l4tab));
+            l3tab += l3_table_offset(va);
+            if ( !l3e_get_intpte(*l3tab) )
+            {
+                if ( cpu_has_page1gb &&
+                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L3_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l3tab = l3e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    va += 1UL << L3_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l2_page_table | PGT_validated | 1;
+                l2tab = __map_domain_page(page);
+                clear_page(l2tab);
+                *l3tab = l3e_from_page(page, L3_PROT);
+            }
+            else
+               l2tab = map_domain_page(l3e_get_pfn(*l3tab));
+            l2tab += l2_table_offset(va);
+            if ( !l2e_get_intpte(*l2tab) )
+            {
+                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
+                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
+                     (page = alloc_domheap_pages(d,
+                                                 L2_PAGETABLE_SHIFT -
+                                                     PAGE_SHIFT,
+                                                 0)) != NULL )
+                {
+                    *l2tab = l2e_from_page(page,
+                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
+                    if ( opt_allow_superpage )
+                        get_superpage(page_to_mfn(page), d);
+                    va += 1UL << L2_PAGETABLE_SHIFT;
+                    continue;
+                }
+                if ( (page = alloc_domheap_page(d, 0)) == NULL )
+                    break;
+                /* No mapping, PGC_allocated + page-table page. */
+                page->count_info = PGC_allocated | 2;
+                page->u.inuse.type_info =
+                    PGT_l1_page_table | PGT_validated | 1;
+                l1tab = __map_domain_page(page);
+                clear_page(l1tab);
+                *l2tab = l2e_from_page(page, L2_PROT);
+            }
+            else
+                l1tab = map_domain_page(l2e_get_pfn(*l2tab));
+            l1tab += l1_table_offset(va);
+            BUG_ON(l1e_get_intpte(*l1tab));
+            page = alloc_domheap_page(d, 0);
+            if ( !page )
+                break;
+            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
+            va += PAGE_SIZE;
+            va &= PAGE_MASK;
+        }
+        if ( !page )
+            panic("Not enough RAM for DOM0 P->M table.\n");
+    }
+
+    if ( l1tab )
+        unmap_domain_page(l1tab);
+    if ( l2tab )
+        unmap_domain_page(l2tab);
+    if ( l3tab )
+        unmap_domain_page(l3tab);
+    unmap_domain_page(l4start);
+}
+
 int __init construct_dom0(
     struct domain *d,
     const module_t *image, unsigned long image_headroom,
@@ -705,44 +896,8 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
-    /* Pages that are part of page tables must be read only. */
-    l4tab = l4start + l4_table_offset(vpt_start);
-    l3start = l3tab = l4e_to_l3e(*l4tab);
-    l3tab += l3_table_offset(vpt_start);
-    l2start = l2tab = l3e_to_l2e(*l3tab);
-    l2tab += l2_table_offset(vpt_start);
-    l1start = l1tab = l2e_to_l1e(*l2tab);
-    l1tab += l1_table_offset(vpt_start);
-    for ( count = 0; count < nr_pt_pages; count++ ) 
-    {
-        l1e_remove_flags(*l1tab, _PAGE_RW);
-        page = mfn_to_page(l1e_get_pfn(*l1tab));
-
-        /* Read-only mapping + PGC_allocated + page-table page. */
-        page->count_info         = PGC_allocated | 3;
-        page->u.inuse.type_info |= PGT_validated | 1;
-
-        /* Top-level p.t. is pinned. */
-        if ( (page->u.inuse.type_info & PGT_type_mask) ==
-             (!is_pv_32on64_domain(d) ?
-              PGT_l4_page_table : PGT_l3_page_table) )
-        {
-            page->count_info        += 1;
-            page->u.inuse.type_info += 1 | PGT_pinned;
-        }
-
-        /* Iterate. */
-        if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
-        {
-            if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) )
-            {
-                if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) )
-                    l3start = l3tab = l4e_to_l3e(*++l4tab);
-                l2start = l2tab = l3e_to_l2e(*l3tab);
-            }
-            l1start = l1tab = l2e_to_l1e(*l2tab);
-        }
-    }
+    if  ( is_pv_domain(d) )
+        mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages);
 
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
@@ -814,131 +969,10 @@ int __init construct_dom0(
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
     count = d->tot_pages;
-    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
-    l3tab = NULL;
-    l2tab = NULL;
-    l1tab = NULL;
-    /* Set up the phys->machine table if not part of the initial mapping. */
-    if ( parms.p2m_base != UNSET_ADDR )
-    {
-        unsigned long va = vphysmap_start;
 
-        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
-            panic("DOM0 P->M table overlaps initial mapping");
-
-        while ( va < vphysmap_end )
-        {
-            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
-                                 >> PAGE_SHIFT) + 3 > nr_pages )
-                panic("Dom0 allocation too small for initial P->M table.\n");
-
-            if ( l1tab )
-            {
-                unmap_domain_page(l1tab);
-                l1tab = NULL;
-            }
-            if ( l2tab )
-            {
-                unmap_domain_page(l2tab);
-                l2tab = NULL;
-            }
-            if ( l3tab )
-            {
-                unmap_domain_page(l3tab);
-                l3tab = NULL;
-            }
-            l4tab = l4start + l4_table_offset(va);
-            if ( !l4e_get_intpte(*l4tab) )
-            {
-                page = alloc_domheap_page(d, 0);
-                if ( !page )
-                    break;
-                /* No mapping, PGC_allocated + page-table page. */
-                page->count_info = PGC_allocated | 2;
-                page->u.inuse.type_info =
-                    PGT_l3_page_table | PGT_validated | 1;
-                l3tab = __map_domain_page(page);
-                clear_page(l3tab);
-                *l4tab = l4e_from_page(page, L4_PROT);
-            } else
-                l3tab = map_domain_page(l4e_get_pfn(*l4tab));
-            l3tab += l3_table_offset(va);
-            if ( !l3e_get_intpte(*l3tab) )
-            {
-                if ( cpu_has_page1gb &&
-                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
-                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
-                     (page = alloc_domheap_pages(d,
-                                                 L3_PAGETABLE_SHIFT -
-                                                     PAGE_SHIFT,
-                                                 0)) != NULL )
-                {
-                    *l3tab = l3e_from_page(page,
-                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
-                    va += 1UL << L3_PAGETABLE_SHIFT;
-                    continue;
-                }
-                if ( (page = alloc_domheap_page(d, 0)) == NULL )
-                    break;
-                /* No mapping, PGC_allocated + page-table page. */
-                page->count_info = PGC_allocated | 2;
-                page->u.inuse.type_info =
-                    PGT_l2_page_table | PGT_validated | 1;
-                l2tab = __map_domain_page(page);
-                clear_page(l2tab);
-                *l3tab = l3e_from_page(page, L3_PROT);
-            }
-            else
-               l2tab = map_domain_page(l3e_get_pfn(*l3tab));
-            l2tab += l2_table_offset(va);
-            if ( !l2e_get_intpte(*l2tab) )
-            {
-                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
-                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
-                     (page = alloc_domheap_pages(d,
-                                                 L2_PAGETABLE_SHIFT -
-                                                     PAGE_SHIFT,
-                                                 0)) != NULL )
-                {
-                    *l2tab = l2e_from_page(page,
-                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
-                    if ( opt_allow_superpage )
-                        get_superpage(page_to_mfn(page), d);
-                    va += 1UL << L2_PAGETABLE_SHIFT;
-                    continue;
-                }
-                if ( (page = alloc_domheap_page(d, 0)) == NULL )
-                    break;
-                /* No mapping, PGC_allocated + page-table page. */
-                page->count_info = PGC_allocated | 2;
-                page->u.inuse.type_info =
-                    PGT_l1_page_table | PGT_validated | 1;
-                l1tab = __map_domain_page(page);
-                clear_page(l1tab);
-                *l2tab = l2e_from_page(page, L2_PROT);
-            }
-            else
-                l1tab = map_domain_page(l2e_get_pfn(*l2tab));
-            l1tab += l1_table_offset(va);
-            BUG_ON(l1e_get_intpte(*l1tab));
-            page = alloc_domheap_page(d, 0);
-            if ( !page )
-                break;
-            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
-            va += PAGE_SIZE;
-            va &= PAGE_MASK;
-        }
-        if ( !page )
-            panic("Not enough RAM for DOM0 P->M table.\n");
-    }
-
-    if ( l1tab )
-        unmap_domain_page(l1tab);
-    if ( l2tab )
-        unmap_domain_page(l2tab);
-    if ( l3tab )
-        unmap_domain_page(l3tab);
-    unmap_domain_page(l4start);
+    if ( is_pv_domain(d) )
+        setup_pv_p2m_table(d, v, &parms, v_start, vphysmap_start,
+                           vphysmap_end, v_end, nr_pages);
 
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
-- 
1.7.2.3

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-25 21:03 [RFC 0 PATCH 0/3]: PVH dom0 construction Mukesh Rathor
  2013-09-25 21:03 ` [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function Mukesh Rathor
  2013-09-25 21:03 ` [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions Mukesh Rathor
@ 2013-09-25 21:03 ` Mukesh Rathor
  2013-09-26  8:02   ` Jan Beulich
  2 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-25 21:03 UTC (permalink / raw)
  To: Xen-devel; +Cc: keir.xen, JBeulich

This patch changes construct_dom0 to boot in PVH mode. Changes
need to support it are also included here.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domain_build.c |  231 +++++++++++++++++++++++++++++++++++++++----
 xen/arch/x86/domctl.c       |    2 +-
 xen/arch/x86/mm/hap/hap.c   |   14 +++
 xen/include/asm-x86/hap.h   |    1 +
 xen/include/xen/domain.h    |    3 +
 5 files changed, 231 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 5125aa2..3fd2b6c 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,7 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
 
 #include <public/version.h>
 
@@ -307,6 +308,136 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+/*
+ * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
+ * the entire io region mapped in the EPT/NPT.
+ *
+ * PVH FIXME: The following doesn't map MMIO ranges when they sit above the
+ *            highest E820 covered address.
+ */
+static __init void pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start_pfn, end_pfn, end = 0, start = 0;
+    const struct e820entry *entry;
+    unsigned int i, nump;
+    int rc;
+
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        end = entry->addr + entry->size;
+
+        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
+             i == e820.nr_map - 1 )
+        {
+            start_pfn = PFN_DOWN(start);
+            end_pfn = PFN_UP(end);
+
+            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
+                end_pfn = PFN_UP(entry->addr);
+
+            if ( start_pfn < end_pfn )
+            {
+                nump = end_pfn - start_pfn;
+                /* Add pages to the mapping */
+                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+                BUG_ON(rc);
+            }
+            start = end;
+        }
+    }
+
+    /* If the e820 ended under 4GB, we must map the remaining space upto 4GB */
+    if ( end < GB(4) )
+    {
+        start_pfn = PFN_UP(end);
+        end_pfn = (GB(4)) >> PAGE_SHIFT;
+        nump = end_pfn - start_pfn;
+        rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+        BUG_ON(rc);
+    }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                   unsigned long mfn, unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) )
+    {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
+static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
+                                                 unsigned long v_start)
+{
+    int i, j, k;
+    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+    l3_pgentry_t *l3tab = NULL;
+    l2_pgentry_t *l2tab = NULL;
+    l1_pgentry_t *l1tab = NULL;
+    l4_pgentry_t sav_guest_l4;
+    unsigned long cr3_pfn;
+
+    ASSERT(paging_mode_enabled(v->domain));
+
+    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+    l4tab = l4start + l4_table_offset(v_start);
+    sav_guest_l4 = *l4tab;
+
+    /* Give guest a clean slate to start with */
+    clear_page(l4start);
+    *l4tab = sav_guest_l4;
+    BUG_ON(!l4e_get_pfn(sav_guest_l4));
+
+    l3tab = map_l3t_from_l4e(*l4tab);
+    for (i=0; i < PAGE_SIZE / sizeof(l3_pgentry_t); i++, l3tab++)
+    {
+        if ( l3e_get_pfn(*l3tab) == 0 )
+            continue;
+
+        l2tab = map_l2t_from_l3e(*l3tab);
+        for (j=0; j < PAGE_SIZE / sizeof(l2_pgentry_t); j++, l2tab++)
+        {
+            if ( l2e_get_pfn(*l2tab) == 0 )
+                continue;
+
+            l1tab = map_l1t_from_l2e(*l2tab);
+
+            for (k=0; k < PAGE_SIZE / sizeof(l2_pgentry_t); k++, l1tab++)
+            {
+                if ( l1e_get_pfn(*l1tab) == 0 )
+                    continue;
+
+                *l1tab = l1e_from_pfn(get_gpfn_from_mfn(l1e_get_pfn(*l1tab)),
+                                      l1e_get_flags(*l1tab));
+            }
+            *l2tab = l2e_from_pfn(get_gpfn_from_mfn(l2e_get_pfn(*l2tab)),
+                                  l2e_get_flags(*l2tab));
+        }
+        *l3tab = l3e_from_pfn(get_gpfn_from_mfn(l3e_get_pfn(*l3tab)),
+                              l3e_get_flags(*l3tab));
+    }
+    *l4tab = l4e_from_pfn(get_gpfn_from_mfn(l4e_get_pfn(*l4tab)),
+                          l4e_get_flags(*l4tab));
+
+    cr3_pfn = get_gpfn_from_mfn(paddr_to_pfn(v->arch.cr3));
+    v->arch.hvm_vcpu.guest_cr[3] = pfn_to_paddr(cr3_pfn);
+
+    /*
+     * now we update the paging modes (hap_update_paging_modes). This will
+     * create monitor_table for us, update v->arch.cr3, and vmcs.cr3.
+     */
+    paging_update_paging_modes(v);
+
+}
+
 static __init void mark_pv_pt_pages_rdonly(struct domain *d,
                                            l4_pgentry_t *l4start,
                                            unsigned long vpt_start,
@@ -513,7 +644,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_paddr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -526,6 +657,7 @@ int __init construct_dom0(
     l3_pgentry_t *l3tab = NULL, *l3start = NULL;
     l2_pgentry_t *l2tab = NULL, *l2start = NULL;
     l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+    u32 save_pvh_pg_mode = 0;
 
     /*
      * This fully describes the memory layout of the initial domain. All 
@@ -603,12 +735,20 @@ int __init construct_dom0(
         goto out;
     }
 
-    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
-         !test_bit(XENFEAT_dom0, parms.f_supported) )
+    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
     {
-        printk("Kernel does not support Dom0 operation\n");
-        rc = -EINVAL;
-        goto out;
+        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
+        {
+            printk("Kernel does not support Dom0 operation\n");
+            rc = -EINVAL;
+            goto out;
+        }
+        if ( is_pvh_domain(d) &&
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
+        {
+            printk("Kernel does not support PVH mode\n");
+            return -EINVAL;
+        }
     }
 
     if ( compat32 )
@@ -673,6 +813,14 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) )
+    {
+        /* note, following is paddr and not maddr */
+        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -868,6 +1016,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -912,6 +1063,16 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    /*
+     * pvh: we temporarily disable paging mode so that we can build cr3 needed
+     * to run on dom0's page tables.
+     */
+    if ( is_pvh_domain(d) )
+    {
+        save_pvh_pg_mode = d->arch.paging.mode;
+        d->arch.paging.mode = 0;
+    }
+
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -974,6 +1135,17 @@ int __init construct_dom0(
         setup_pv_p2m_table(d, v, &parms, v_start, vphysmap_start,
                            vphysmap_end, v_end, nr_pages);
 
+    if ( is_pvh_domain(d) )
+    {
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
+        /*
+         * We enable paging mode again so guest_physmap_add_page will do the
+         * right thing for us.
+         */
+        d->arch.paging.mode = save_pvh_pg_mode;
+    }
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -990,11 +1162,7 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -1010,8 +1178,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -1031,11 +1199,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1059,6 +1223,15 @@ int __init construct_dom0(
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
 
+    /*
+     * PVH: We need to update si->shared_info while we are on dom0 page tables,
+     * but need to defer the p2m update until after we have fixed up the
+     * page tables for PVH so that the m2p for the si pte entry returns
+     * correct pfn.
+     */
+    if ( is_pvh_domain(d) )
+        si->shared_info = shared_info_paddr;
+
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
 
@@ -1089,11 +1262,18 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | 0x2;
 
     if ( opt_dom0_shadow )
+    {
+        if ( is_pvh_domain(d) )
+        {
+            printk("Invalid option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
+    }
 
     if ( supervisor_mode_kernel )
     {
@@ -1183,6 +1363,19 @@ int __init construct_dom0(
         printk(" Xen warning: dom0 kernel broken ELF: %s\n",
                elf_check_broken(&elf));
 
+    if ( is_pvh_domain(d) )
+    {
+        /* finally, fixup the page table, replacing mfns with pfns */
+        pvh_fixup_page_tables_for_hap(v, v_start);
+
+        /* the pt has correct pfn for si, now update the mfn in the p2m */
+        mfn = virt_to_mfn(d->shared_info);
+        pfn = shared_info_paddr >> PAGE_SHIFT;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        pvh_map_all_iomem(d);
+    }
+
     iommu_dom0_init(dom0);
     return 0;
 
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index fe7ca00..998827c 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -46,7 +46,7 @@ static int gdbsx_guest_mem_io(
     return (iop->remain ? -EFAULT : 0);
 }
 
-static long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+long domctl_memory_mapping(struct domain *d, unsigned long gfn,
                            unsigned long mfn, unsigned long nr_mfns,
                            bool_t add)
 {
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index bff05d9..2f5a48b 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -580,6 +580,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+   /* Copied from: libxl_get_required_shadow_memory() */
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index a057069..6436bab 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -89,4 +89,7 @@ extern unsigned int xen_processor_pmbits;
 
 extern bool_t opt_dom0_vcpus_pin;
 
+extern long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+                    unsigned long mfn, unsigned long nr_mfns, bool_t add_map);
+
 #endif /* __XEN_DOMAIN_H__ */
-- 
1.7.2.3

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function
  2013-09-25 21:03 ` [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function Mukesh Rathor
@ 2013-09-26  7:03   ` Jan Beulich
  0 siblings, 0 replies; 40+ messages in thread
From: Jan Beulich @ 2013-09-26  7:03 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> +static long domctl_memory_mapping(struct domain *d, unsigned long gfn,
> +                           unsigned long mfn, unsigned long nr_mfns,
> +                           bool_t add)
> +{
> +    unsigned long i;
> +    long ret;
> +
> +    if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
> +         ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) ||
> +         (gfn + nr_mfns - 1) < gfn ) /* wrap? */
> +        return -EINVAL;
> +
> +    ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add);
> +    if ( ret )
> +        return ret;

Sigh - I said this more than once before: The order of the checks
in the original code _is_ relevant. You can't move the first and
third checks here, but leave the second one there. The way you
use it in the later patch, the function is mis-named, and the
checks aren't needed for that other case anyway (maybe the
XSM one ought to be done, but that's the last check, so can be
moved here if necessary). Hence just leave them in the caller
(and thus retain their proper order).

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
  2013-09-25 21:03 ` [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions Mukesh Rathor
@ 2013-09-26  7:21   ` Jan Beulich
  2013-09-26 23:32     ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-09-26  7:21 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> +static __init void setup_pv_p2m_table(

Let's call this setup_pv_physmap() or some such; avoid including p2m
in the name in particular.

> +    struct domain *d, struct vcpu *v, struct elf_dom_parms *parms,
> +    unsigned long v_start, unsigned long vphysmap_start,
> +    unsigned long vphysmap_end, unsigned long v_end, unsigned long nr_pages)

Parameters could be grouped a little more logically.

> +{
> +    struct page_info *page = NULL;
> +    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
> +    l3_pgentry_t *l3tab = NULL;
> +    l2_pgentry_t *l2tab = NULL;
> +    l1_pgentry_t *l1tab = NULL;
> +
> +    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));

This appears to be the only use of "v", so it'd be more logical to
pass pagetable_get_pfn(v->arch.guest_table) or
v->arch.guest_table.

Also, with l4start being initialized here, there's no point in setting
it to NULL in its declaration.

> +    l3tab = NULL;
> +    l2tab = NULL;
> +    l1tab = NULL;

Their declarations above have initializers, so what's the point of
these?

> +
> +    /* Set up the phys->machine table if not part of the initial mapping. */
> +    if ( parms->p2m_base != UNSET_ADDR )

This check could now be easily left at the call site, thus reducing
indentation of the entire body below. Which would perhaps allow
not passing parms to this function at all.

> +    {
> +        unsigned long va = vphysmap_start;

Now that you would no longer clobber vphysmap_start (in the caller),
there's no need for a separate variable here. 

> +
> +        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
> +            panic("DOM0 P->M table overlaps initial mapping");
> +
> +        while ( va < vphysmap_end )
> +        {
> +            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
> +                                 >> PAGE_SHIFT) + 3 > nr_pages )
> +                panic("Dom0 allocation too small for initial P->M table.\n");
> +
> +            if ( l1tab )
> +            {
> +                unmap_domain_page(l1tab);
> +                l1tab = NULL;
> +            }
> +            if ( l2tab )
> +            {
> +                unmap_domain_page(l2tab);
> +                l2tab = NULL;
> +            }
> +            if ( l3tab )
> +            {
> +                unmap_domain_page(l3tab);
> +                l3tab = NULL;
> +            }
> +            l4tab = l4start + l4_table_offset(va);

The way this l4tab gets used here (and l[123]tab below) makes
these names bogus - they should be pl[1234]e instead. The
original uses were just attributed to the desire of re-using
existing variables, which is mute with this stuff getting moved
into its own function.

This also applies to the similarly named variables in
mark_pv_pt_pages_rdonly().

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-25 21:03 ` [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes Mukesh Rathor
@ 2013-09-26  8:02   ` Jan Beulich
  2013-09-27  0:17     ` Mukesh Rathor
  2013-09-27  1:55     ` Mukesh Rathor
  0 siblings, 2 replies; 40+ messages in thread
From: Jan Beulich @ 2013-09-26  8:02 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> @@ -307,6 +308,136 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/*
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + *
> + * PVH FIXME: The following doesn't map MMIO ranges when they sit above the
> + *            highest E820 covered address.

This absolutely needs fixing before this can go in.

> + */
> +static __init void pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start_pfn, end_pfn, end = 0, start = 0;
> +    const struct e820entry *entry;
> +    unsigned int i, nump;
> +    int rc;
> +
> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
> +    {
> +        end = entry->addr + entry->size;
> +
> +        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||

The inclusion of E820_UNUSABLE here clearly needs an explanation
(ideally in the shape of a code comment).

> +             i == e820.nr_map - 1 )
> +        {
> +            start_pfn = PFN_DOWN(start);
> +            end_pfn = PFN_UP(end);
> +
> +            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
> +                end_pfn = PFN_UP(entry->addr);

Better coded as if/else construct (making more obvious what the
intention here is - it took me quite some time to see that what
you're doing here is correct, because the general case really is
what sits in the if() body, and the exception is what gets done
before the if(), which is counter intuitive).

> +
> +            if ( start_pfn < end_pfn )
> +            {
> +                nump = end_pfn - start_pfn;
> +                /* Add pages to the mapping */
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
> +                BUG_ON(rc);
> +            }
> +            start = end;
> +        }
> +    }
> +
> +    /* If the e820 ended under 4GB, we must map the remaining space upto 4GB */

This is arbitrary (related to the FIXME above).

> +static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
> +                                                 unsigned long v_start)
> +{
> +    int i, j, k;
> +    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
> +    l3_pgentry_t *l3tab = NULL;
> +    l2_pgentry_t *l2tab = NULL;
> +    l1_pgentry_t *l1tab = NULL;
> +    l4_pgentry_t sav_guest_l4;
> +    unsigned long cr3_pfn;
> +
> +    ASSERT(paging_mode_enabled(v->domain));
> +
> +    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
> +    l4tab = l4start + l4_table_offset(v_start);

Many of the comments on patch 2 apply here too.

> +    sav_guest_l4 = *l4tab;
> +
> +    /* Give guest a clean slate to start with */
> +    clear_page(l4start);
> +    *l4tab = sav_guest_l4;
> +    BUG_ON(!l4e_get_pfn(sav_guest_l4));

This assumes that the initial mapping is covered by a single L4
entry. While true for Linux, this is not generic, and hence needs
fixing (the problem continues further down).

> +
> +    l3tab = map_l3t_from_l4e(*l4tab);
> +    for (i=0; i < PAGE_SIZE / sizeof(l3_pgentry_t); i++, l3tab++)

Coding style (not going to make further notes on violations).

> +    {
> +        if ( l3e_get_pfn(*l3tab) == 0 )

Is that really a good check? Shouldn't that rather look at
_PAGE_PRESENT?

> +            continue;
> +
> +        l2tab = map_l2t_from_l3e(*l3tab);
> +        for (j=0; j < PAGE_SIZE / sizeof(l2_pgentry_t); j++, l2tab++)
> +        {
> +            if ( l2e_get_pfn(*l2tab) == 0 )
> +                continue;
> +
> +            l1tab = map_l1t_from_l2e(*l2tab);
> +
> +            for (k=0; k < PAGE_SIZE / sizeof(l2_pgentry_t); k++, l1tab++)

l2_pgentry_t? You be better off using sizeof(*l1tab) here anyway
(and similarly in the other loops).

> @@ -513,7 +644,7 @@ int __init construct_dom0(
>      unsigned long alloc_spfn;
>      unsigned long alloc_epfn;
>      unsigned long initrd_pfn = -1, initrd_mfn = 0;
> -    unsigned long count;
> +    unsigned long count, shared_info_paddr = 0;

Even if benign for x86-64, please used paddr_t for physical
addresses.

> @@ -603,12 +735,20 @@ int __init construct_dom0(
>          goto out;
>      }
>  
> -    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
> -         !test_bit(XENFEAT_dom0, parms.f_supported) )
> +    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
>      {
> -        printk("Kernel does not support Dom0 operation\n");
> -        rc = -EINVAL;
> -        goto out;
> +        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
> +        {
> +            printk("Kernel does not support Dom0 operation\n");
> +            rc = -EINVAL;
> +            goto out;
> +        }
> +        if ( is_pvh_domain(d) &&
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;

Inconsistent: Above you see "goto out", and here you use "return".

> @@ -673,6 +813,14 @@ int __init construct_dom0(
>      vstartinfo_end   = (vstartinfo_start +
>                          sizeof(struct start_info) +
>                          sizeof(struct dom0_vga_console_info));
> +
> +    if ( is_pvh_domain(d) )
> +    {
> +        /* note, following is paddr and not maddr */
> +        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;

Hardly worth the comment considering the variable name.

> @@ -868,6 +1016,9 @@ int __init construct_dom0(
>                                      L1_PROT : COMPAT_L1_PROT));
>          l1tab++;
>  
> +        if ( is_pvh_domain(d) )
> +            continue;
> +
>          page = mfn_to_page(mfn);
>          if ( (page->u.inuse.type_info == 0) &&
>               !get_page_and_type(page, d, PGT_writable_page) )

So why is the remaining part of this loop not applicable to PVH?

> @@ -912,6 +1063,16 @@ int __init construct_dom0(
>          (void)alloc_vcpu(d, i, cpu);
>      }
>  
> +    /*
> +     * pvh: we temporarily disable paging mode so that we can build cr3 needed
> +     * to run on dom0's page tables.
> +     */
> +    if ( is_pvh_domain(d) )
> +    {
> +        save_pvh_pg_mode = d->arch.paging.mode;
> +        d->arch.paging.mode = 0;
> +    }

Does this really need to be in a conditional?

> @@ -1089,11 +1262,18 @@ int __init construct_dom0(
>      regs->eip = parms.virt_entry;
>      regs->esp = vstack_end;
>      regs->esi = vstartinfo_start;
> -    regs->eflags = X86_EFLAGS_IF;
> +    regs->eflags = X86_EFLAGS_IF | 0x2;

Unrelated change?

> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)

__init

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions
  2013-09-26  7:21   ` Jan Beulich
@ 2013-09-26 23:32     ` Mukesh Rathor
  0 siblings, 0 replies; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-26 23:32 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Thu, 26 Sep 2013 08:21:37 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > +static __init void setup_pv_p2m_table(
> 
...
> > +            if ( l3tab )
> > +            {
> > +                unmap_domain_page(l3tab);
> > +                l3tab = NULL;
> > +            }
> > +            l4tab = l4start + l4_table_offset(va);
> 
> The way this l4tab gets used here (and l[123]tab below) makes
> these names bogus - they should be pl[1234]e instead. The
> original uses were just attributed to the desire of re-using
> existing variables, which is mute with this stuff getting moved
> into its own function.
> 
> This also applies to the similarly named variables in
> mark_pv_pt_pages_rdonly().

Ok, all done. Also, noticed the exhisting code for 
mark_pv_pt_pages_rdonly unnecessarily sets l[321]start
in two places.

thanks
mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-26  8:02   ` Jan Beulich
@ 2013-09-27  0:17     ` Mukesh Rathor
  2013-09-27  6:54       ` Jan Beulich
  2013-09-27  1:55     ` Mukesh Rathor
  1 sibling, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-27  0:17 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Thu, 26 Sep 2013 09:02:41 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > @@ -307,6 +308,136 @@ static void __init
> > process_dom0_ioports_disable(void) }
> >  }
> >  
> > +/*
> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0
> > will have
> > + * the entire io region mapped in the EPT/NPT.
> > + *
> > + * PVH FIXME: The following doesn't map MMIO ranges when they sit
> > above the
> > + *            highest E820 covered address.
> 
> This absolutely needs fixing before this can go in.

Any suggestions on how to fix it? Mapping all the way to end could
result in a huge hap table. 


> > +    const struct e820entry *entry;
> > +    unsigned int i, nump;
> > +    int rc;
> > +
> > +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
> > +    {
> > +        end = entry->addr + entry->size;
> > +
> > +        if ( entry->type == E820_RAM || entry->type ==
> > E820_UNUSABLE ||
> 
> The inclusion of E820_UNUSABLE here clearly needs an explanation
> (ideally in the shape of a code comment).
> 
> > +             i == e820.nr_map - 1 )
> > +        {
> > +            start_pfn = PFN_DOWN(start);
> > +            end_pfn = PFN_UP(end);
> > +
> > +            if ( entry->type == E820_RAM || entry->type ==
> > E820_UNUSABLE )
> > +                end_pfn = PFN_UP(entry->addr);
> 
> Better coded as if/else construct (making more obvious what the
> intention here is - it took me quite some time to see that what
> you're doing here is correct, because the general case really is
> what sits in the if() body, and the exception is what gets done
> before the if(), which is counter intuitive).

Based on xen_set_identity_and_release linux function. Took me a while
to figure that one too. So, I tried simplifying it, but in the end
came back to this. Not sure where I would the else.

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-26  8:02   ` Jan Beulich
  2013-09-27  0:17     ` Mukesh Rathor
@ 2013-09-27  1:55     ` Mukesh Rathor
  2013-09-27  7:01       ` Jan Beulich
  1 sibling, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-27  1:55 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Thu, 26 Sep 2013 09:02:41 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > @@ -307,6 +308,136 @@ static void __init
.....
> 

> 
> > +    sav_guest_l4 = *l4tab;
> > +
> > +    /* Give guest a clean slate to start with */
> > +    clear_page(l4start);
> > +    *l4tab = sav_guest_l4;
> > +    BUG_ON(!l4e_get_pfn(sav_guest_l4));
> 
> This assumes that the initial mapping is covered by a single L4
> entry. While true for Linux, this is not generic, and hence needs
> fixing (the problem continues further down).

True. How about I just create the opposite of init_guest_l4_table,
ie, clear_guest_l4_table and just clear out xen entries?

> 
> > +    {
> > +        if ( l3e_get_pfn(*l3tab) == 0 )
> 
> Is that really a good check? Shouldn't that rather look at
> _PAGE_PRESENT?

Ok, I will change that.

> Hardly worth the comment considering the variable name.
> 
> > @@ -868,6 +1016,9 @@ int __init construct_dom0(
> >                                      L1_PROT : COMPAT_L1_PROT));
> >          l1tab++;
> >  
> > +        if ( is_pvh_domain(d) )
> > +            continue;
> > +
> >          page = mfn_to_page(mfn);
> >          if ( (page->u.inuse.type_info == 0) &&
> >               !get_page_and_type(page, d, PGT_writable_page) )
> 
> So why is the remaining part of this loop not applicable to PVH?

My bad, looks like it should be. I'll remove it. BTW, looking at it again
I realized we don't really need to set type_info to PGT* for PVH, but it's 
harmless I guess. Should I just leave it or condition them for PV only?

> >          (void)alloc_vcpu(d, i, cpu);
> >      }
> >  
> > +    /*
> > +     * pvh: we temporarily disable paging mode so that we can
> > build cr3 needed
> > +     * to run on dom0's page tables.
> > +     */
> > +    if ( is_pvh_domain(d) )
> > +    {
> > +        save_pvh_pg_mode = d->arch.paging.mode;
> > +        d->arch.paging.mode = 0;
> > +    }
> 
> Does this really need to be in a conditional?

Not really, wasn't sure what you'd prefer. I'll remove conditions.

> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
> >      regs->eip = parms.virt_entry;
> >      regs->esp = vstack_end;
> >      regs->esi = vstartinfo_start;
> > -    regs->eflags = X86_EFLAGS_IF;
> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
> 
> Unrelated change?

Nop, we need to make sure the resvd bit is set in eflags otherwise
it won't vmenter (invalid guest state). Should be harmless for PV,
right? Not sure where it does it for PV before actually scheduling it..

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-27  0:17     ` Mukesh Rathor
@ 2013-09-27  6:54       ` Jan Beulich
  2013-10-03  0:53         ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-09-27  6:54 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:
>> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > +/*
>> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
>> > + * the entire io region mapped in the EPT/NPT.
>> > + *
>> > + * PVH FIXME: The following doesn't map MMIO ranges when they sit above the
>> > + *            highest E820 covered address.
>> 
>> This absolutely needs fixing before this can go in.
> 
> Any suggestions on how to fix it? Mapping all the way to end could
> result in a huge hap table. 

You'll probably need a call down from Dom0 telling you where it
finds/puts MMIO resources. Or perhaps that could be mapped
in on demand from the EPT fault handler (since these regions
shouldn't be subject to DMA, and hence IOMMU faults shouldn't
occur - perhaps that's even a reason to not share page tables
at least in dom0-strict mode)?

>> > +    const struct e820entry *entry;
>> > +    unsigned int i, nump;
>> > +    int rc;
>> > +
>> > +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
>> > +    {
>> > +        end = entry->addr + entry->size;
>> > +
>> > +        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
>> 
>> The inclusion of E820_UNUSABLE here clearly needs an explanation
>> (ideally in the shape of a code comment).
>> 
>> > +             i == e820.nr_map - 1 )
>> > +        {
>> > +            start_pfn = PFN_DOWN(start);
>> > +            end_pfn = PFN_UP(end);
>> > +
>> > +            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
>> > +                end_pfn = PFN_UP(entry->addr);
>> 
>> Better coded as if/else construct (making more obvious what the
>> intention here is - it took me quite some time to see that what
>> you're doing here is correct, because the general case really is
>> what sits in the if() body, and the exception is what gets done
>> before the if(), which is counter intuitive).
> 
> Based on xen_set_identity_and_release linux function. Took me a while
> to figure that one too. So, I tried simplifying it, but in the end
> came back to this. Not sure where I would the else.

            if ( entry->type != E820_RAM && entry->type != E820_UNUSABLE )
                end_pfn = PFN_UP(end);
            else
                end_pfn = PFN_UP(entry->addr);

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-27  1:55     ` Mukesh Rathor
@ 2013-09-27  7:01       ` Jan Beulich
  2013-09-27 23:03         ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-09-27  7:01 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich" <JBeulich@suse.com> wrote:
>> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > +    sav_guest_l4 = *l4tab;
>> > +
>> > +    /* Give guest a clean slate to start with */
>> > +    clear_page(l4start);
>> > +    *l4tab = sav_guest_l4;
>> > +    BUG_ON(!l4e_get_pfn(sav_guest_l4));
>> 
>> This assumes that the initial mapping is covered by a single L4
>> entry. While true for Linux, this is not generic, and hence needs
>> fixing (the problem continues further down).
> 
> True. How about I just create the opposite of init_guest_l4_table,
> ie, clear_guest_l4_table and just clear out xen entries?

I'd prefer you clearing everything below and above the range you
need mapped, instead of using clear_page().

>> > @@ -868,6 +1016,9 @@ int __init construct_dom0(
>> >                                      L1_PROT : COMPAT_L1_PROT));
>> >          l1tab++;
>> >  
>> > +        if ( is_pvh_domain(d) )
>> > +            continue;
>> > +
>> >          page = mfn_to_page(mfn);
>> >          if ( (page->u.inuse.type_info == 0) &&
>> >               !get_page_and_type(page, d, PGT_writable_page) )
>> 
>> So why is the remaining part of this loop not applicable to PVH?
> 
> My bad, looks like it should be. I'll remove it. BTW, looking at it again
> I realized we don't really need to set type_info to PGT* for PVH, but it's 
> harmless I guess. Should I just leave it or condition them for PV only?

No, I'm pretty certain you want them marked writable. If nothing
else then for forward compatibility with an eventual change
needing to mark certain pages R/O. But I could also imaging the
page sharing code to look at this attribute of a page (but I say
this without knowing that code at all).

>> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
>> >      regs->eip = parms.virt_entry;
>> >      regs->esp = vstack_end;
>> >      regs->esi = vstartinfo_start;
>> > -    regs->eflags = X86_EFLAGS_IF;
>> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
>> 
>> Unrelated change?
> 
> Nop, we need to make sure the resvd bit is set in eflags otherwise
> it won't vmenter (invalid guest state). Should be harmless for PV,
> right? Not sure where it does it for PV before actually scheduling it..

PV doesn't set this anywhere - the hardware doesn't allow the
flag to be cleared (writes are ignored). If VMENTER is picky
about this, the GUEST_RFLAGS write at the end of
vmx_vmenter_helper() should be doing this instead of having to
do it here (and obviously in some other place for DomU creation).

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-27  7:01       ` Jan Beulich
@ 2013-09-27 23:03         ` Mukesh Rathor
  2013-09-30  6:56           ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-09-27 23:03 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, 27 Sep 2013 08:01:16 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
> > <JBeulich@suse.com> wrote:
> >> > @@ -868,6 +1016,9 @@ int __init construct_dom0(
> >> >                                      L1_PROT : COMPAT_L1_PROT));
> >> >          l1tab++;
> >> >  
> >> > +        if ( is_pvh_domain(d) )
> >> > +            continue;
> >> > +
> >> >          page = mfn_to_page(mfn);
> >> >          if ( (page->u.inuse.type_info == 0) &&
> >> >               !get_page_and_type(page, d, PGT_writable_page) )
> >> 
> >> So why is the remaining part of this loop not applicable to PVH?
> > 
> > My bad, looks like it should be. I'll remove it. BTW, looking at it
> > again I realized we don't really need to set type_info to PGT* for
> > PVH, but it's harmless I guess. Should I just leave it or condition
> > them for PV only?
> 
> No, I'm pretty certain you want them marked writable. If nothing
> else then for forward compatibility with an eventual change
> needing to mark certain pages R/O. But I could also imaging the
> page sharing code to look at this attribute of a page (but I say
> this without knowing that code at all).

Sorry, I meant PGT_l*_page_table settings for type_info. Yes, we do 
need the PGT_writable_page settings.

> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
> >> >      regs->eip = parms.virt_entry;
> >> >      regs->esp = vstack_end;
> >> >      regs->esi = vstartinfo_start;
> >> > -    regs->eflags = X86_EFLAGS_IF;
> >> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
> >> 
> >> Unrelated change?
> > 
> > Nop, we need to make sure the resvd bit is set in eflags otherwise
> > it won't vmenter (invalid guest state). Should be harmless for PV,
> > right? Not sure where it does it for PV before actually scheduling
> > it..
> 
> PV doesn't set this anywhere - the hardware doesn't allow the
> flag to be cleared (writes are ignored). If VMENTER is picky
> about this, the GUEST_RFLAGS write at the end of
> vmx_vmenter_helper() should be doing this instead of having to
> do it here (and obviously in some other place for DomU creation).

For domU we set it in arch_set_info_guest. vmx_vmenter_helper gets
called on every vmentry, we just need this setting once. So I think
this is the best place. Do you want me to if it:

regs->eflags = X86_EFLAGS_IF;
if ( pvh )
    regs->eflags |= 0x2.

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-27 23:03         ` Mukesh Rathor
@ 2013-09-30  6:56           ` Jan Beulich
  2013-10-08  0:52             ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-09-30  6:56 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Fri, 27 Sep 2013 08:01:16 +0100
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>> >>> On 27.09.13 at 03:55, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >>> wrote:
>> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
>> > <JBeulich@suse.com> wrote:
>> >> > @@ -868,6 +1016,9 @@ int __init construct_dom0(
>> >> >                                      L1_PROT : COMPAT_L1_PROT));
>> >> >          l1tab++;
>> >> >  
>> >> > +        if ( is_pvh_domain(d) )
>> >> > +            continue;
>> >> > +
>> >> >          page = mfn_to_page(mfn);
>> >> >          if ( (page->u.inuse.type_info == 0) &&
>> >> >               !get_page_and_type(page, d, PGT_writable_page) )
>> >> 
>> >> So why is the remaining part of this loop not applicable to PVH?
>> > 
>> > My bad, looks like it should be. I'll remove it. BTW, looking at it
>> > again I realized we don't really need to set type_info to PGT* for
>> > PVH, but it's harmless I guess. Should I just leave it or condition
>> > them for PV only?
>> 
>> No, I'm pretty certain you want them marked writable. If nothing
>> else then for forward compatibility with an eventual change
>> needing to mark certain pages R/O. But I could also imaging the
>> page sharing code to look at this attribute of a page (but I say
>> this without knowing that code at all).
> 
> Sorry, I meant PGT_l*_page_table settings for type_info. Yes, we do 
> need the PGT_writable_page settings.
> 
>> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
>> >> >      regs->eip = parms.virt_entry;
>> >> >      regs->esp = vstack_end;
>> >> >      regs->esi = vstartinfo_start;
>> >> > -    regs->eflags = X86_EFLAGS_IF;
>> >> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
>> >> 
>> >> Unrelated change?
>> > 
>> > Nop, we need to make sure the resvd bit is set in eflags otherwise
>> > it won't vmenter (invalid guest state). Should be harmless for PV,
>> > right? Not sure where it does it for PV before actually scheduling
>> > it..
>> 
>> PV doesn't set this anywhere - the hardware doesn't allow the
>> flag to be cleared (writes are ignored). If VMENTER is picky
>> about this, the GUEST_RFLAGS write at the end of
>> vmx_vmenter_helper() should be doing this instead of having to
>> do it here (and obviously in some other place for DomU creation).
> 
> For domU we set it in arch_set_info_guest.

Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep
canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did
this adjustment without really explaining why it can't be done
centrally in just the two places copying regs->eflags into the
VMCS/VMCB spot.

> vmx_vmenter_helper gets
> called on every vmentry, we just need this setting once.

Would a debugger update guest state via arch_set_info_guest()?
I doubt it. It would imo be a desirable up front cleanup patch to
move this bogus thing out of arch_set_info_guest() into
vmx_vmenter_helper() (and whatever SVM equivalent, should
SVM too be incapable of dealing with the flag being clear). See
how e.g. hvm_load_cpu_ctxt() already sets the flag? It's really
like being done almost at random...

The only place where it gets legitimately enforced outside of
the vmx_vmenter_helper() is in the x86 emulator code.

And if we'd have such a cleanup patch, doing away with the literal
2 in favor of a proper symbolic (e.g. X86_EFLAGS_MBS) should
probably be done at once.

> So I think this is the best place. Do you want me to if it:
> 
> regs->eflags = X86_EFLAGS_IF;
> if ( pvh )
>     regs->eflags |= 0x2.

No, that would be pointless.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-27  6:54       ` Jan Beulich
@ 2013-10-03  0:53         ` Mukesh Rathor
  2013-10-04  6:53           ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-03  0:53 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, 27 Sep 2013 07:54:39 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
> > <JBeulich@suse.com> wrote:
> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >>> wrote:
> >> > +/*
> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
> >> > dom0 will have
> >> > + * the entire io region mapped in the EPT/NPT.
> >> > + *
> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
> >> > sit above the
> >> > + *            highest E820 covered address.
> >> 
> >> This absolutely needs fixing before this can go in.
> > 
> > Any suggestions on how to fix it? Mapping all the way to end could
> > result in a huge hap table. 
> 
> You'll probably need a call down from Dom0 telling you where it
> finds/puts MMIO resources. Or perhaps that could be mapped
> in on demand from the EPT fault handler (since these regions
> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
> occur - perhaps that's even a reason to not share page tables
> at least in dom0-strict mode)?

Thinking about mapping in on demand from the EPT fault handler, how
would I know if the access beyond last e820 entry is genuine and not 
a faulty pte in a buggy guest? Could I consult the mmconfig table (?) 
or the ACPI table in xen? Any pointers would be helpful... my 
knowledge runs out quickly here.

FWIW, at present pv-ops linux doesn't allow any mmio access beyond
the last e820 entry. So, we'd need a fix there too. In my very orig
patch, I was updating all IO mappings on demand by putting hook
in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
possibility would be do that for any mappings above the last
e820 entry. What do you think?

For testing purposes, do you have reference for hardware? I don't see 
any here with such configuration.

thanks
mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-03  0:53         ` Mukesh Rathor
@ 2013-10-04  6:53           ` Jan Beulich
  2013-10-04 13:35             ` Konrad Rzeszutek Wilk
  2013-10-08  0:58             ` Mukesh Rathor
  0 siblings, 2 replies; 40+ messages in thread
From: Jan Beulich @ 2013-10-04  6:53 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Fri, 27 Sep 2013 07:54:39 +0100
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >>> wrote:
>> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
>> > <JBeulich@suse.com> wrote:
>> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >> >>> wrote:
>> >> > +/*
>> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
>> >> > dom0 will have
>> >> > + * the entire io region mapped in the EPT/NPT.
>> >> > + *
>> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
>> >> > sit above the
>> >> > + *            highest E820 covered address.
>> >> 
>> >> This absolutely needs fixing before this can go in.
>> > 
>> > Any suggestions on how to fix it? Mapping all the way to end could
>> > result in a huge hap table. 
>> 
>> You'll probably need a call down from Dom0 telling you where it
>> finds/puts MMIO resources. Or perhaps that could be mapped
>> in on demand from the EPT fault handler (since these regions
>> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
>> occur - perhaps that's even a reason to not share page tables
>> at least in dom0-strict mode)?
> 
> Thinking about mapping in on demand from the EPT fault handler, how
> would I know if the access beyond last e820 entry is genuine and not 
> a faulty pte in a buggy guest? Could I consult the mmconfig table (?) 
> or the ACPI table in xen? Any pointers would be helpful... my 
> knowledge runs out quickly here.

You'd have to inspect all the BARs of the devices the domain owns.
Hence the thought of having Dom0 tell you about those resource
assignments.

> FWIW, at present pv-ops linux doesn't allow any mmio access beyond
> the last e820 entry. So, we'd need a fix there too. In my very orig
> patch, I was updating all IO mappings on demand by putting hook
> in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
> possibility would be do that for any mappings above the last
> e820 entry. What do you think?

Special casing IOMAP page table creation might be an option, but
has the downside of allowing kernel bugs to propagate into Xen's
view of the world.

> For testing purposes, do you have reference for hardware? I don't see 
> any here with such configuration.

Nothing specific, but I know that SR-IOV virtual functions easily
cause kernels to run out of MMIO space below 4G (namely when
the hole is only around 1Gb or even less), and Intel must have
knowledge of graphics cards having so huge a frame buffer that
it can only be mapped above 4G.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04  6:53           ` Jan Beulich
@ 2013-10-04 13:35             ` Konrad Rzeszutek Wilk
  2013-10-04 14:05               ` Jan Beulich
  2013-10-08  0:58             ` Mukesh Rathor
  1 sibling, 1 reply; 40+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-10-04 13:35 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote:
> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> > On Fri, 27 Sep 2013 07:54:39 +0100
> > "Jan Beulich" <JBeulich@suse.com> wrote:
> > 
> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >>> wrote:
> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
> >> > <JBeulich@suse.com> wrote:
> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >> >>> wrote:
> >> >> > +/*
> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
> >> >> > dom0 will have
> >> >> > + * the entire io region mapped in the EPT/NPT.
> >> >> > + *
> >> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
> >> >> > sit above the
> >> >> > + *            highest E820 covered address.
> >> >> 
> >> >> This absolutely needs fixing before this can go in.
> >> > 
> >> > Any suggestions on how to fix it? Mapping all the way to end could
> >> > result in a huge hap table. 
> >> 
> >> You'll probably need a call down from Dom0 telling you where it
> >> finds/puts MMIO resources. Or perhaps that could be mapped
> >> in on demand from the EPT fault handler (since these regions
> >> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
> >> occur - perhaps that's even a reason to not share page tables
> >> at least in dom0-strict mode)?
> > 
> > Thinking about mapping in on demand from the EPT fault handler, how
> > would I know if the access beyond last e820 entry is genuine and not 
> > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) 
> > or the ACPI table in xen? Any pointers would be helpful... my 
> > knowledge runs out quickly here.
> 
> You'd have to inspect all the BARs of the devices the domain owns.
> Hence the thought of having Dom0 tell you about those resource
> assignments.

Doesn't that happen via PHYSDEVOP_pci_device_add hypercalls?
> 
> > FWIW, at present pv-ops linux doesn't allow any mmio access beyond
> > the last e820 entry. So, we'd need a fix there too. In my very orig
> > patch, I was updating all IO mappings on demand by putting hook
> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
> > possibility would be do that for any mappings above the last
> > e820 entry. What do you think?
> 
> Special casing IOMAP page table creation might be an option, but
> has the downside of allowing kernel bugs to propagate into Xen's
> view of the world.
> 
> > For testing purposes, do you have reference for hardware? I don't see 
> > any here with such configuration.
> 
> Nothing specific, but I know that SR-IOV virtual functions easily
> cause kernels to run out of MMIO space below 4G (namely when
> the hole is only around 1Gb or even less), and Intel must have
> knowledge of graphics cards having so huge a frame buffer that
> it can only be mapped above 4G.

Right, but the BIOS Writers Guide and docs all talk setting the MCFG
up for that. Granted the MCFG (or was the ACPI spec?) says that the 
MCFG regions do not have to be defined in the E820.

You pointed out also that the MCFG entries might come out from
the ACPI DSDT. Which I think all comes back to dom0 parsing this and
providing this sort of information back to the hypervisor?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04 13:35             ` Konrad Rzeszutek Wilk
@ 2013-10-04 14:05               ` Jan Beulich
  2013-10-04 16:02                 ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-04 14:05 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, keir.xen

>>> On 04.10.13 at 15:35, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote:
>> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
>> > On Fri, 27 Sep 2013 07:54:39 +0100
>> > "Jan Beulich" <JBeulich@suse.com> wrote:
>> > 
>> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >> >>> wrote:
>> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
>> >> > <JBeulich@suse.com> wrote:
>> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >> >> >>> wrote:
>> >> >> > +/*
>> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
>> >> >> > dom0 will have
>> >> >> > + * the entire io region mapped in the EPT/NPT.
>> >> >> > + *
>> >> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
>> >> >> > sit above the
>> >> >> > + *            highest E820 covered address.
>> >> >> 
>> >> >> This absolutely needs fixing before this can go in.
>> >> > 
>> >> > Any suggestions on how to fix it? Mapping all the way to end could
>> >> > result in a huge hap table. 
>> >> 
>> >> You'll probably need a call down from Dom0 telling you where it
>> >> finds/puts MMIO resources. Or perhaps that could be mapped
>> >> in on demand from the EPT fault handler (since these regions
>> >> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
>> >> occur - perhaps that's even a reason to not share page tables
>> >> at least in dom0-strict mode)?
>> > 
>> > Thinking about mapping in on demand from the EPT fault handler, how
>> > would I know if the access beyond last e820 entry is genuine and not 
>> > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) 
>> > or the ACPI table in xen? Any pointers would be helpful... my 
>> > knowledge runs out quickly here.
>> 
>> You'd have to inspect all the BARs of the devices the domain owns.
>> Hence the thought of having Dom0 tell you about those resource
>> assignments.
> 
> Doesn't that happen via PHYSDEVOP_pci_device_add hypercalls?

That may (and I think does) happen before resource assignment.

>> > FWIW, at present pv-ops linux doesn't allow any mmio access beyond
>> > the last e820 entry. So, we'd need a fix there too. In my very orig
>> > patch, I was updating all IO mappings on demand by putting hook
>> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
>> > possibility would be do that for any mappings above the last
>> > e820 entry. What do you think?
>> 
>> Special casing IOMAP page table creation might be an option, but
>> has the downside of allowing kernel bugs to propagate into Xen's
>> view of the world.
>> 
>> > For testing purposes, do you have reference for hardware? I don't see 
>> > any here with such configuration.
>> 
>> Nothing specific, but I know that SR-IOV virtual functions easily
>> cause kernels to run out of MMIO space below 4G (namely when
>> the hole is only around 1Gb or even less), and Intel must have
>> knowledge of graphics cards having so huge a frame buffer that
>> it can only be mapped above 4G.
> 
> Right, but the BIOS Writers Guide and docs all talk setting the MCFG
> up for that. Granted the MCFG (or was the ACPI spec?) says that the 
> MCFG regions do not have to be defined in the E820.

What do MCFG regions have to do with device MMIO ones?

> You pointed out also that the MCFG entries might come out from
> the ACPI DSDT. Which I think all comes back to dom0 parsing this and
> providing this sort of information back to the hypervisor?

For the MCFG, yes. But not for individual BARs of devices.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04 14:05               ` Jan Beulich
@ 2013-10-04 16:02                 ` Konrad Rzeszutek Wilk
  2013-10-04 16:07                   ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-10-04 16:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, Oct 04, 2013 at 03:05:57PM +0100, Jan Beulich wrote:
> >>> On 04.10.13 at 15:35, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > On Fri, Oct 04, 2013 at 07:53:20AM +0100, Jan Beulich wrote:
> >> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> >> > On Fri, 27 Sep 2013 07:54:39 +0100
> >> > "Jan Beulich" <JBeulich@suse.com> wrote:
> >> > 
> >> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >> >>> wrote:
> >> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
> >> >> > <JBeulich@suse.com> wrote:
> >> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >> >> >>> wrote:
> >> >> >> > +/*
> >> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
> >> >> >> > dom0 will have
> >> >> >> > + * the entire io region mapped in the EPT/NPT.
> >> >> >> > + *
> >> >> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
> >> >> >> > sit above the
> >> >> >> > + *            highest E820 covered address.
> >> >> >> 
> >> >> >> This absolutely needs fixing before this can go in.
> >> >> > 
> >> >> > Any suggestions on how to fix it? Mapping all the way to end could
> >> >> > result in a huge hap table. 
> >> >> 
> >> >> You'll probably need a call down from Dom0 telling you where it
> >> >> finds/puts MMIO resources. Or perhaps that could be mapped
> >> >> in on demand from the EPT fault handler (since these regions
> >> >> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
> >> >> occur - perhaps that's even a reason to not share page tables
> >> >> at least in dom0-strict mode)?
> >> > 
> >> > Thinking about mapping in on demand from the EPT fault handler, how
> >> > would I know if the access beyond last e820 entry is genuine and not 
> >> > a faulty pte in a buggy guest? Could I consult the mmconfig table (?) 
> >> > or the ACPI table in xen? Any pointers would be helpful... my 
> >> > knowledge runs out quickly here.
> >> 
> >> You'd have to inspect all the BARs of the devices the domain owns.
> >> Hence the thought of having Dom0 tell you about those resource
> >> assignments.
> > 
> > Doesn't that happen via PHYSDEVOP_pci_device_add hypercalls?
> 
> That may (and I think does) happen before resource assignment.
> 
> >> > FWIW, at present pv-ops linux doesn't allow any mmio access beyond
> >> > the last e820 entry. So, we'd need a fix there too. In my very orig
> >> > patch, I was updating all IO mappings on demand by putting hook
> >> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
> >> > possibility would be do that for any mappings above the last
> >> > e820 entry. What do you think?
> >> 
> >> Special casing IOMAP page table creation might be an option, but
> >> has the downside of allowing kernel bugs to propagate into Xen's
> >> view of the world.
> >> 
> >> > For testing purposes, do you have reference for hardware? I don't see 
> >> > any here with such configuration.
> >> 
> >> Nothing specific, but I know that SR-IOV virtual functions easily
> >> cause kernels to run out of MMIO space below 4G (namely when
> >> the hole is only around 1Gb or even less), and Intel must have
> >> knowledge of graphics cards having so huge a frame buffer that
> >> it can only be mapped above 4G.
> > 
> > Right, but the BIOS Writers Guide and docs all talk setting the MCFG
> > up for that. Granted the MCFG (or was the ACPI spec?) says that the 
> > MCFG regions do not have to be defined in the E820.
> 
> What do MCFG regions have to do with device MMIO ones?

Actually - nothing at all. I somehow was under the impression that
MCFG and MMIO regions would be in the same memory area (as in
MCFG follows the end of MMIO region). But of course
nothing would be that simple.
> 
> > You pointed out also that the MCFG entries might come out from
> > the ACPI DSDT. Which I think all comes back to dom0 parsing this and
> > providing this sort of information back to the hypervisor?
> 
> For the MCFG, yes. But not for individual BARs of devices.

So back to hooking up a new hypercall in the PCI subsystem when
resource assigment has been completed? And also if the PCI subsystem
decides to re-write the resource addresses to odd locations.

Can't one also trap for the configuration changes on the PCI
devices and extract the physical locations then?

> 
> Jan
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04 16:02                 ` Konrad Rzeszutek Wilk
@ 2013-10-04 16:07                   ` Jan Beulich
  2013-10-04 20:59                     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-04 16:07 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, keir.xen

>>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> So back to hooking up a new hypercall in the PCI subsystem when
> resource assigment has been completed? And also if the PCI subsystem
> decides to re-write the resource addresses to odd locations.
> 
> Can't one also trap for the configuration changes on the PCI
> devices and extract the physical locations then?

Yes, of course we could be snooping the CFG writes, but that's
as simple as it sounds only for the port CF8 based accesses. For
MCFG based accesses it would mean we'd have to write protect
the whole MCFG range, and use emulation there. Not very
pretty, but doable.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04 16:07                   ` Jan Beulich
@ 2013-10-04 20:59                     ` Konrad Rzeszutek Wilk
  2013-10-05  1:06                       ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-10-04 20:59 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote:
> >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > So back to hooking up a new hypercall in the PCI subsystem when
> > resource assigment has been completed? And also if the PCI subsystem
> > decides to re-write the resource addresses to odd locations.
> > 
> > Can't one also trap for the configuration changes on the PCI
> > devices and extract the physical locations then?
> 
> Yes, of course we could be snooping the CFG writes, but that's
> as simple as it sounds only for the port CF8 based accesses. For
> MCFG based accesses it would mean we'd have to write protect
> the whole MCFG range, and use emulation there. Not very
> pretty, but doable.

Hypervisor call is then a more appropiate if it can be done (Mukesh
says that v0 of the patch had something like that in so he will try
to recreate it) and then as a fallback we could do the emulation.

> 
> Jan
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04 20:59                     ` Konrad Rzeszutek Wilk
@ 2013-10-05  1:06                       ` Mukesh Rathor
  2013-10-07  7:12                         ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-05  1:06 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, keir.xen, Jan Beulich

On Fri, 4 Oct 2013 16:59:53 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote:
> > >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk
> > >>> <konrad.wilk@oracle.com> wrote:
> > > So back to hooking up a new hypercall in the PCI subsystem when
> > > resource assigment has been completed? And also if the PCI
> > > subsystem decides to re-write the resource addresses to odd
> > > locations.
> > > 
> > > Can't one also trap for the configuration changes on the PCI
> > > devices and extract the physical locations then?
> > 
> > Yes, of course we could be snooping the CFG writes, but that's
> > as simple as it sounds only for the port CF8 based accesses. For
> > MCFG based accesses it would mean we'd have to write protect
> > the whole MCFG range, and use emulation there. Not very
> > pretty, but doable.
> 
> Hypervisor call is then a more appropiate if it can be done (Mukesh
> says that v0 of the patch had something like that in so he will try
> to recreate it) and then as a fallback we could do the emulation.

Right, looks like it has to be guest driven. xen can provide the
facility via the PHYSDEVOP_map_iomem I had in V0. For linux, we will do 
such mappings above the highest e820 entry via the ioremap path. 

As a result of this, we don't need to change pvh_map_all_iomem() and
it will continue to map upto the highest e820 entry. I will change
the comment:

 * PVH FIXME: The following doesn't map MMIO ranges when they sit above the
 *            highest E820 covered address.

to

 * Note: we map the ranges upto 4GB or the last e820 entry, whichever is
 *       higher. Any ranges beyond are mapped by the guest via
 *       PHYSDEVOP_map_iomem.

thanks
Mukesh


diff -r 774a211e6b8f -r 2c728d96f876 xen/arch/x86/physdev.c
--- a/xen/arch/x86/physdev.c    Wed Jan 30 12:30:34 2013 -0800
+++ b/xen/arch/x86/physdev.c    Mon Jan 28 15:30:45 2013 -0800
@@ -740,6 +740,25 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         break;
     }
 
+    case PHYSDEVOP_map_iomem : {
+
+       struct physdev_map_iomem iomem;
+        struct domain *d = current->domain;
+
+        ret = -EPERM;
+        if ( !IS_PRIV(d) || !is_pvh_domain(d))
+            break;
+        d = rcu_lock_current_domain();
+        
+        ret = -EFAULT;
+        if ( copy_from_guest(&iomem, arg, 1) != 0 )
+            break;
+
+       ret = domctl_memory_mapping(d, iomem.first_gfn, iomem.first_mfn, 
+                                   iomem.nr_mfns, iomem.add_mapping);
+        break;
+    }
+
     default:
         ret = -ENOSYS;
         break;
diff -r 774a211e6b8f -r 2c728d96f876 xen/include/public/physdev.h
--- a/xen/include/public/physdev.h      Wed Jan 30 12:30:34 2013 -0800
+++ b/xen/include/public/physdev.h      Mon Jan 28 15:30:45 2013 -0800
@@ -330,6 +330,20 @@ struct physdev_dbgp_op {
 typedef struct physdev_dbgp_op physdev_dbgp_op_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t);
 
+ 
+/* Map given gfns to mfns where mfns are part of IO space. */
+#define PHYSDEVOP_map_iomem        30
+struct physdev_map_iomem {
+    /* IN */
+    uint64_t first_gfn;
+    uint64_t first_mfn;
+    uint32_t nr_mfns;
+    uint32_t add_mapping;        /* 1 == add mapping;  0 == unmap */
+
+};
+typedef struct physdev_map_iomem physdev_map_iomem_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_map_iomem_t);
+
 /*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-05  1:06                       ` Mukesh Rathor
@ 2013-10-07  7:12                         ` Jan Beulich
  0 siblings, 0 replies; 40+ messages in thread
From: Jan Beulich @ 2013-10-07  7:12 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk, Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 05.10.13 at 03:06, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Fri, 4 Oct 2013 16:59:53 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
>> On Fri, Oct 04, 2013 at 05:07:59PM +0100, Jan Beulich wrote:
>> > >>> On 04.10.13 at 18:02, Konrad Rzeszutek Wilk
>> > >>> <konrad.wilk@oracle.com> wrote:
>> > > Can't one also trap for the configuration changes on the PCI
>> > > devices and extract the physical locations then?
>> > 
>> > Yes, of course we could be snooping the CFG writes, but that's
>> > as simple as it sounds only for the port CF8 based accesses. For
>> > MCFG based accesses it would mean we'd have to write protect
>> > the whole MCFG range, and use emulation there. Not very
>> > pretty, but doable.
>> 
>> Hypervisor call is then a more appropiate if it can be done (Mukesh
>> says that v0 of the patch had something like that in so he will try
>> to recreate it) and then as a fallback we could do the emulation.
> 
> Right, looks like it has to be guest driven. xen can provide the
> facility via the PHYSDEVOP_map_iomem I had in V0. For linux, we will do 
> such mappings above the highest e820 entry via the ioremap path. 

As I tried to make clear before - this is wrong both conceptually and
technically: Conceptually because there's no need for a driver to
ioremap() all MMIO ranges in order for a device to make use of them.
And technically because you would allow bad arguments passed to
ioremap() propagate into bad IOMMU mappings, while the IOMMU is
precisely there to catch bad accesses (even for Dom0, at least in
dom0-strict mode).

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-09-30  6:56           ` Jan Beulich
@ 2013-10-08  0:52             ` Mukesh Rathor
  2013-10-08  7:43               ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-08  0:52 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Mon, 30 Sep 2013 07:56:30 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > On Fri, 27 Sep 2013 08:01:16 +0100
> > "Jan Beulich" <JBeulich@suse.com> wrote:
.......
> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
> >> >> >      regs->eip = parms.virt_entry;
> >> >> >      regs->esp = vstack_end;
> >> >> >      regs->esi = vstartinfo_start;
> >> >> > -    regs->eflags = X86_EFLAGS_IF;
> >> >> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
> >> >> 
> >> >> Unrelated change?
> >> > 
> >> > Nop, we need to make sure the resvd bit is set in eflags
> >> > otherwise it won't vmenter (invalid guest state). Should be
> >> > harmless for PV, right? Not sure where it does it for PV before
> >> > actually scheduling it..
> >> 
> >> PV doesn't set this anywhere - the hardware doesn't allow the
> >> flag to be cleared (writes are ignored). If VMENTER is picky
> >> about this, the GUEST_RFLAGS write at the end of
> >> vmx_vmenter_helper() should be doing this instead of having to
> >> do it here (and obviously in some other place for DomU creation).
> > 
> > For domU we set it in arch_set_info_guest.
> 
> Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep
> canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did
> this adjustment without really explaining why it can't be done
> centrally in just the two places copying regs->eflags into the
> VMCS/VMCB spot.

I beg to differ.... such nit picking is equally bogus IMHO. The
bit needs to be set once, putting it in vmx_vmenter_helper adds an
unnecessary slowdown IMO. 

> > vmx_vmenter_helper gets
> > called on every vmentry, we just need this setting once.
> 
> Would a debugger update guest state via arch_set_info_guest()?
> I doubt it. It would imo be a desirable up front cleanup patch to
> move this bogus thing out of arch_set_info_guest() into
> vmx_vmenter_helper() (and whatever SVM equivalent, should
> SVM too be incapable of dealing with the flag being clear). See
> how e.g. hvm_load_cpu_ctxt() already sets the flag? It's really
> like being done almost at random...

The debugger would always read eflags, muck with only
the bits it needs to, leaving the resvd bit as is, then send it down.

> The only place where it gets legitimately enforced outside of
> the vmx_vmenter_helper() is in the x86 emulator code.
> 
> And if we'd have such a cleanup patch, doing away with the literal
> 2 in favor of a proper symbolic (e.g. X86_EFLAGS_MBS) should
> probably be done at once.

Having X86_EFLAGS_MBS makes sense.

> > So I think this is the best place. Do you want me to if it:
> > 
> > regs->eflags = X86_EFLAGS_IF;
> > if ( pvh )
> >     regs->eflags |= 0x2.
> 
> No, that would be pointless.


Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-04  6:53           ` Jan Beulich
  2013-10-04 13:35             ` Konrad Rzeszutek Wilk
@ 2013-10-08  0:58             ` Mukesh Rathor
  2013-10-08  7:51               ` Jan Beulich
  1 sibling, 1 reply; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-08  0:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Fri, 04 Oct 2013 07:53:20 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 03.10.13 at 02:53, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > On Fri, 27 Sep 2013 07:54:39 +0100
> > "Jan Beulich" <JBeulich@suse.com> wrote:
> > 
> >> >>> On 27.09.13 at 02:17, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >>> wrote:
> >> > On Thu, 26 Sep 2013 09:02:41 +0100 "Jan Beulich"
> >> > <JBeulich@suse.com> wrote:
> >> >> >>> On 25.09.13 at 23:03, Mukesh Rathor
> >> >> >>> <mukesh.rathor@oracle.com> wrote:
> >> >> > +/*
> >> >> > + * Set the 1:1 map for all non-RAM regions for dom 0. Thus,
> >> >> > dom0 will have
> >> >> > + * the entire io region mapped in the EPT/NPT.
> >> >> > + *
> >> >> > + * PVH FIXME: The following doesn't map MMIO ranges when they
> >> >> > sit above the
> >> >> > + *            highest E820 covered address.
> >> >> 
> >> >> This absolutely needs fixing before this can go in.
> >> > 
> >> > Any suggestions on how to fix it? Mapping all the way to end
> >> > could result in a huge hap table. 
> >> 
> >> You'll probably need a call down from Dom0 telling you where it
> >> finds/puts MMIO resources. Or perhaps that could be mapped
> >> in on demand from the EPT fault handler (since these regions
> >> shouldn't be subject to DMA, and hence IOMMU faults shouldn't
> >> occur - perhaps that's even a reason to not share page tables
> >> at least in dom0-strict mode)?
> > 
> > Thinking about mapping in on demand from the EPT fault handler, how
> > would I know if the access beyond last e820 entry is genuine and
> > not a faulty pte in a buggy guest? Could I consult the mmconfig
> > table (?) or the ACPI table in xen? Any pointers would be
> > helpful... my knowledge runs out quickly here.
> 
> You'd have to inspect all the BARs of the devices the domain owns.
> Hence the thought of having Dom0 tell you about those resource
> assignments.
> 
> > FWIW, at present pv-ops linux doesn't allow any mmio access beyond
> > the last e820 entry. So, we'd need a fix there too. In my very orig
> > patch, I was updating all IO mappings on demand by putting hook
> > in linux native_pte_update if it was _PAGE_BIT_IOMAP. Another 
> > possibility would be do that for any mappings above the last
> > e820 entry. What do you think?
> 
> Special casing IOMAP page table creation might be an option, but
> has the downside of allowing kernel bugs to propagate into Xen's
> view of the world.
> 
> > For testing purposes, do you have reference for hardware? I don't
> > see any here with such configuration.
> 
> Nothing specific, but I know that SR-IOV virtual functions easily
> cause kernels to run out of MMIO space below 4G (namely when
> the hole is only around 1Gb or even less), and Intel must have
> knowledge of graphics cards having so huge a frame buffer that
> it can only be mapped above 4G.

In that case, I don't see why this is a MUST for the patch. Combined
with the fact that at present pv-ops doesn't even allow for mapping
above the last e820 entry, I think this can be left FIXME/bug-fix for 
near future that can be done relatively quickly by an expert in that
area if learning that takes me a long time.

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  0:52             ` Mukesh Rathor
@ 2013-10-08  7:43               ` Jan Beulich
  2013-10-09 21:59                 ` Mukesh Rathor
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-08  7:43 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 08.10.13 at 02:52, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Mon, 30 Sep 2013 07:56:30 +0100
> "Jan Beulich" <JBeulich@suse.com> wrote:
> 
>> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com>
>> >>> wrote:
>> > On Fri, 27 Sep 2013 08:01:16 +0100
>> > "Jan Beulich" <JBeulich@suse.com> wrote:
> .......
>> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
>> >> >> >      regs->eip = parms.virt_entry;
>> >> >> >      regs->esp = vstack_end;
>> >> >> >      regs->esi = vstartinfo_start;
>> >> >> > -    regs->eflags = X86_EFLAGS_IF;
>> >> >> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
>> >> >> 
>> >> >> Unrelated change?
>> >> > 
>> >> > Nop, we need to make sure the resvd bit is set in eflags
>> >> > otherwise it won't vmenter (invalid guest state). Should be
>> >> > harmless for PV, right? Not sure where it does it for PV before
>> >> > actually scheduling it..
>> >> 
>> >> PV doesn't set this anywhere - the hardware doesn't allow the
>> >> flag to be cleared (writes are ignored). If VMENTER is picky
>> >> about this, the GUEST_RFLAGS write at the end of
>> >> vmx_vmenter_helper() should be doing this instead of having to
>> >> do it here (and obviously in some other place for DomU creation).
>> > 
>> > For domU we set it in arch_set_info_guest.
>> 
>> Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep
>> canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did
>> this adjustment without really explaining why it can't be done
>> centrally in just the two places copying regs->eflags into the
>> VMCS/VMCB spot.
> 
> I beg to differ.... such nit picking is equally bogus IMHO. The
> bit needs to be set once, putting it in vmx_vmenter_helper adds an
> unnecessary slowdown IMO. 

An "or" being a measurable slowdown?

>> > vmx_vmenter_helper gets
>> > called on every vmentry, we just need this setting once.
>> 
>> Would a debugger update guest state via arch_set_info_guest()?
>> I doubt it. It would imo be a desirable up front cleanup patch to
>> move this bogus thing out of arch_set_info_guest() into
>> vmx_vmenter_helper() (and whatever SVM equivalent, should
>> SVM too be incapable of dealing with the flag being clear). See
>> how e.g. hvm_load_cpu_ctxt() already sets the flag? It's really
>> like being done almost at random...
> 
> The debugger would always read eflags, muck with only
> the bits it needs to, leaving the resvd bit as is, then send it down.

So you'd expect every debugger to be responsible for setting this
bit? Pretty odd a requirement, when it can be done centrally in a
single place, covering all cases.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  0:58             ` Mukesh Rathor
@ 2013-10-08  7:51               ` Jan Beulich
  2013-10-08  8:03                 ` Jan Beulich
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-08  7:51 UTC (permalink / raw)
  To: Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 08.10.13 at 02:58, Mukesh Rathor <mukesh.rathor@oracle.com> wrote:
> On Fri, 04 Oct 2013 07:53:20 +0100
> "Jan Beulich" <JBeulich@suse.com> wrote:
>> Nothing specific, but I know that SR-IOV virtual functions easily
>> cause kernels to run out of MMIO space below 4G (namely when
>> the hole is only around 1Gb or even less), and Intel must have
>> knowledge of graphics cards having so huge a frame buffer that
>> it can only be mapped above 4G.
> 
> In that case, I don't see why this is a MUST for the patch. Combined
> with the fact that at present pv-ops doesn't even allow for mapping
> above the last e820 entry, I think this can be left FIXME/bug-fix for 
> near future that can be done relatively quickly by an expert in that
> area if learning that takes me a long time.

Okay, so it seems you're back in Linux-only mode. I'm asking
for there not being fundamental holes. Dealing with the majority
of the "fixme"s you have already sprinkled around the code has
at least a clear path associated with it. Here, however, we're not
even certain how to properly deal with the problem.

And just to be clear - _any_ of the "fixme"s we intend to allow to
go in with this series is a compromise already, with - afaict - no
precedent. The more you ask for, the more likely it'll become for
at least me to start re-thinking about this. Over the past years I
think we made good progress in morphing Xen from an
experimental project to a stable, enterprise ready one. I'm not
eager to see us get pushed back significantly on that road.

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  7:51               ` Jan Beulich
@ 2013-10-08  8:03                 ` Jan Beulich
  2013-10-08  9:39                   ` George Dunlap
  0 siblings, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-08  8:03 UTC (permalink / raw)
  To: George Dunlap, Mukesh Rathor; +Cc: xen-devel, keir.xen

>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote:
> And just to be clear - _any_ of the "fixme"s we intend to allow to
> go in with this series is a compromise already, with - afaict - no
> precedent. The more you ask for, the more likely it'll become for
> at least me to start re-thinking about this. Over the past years I
> think we made good progress in morphing Xen from an
> experimental project to a stable, enterprise ready one. I'm not
> eager to see us get pushed back significantly on that road.

Considering the thread we're in - perhaps before adding Dom0
support with more hacks/fixme-s it would be more appropriate
to eliminate all the ones in the current pending series?

Question to both of you: What's the status of this anyway? If
I'm not mistaken there was (on IRC) talk of a regression the
latest series caused for HVM guests? Was this already sorted
out? Is there going to be an updated series getting us
reasonably close to finally get this in?

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  8:03                 ` Jan Beulich
@ 2013-10-08  9:39                   ` George Dunlap
  2013-10-08  9:57                     ` Jan Beulich
  2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 40+ messages in thread
From: George Dunlap @ 2013-10-08  9:39 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On 10/08/2013 09:03 AM, Jan Beulich wrote:
>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote:
>> And just to be clear - _any_ of the "fixme"s we intend to allow to
>> go in with this series is a compromise already, with - afaict - no
>> precedent. The more you ask for, the more likely it'll become for
>> at least me to start re-thinking about this. Over the past years I
>> think we made good progress in morphing Xen from an
>> experimental project to a stable, enterprise ready one. I'm not
>> eager to see us get pushed back significantly on that road.
>
> Considering the thread we're in - perhaps before adding Dom0
> support with more hacks/fixme-s it would be more appropriate
> to eliminate all the ones in the current pending series?
>
> Question to both of you: What's the status of this anyway? If
> I'm not mistaken there was (on IRC) talk of a regression the
> latest series caused for HVM guests? Was this already sorted
> out? Is there going to be an updated series getting us
> reasonably close to finally get this in?

Right now I've had to put PVH on the back burner: I've got a talk for 
LinuxCon EU to prepare, and two for XenSummit.

There are two bugs that have been identified -- the HVM crash due to a 
mistake in one of the "code motion" patches, and the bug with setting 
VMXE in the guest CR.  Tim has also noticed another functional change in 
the code-motion patch (which cannot, I think, be causing the HVM crash).

If it were just a question of cleaning up those bits, I could probably 
have another draft posted sometime this week.  But if we're stepping 
back and looking at whether this is the right approach, or whether 
something like Tim has suggested -- basically making PVH to be HVM minus 
qemu plus a handful of hypercalls, and most of the changes in the domain 
builder rather than in Xen -- that will take a bit longer, particularly 
because it would probably mean me having to understand and modify the 
Linux side of things as well.  At this point I'm not really sure what 
the best approach is going forward.

  -George

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  9:39                   ` George Dunlap
@ 2013-10-08  9:57                     ` Jan Beulich
  2013-10-08 10:01                       ` George Dunlap
  2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 40+ messages in thread
From: Jan Beulich @ 2013-10-08  9:57 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel, keir.xen

>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote:
> If it were just a question of cleaning up those bits, I could probably 
> have another draft posted sometime this week.  But if we're stepping 
> back and looking at whether this is the right approach, or whether 
> something like Tim has suggested -- basically making PVH to be HVM minus 
> qemu plus a handful of hypercalls, and most of the changes in the domain 
> builder rather than in Xen -- that will take a bit longer, particularly 
> because it would probably mean me having to understand and modify the 
> Linux side of things as well.  At this point I'm not really sure what 
> the best approach is going forward.

Indeed, this all sounds more like being in need of a discussion on
the summit then (which however would very likely mean that this
won't make 4.4).

Jan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  9:57                     ` Jan Beulich
@ 2013-10-08 10:01                       ` George Dunlap
  2013-10-08 10:19                         ` Lars Kurth
  0 siblings, 1 reply; 40+ messages in thread
From: George Dunlap @ 2013-10-08 10:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen, Lars Kurth

On 10/08/2013 10:57 AM, Jan Beulich wrote:
>>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote:
>> If it were just a question of cleaning up those bits, I could probably
>> have another draft posted sometime this week.  But if we're stepping
>> back and looking at whether this is the right approach, or whether
>> something like Tim has suggested -- basically making PVH to be HVM minus
>> qemu plus a handful of hypercalls, and most of the changes in the domain
>> builder rather than in Xen -- that will take a bit longer, particularly
>> because it would probably mean me having to understand and modify the
>> Linux side of things as well.  At this point I'm not really sure what
>> the best approach is going forward.
>
> Indeed, this all sounds more like being in need of a discussion on
> the summit then (which however would very likely mean that this
> won't make 4.4).

Hmm, which will make my "PVH technical deep-dive" talk a bit challenging 
to write. :-)

Lars, any suggestions?  I could just back out the talk (which would 
reduce my talk-writing load a bit), or I could try to talk about things 
in general and the issues we've been discussing.

  -George

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08 10:01                       ` George Dunlap
@ 2013-10-08 10:19                         ` Lars Kurth
  0 siblings, 0 replies; 40+ messages in thread
From: Lars Kurth @ 2013-10-08 10:19 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich; +Cc: xen-devel, keir.xen

George,

there are two options. 

Option 1: Withdraw the talk. 
Option 2: Don't withdraw the talk
* Have a discussion on Wednesday
* Cover the current design and highlight the issues and possible solutions.
* Have a Bof afterwards to get more community involvement. BoF slot 6, which is on the next day is still up for grabs. We could also see whether maybe Stefano Panella would swap his BoF slot such that the discussion could immediately follow your talk.

Option 2 potentially has the advantage of getting more people involved in PVH going forward. Option 1 feels a little closed.

Regards
Lars

-----Original Message-----
From: George Dunlap [mailto:george.dunlap@eu.citrix.com] 
Sent: 08 October 2013 11:01
To: Jan Beulich
Cc: keir.xen@gmail.com; xen-devel; Mukesh Rathor; Lars Kurth
Subject: Re: [Xen-devel] [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes

On 10/08/2013 10:57 AM, Jan Beulich wrote:
>>>> On 08.10.13 at 11:39, George Dunlap <george.dunlap@eu.citrix.com> wrote:
>> If it were just a question of cleaning up those bits, I could 
>> probably have another draft posted sometime this week.  But if we're 
>> stepping back and looking at whether this is the right approach, or 
>> whether something like Tim has suggested -- basically making PVH to 
>> be HVM minus qemu plus a handful of hypercalls, and most of the 
>> changes in the domain builder rather than in Xen -- that will take a 
>> bit longer, particularly because it would probably mean me having to 
>> understand and modify the Linux side of things as well.  At this 
>> point I'm not really sure what the best approach is going forward.
>
> Indeed, this all sounds more like being in need of a discussion on the 
> summit then (which however would very likely mean that this won't make 
> 4.4).

Hmm, which will make my "PVH technical deep-dive" talk a bit challenging to write. :-)

Lars, any suggestions?  I could just back out the talk (which would reduce my talk-writing load a bit), or I could try to talk about things in general and the issues we've been discussing.

  -George

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  9:39                   ` George Dunlap
  2013-10-08  9:57                     ` Jan Beulich
@ 2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
  2013-10-09 13:02                       ` George Dunlap
  2013-10-09 17:50                       ` Tim Deegan
  1 sibling, 2 replies; 40+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-10-08 12:30 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich; +Cc: xen-devel, keir.xen

George Dunlap <george.dunlap@eu.citrix.com> wrote:
>On 10/08/2013 09:03 AM, Jan Beulich wrote:
>>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote:
>>> And just to be clear - _any_ of the "fixme"s we intend to allow to
>>> go in with this series is a compromise already, with - afaict - no
>>> precedent. The more you ask for, the more likely it'll become for
>>> at least me to start re-thinking about this. Over the past years I
>>> think we made good progress in morphing Xen from an
>>> experimental project to a stable, enterprise ready one. I'm not
>>> eager to see us get pushed back significantly on that road.
>>
>> Considering the thread we're in - perhaps before adding Dom0
>> support with more hacks/fixme-s it would be more appropriate
>> to eliminate all the ones in the current pending series?
>>
>> Question to both of you: What's the status of this anyway? If
>> I'm not mistaken there was (on IRC) talk of a regression the
>> latest series caused for HVM guests? Was this already sorted
>> out? Is there going to be an updated series getting us
>> reasonably close to finally get this in?
>
>Right now I've had to put PVH on the back burner: I've got a talk for 
>LinuxCon EU to prepare, and two for XenSummit.
>
>There are two bugs that have been identified -- the HVM crash due to a 
>mistake in one of the "code motion" patches, and the bug with setting 
>VMXE in the guest CR.  Tim has also noticed another functional change
>in 
>the code-motion patch (which cannot, I think, be causing the HVM
>crash).

These issues sprang up with your patches,  not with the ones that Mukesh posted ? Which did get tested for regression and passed with flying colours? That means there is a working baseline. Would that help?
Mukesh do you have any ideas what might be amiss?

>
>If it were just a question of cleaning up those bits, I could probably 
>have another draft posted sometime this week.  But if we're stepping 
>back and looking at whether this is the right approach, or whether 
>something like Tim has suggested -- basically making PVH to be HVM
>minus 
>qemu plus a handful of hypercalls, and most of the changes in the
>domain 
>builder rather than in Xen -- that will take a bit longer, particularly
>
>because it would probably mean me having to understand and modify the 
>Linux side of things as well.  At this point I'm not really sure what 
>the best approach is going forward.

Arrg.  I am not really sure how to express myself here but from the start Mukesh has asking for assistance and review of ideas and design of this and gotten it and acted on it. Now after two years of going this path folks are suggesting a new design?

Sadly I won't be able to be on Wed (family matters) call but my opinion is that we should keep on heading this way. If we want HVM with less (so no QEMU) that could be a separate mode that would work in parallel with PVH and be a separate project. They should be able to coexist right?
>
>  -George
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xen.org
>http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
@ 2013-10-09 13:02                       ` George Dunlap
  2013-10-09 13:13                         ` Andrew Cooper
  2013-10-09 17:50                       ` Tim Deegan
  1 sibling, 1 reply; 40+ messages in thread
From: George Dunlap @ 2013-10-09 13:02 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, keir.xen, Jan Beulich

On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote:
> George Dunlap <george.dunlap@eu.citrix.com> wrote:
>> On 10/08/2013 09:03 AM, Jan Beulich wrote:
>>>>>> On 08.10.13 at 09:51, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>> And just to be clear - _any_ of the "fixme"s we intend to allow to
>>>> go in with this series is a compromise already, with - afaict - no
>>>> precedent. The more you ask for, the more likely it'll become for
>>>> at least me to start re-thinking about this. Over the past years I
>>>> think we made good progress in morphing Xen from an
>>>> experimental project to a stable, enterprise ready one. I'm not
>>>> eager to see us get pushed back significantly on that road.
>>>
>>> Considering the thread we're in - perhaps before adding Dom0
>>> support with more hacks/fixme-s it would be more appropriate
>>> to eliminate all the ones in the current pending series?
>>>
>>> Question to both of you: What's the status of this anyway? If
>>> I'm not mistaken there was (on IRC) talk of a regression the
>>> latest series caused for HVM guests? Was this already sorted
>>> out? Is there going to be an updated series getting us
>>> reasonably close to finally get this in?
>>
>> Right now I've had to put PVH on the back burner: I've got a talk for
>> LinuxCon EU to prepare, and two for XenSummit.
>>
>> There are two bugs that have been identified -- the HVM crash due to a
>> mistake in one of the "code motion" patches, and the bug with setting
>> VMXE in the guest CR.  Tim has also noticed another functional change
>> in
>> the code-motion patch (which cannot, I think, be causing the HVM
>> crash).
>
> These issues sprang up with your patches,  not with the ones that Mukesh posted ? Which did get tested for regression and passed with flying colours? That means there is a working baseline. Would that help?
> Mukesh do you have any ideas what might be amiss?

The issues with the code motion patch are almost certainly mine.  Andy 
Cooper ran one of Mukesh's versions through one of the XenRT tests 
(probably a nightly) and it came up fine.  I just need to go through and 
figure out what I messed up.

The issue with the cr4 turned up because of the work Roger Pau Monne has 
done porting FreeBSD to PVH.  The core issue here is that the guest CR4 
has VMXE set even though nested virt is disabled; so when the kernel 
does "read, set/clear bit, write", the resulting write also has the VMXE 
bit set, which causes the HVM handler to crash the guest.  The core 
issue, that the VMXE bit is set even though nested virt is disabled, was 
probably there in Mukesh's series.  I'm not sure if his handler would 
crash in a similar way, but it probably did; in any case, there would 
have been problems when it came to live migration.  But that's a 
one-line fix that Roger has already tested.

>
>>
>> If it were just a question of cleaning up those bits, I could probably
>> have another draft posted sometime this week.  But if we're stepping
>> back and looking at whether this is the right approach, or whether
>> something like Tim has suggested -- basically making PVH to be HVM
>> minus
>> qemu plus a handful of hypercalls, and most of the changes in the
>> domain
>> builder rather than in Xen -- that will take a bit longer, particularly
>>
>> because it would probably mean me having to understand and modify the
>> Linux side of things as well.  At this point I'm not really sure what
>> the best approach is going forward.
>
> Arrg.  I am not really sure how to express myself here but from the start Mukesh has asking for assistance and review of ideas and design of this and gotten it and acted on it. Now after two years of going this path folks are suggesting a new design?
>
> Sadly I won't be able to be on Wed (family matters) call but my opinion is that we should keep on heading this way. If we want HVM with less (so no QEMU) that could be a separate mode that would work in parallel with PVH and be a separate project. They should be able to coexist right?

So to begin with, the series I posted, from a guest perspective, is 
already close to what Tim described.  The main thing a PVH guest has 
that an HVM guest doesn't are:
  - Starts in 64-bit mode
  - A few more hypercalls
  - PV CPUID and IO
  - More state set when bringing up a cpu

The main difference between PV and HVM CPUID is the special-casing for 
dom0, which would presumably have to be extended in order to run a PVH 
dom0 in any case; and the IO is either the same, or again has special 
cases for driver domains and dom0, and would need to be implemented one 
way or another for PVH guests as well.

Most of what Tim seems do be advocating are things that should be able 
to be changed "behind the scenes": having the domain builder put the 
guest in long mode + paging, &c.

So it might be worth just checking this version in (once it gets fixed 
up), and look at refactoring things at a later date.

  -George

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-09 13:02                       ` George Dunlap
@ 2013-10-09 13:13                         ` Andrew Cooper
  2013-10-09 13:16                           ` George Dunlap
  0 siblings, 1 reply; 40+ messages in thread
From: Andrew Cooper @ 2013-10-09 13:13 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel, keir.xen, Jan Beulich

On 09/10/13 14:02, George Dunlap wrote:
> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote:
>>
>> These issues sprang up with your patches,  not with the ones that
>> Mukesh posted ? Which did get tested for regression and passed with
>> flying colours? That means there is a working baseline. Would that help?
>> Mukesh do you have any ideas what might be amiss?
>
> The issues with the code motion patch are almost certainly mine.  Andy
> Cooper ran one of Mukesh's versions through one of the XenRT tests
> (probably a nightly) and it came up fine.  I just need to go through
> and figure out what I messed up.

I ran Mukesh's v10 series through a XenServer BST, which confirmed no
glaring function regressions in PV and HVM domains.

~Andrew

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-09 13:13                         ` Andrew Cooper
@ 2013-10-09 13:16                           ` George Dunlap
  2013-10-09 14:37                             ` Andrew Cooper
  0 siblings, 1 reply; 40+ messages in thread
From: George Dunlap @ 2013-10-09 13:16 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, keir.xen, Jan Beulich

On 09/10/13 14:13, Andrew Cooper wrote:
> On 09/10/13 14:02, George Dunlap wrote:
>> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote:
>>> These issues sprang up with your patches,  not with the ones that
>>> Mukesh posted ? Which did get tested for regression and passed with
>>> flying colours? That means there is a working baseline. Would that help?
>>> Mukesh do you have any ideas what might be amiss?
>> The issues with the code motion patch are almost certainly mine.  Andy
>> Cooper ran one of Mukesh's versions through one of the XenRT tests
>> (probably a nightly) and it came up fine.  I just need to go through
>> and figure out what I messed up.
> I ran Mukesh's v10 series through a XenServer BST, which confirmed no
> glaring function regressions in PV and HVM domains.

I'm not sure anyone outside the XenServer team knows what a "BST" might 
entail. :-)  Is that "Basic Smoke Test"?  "Build Sanity Test"?  "Big 
Stress Test"?

  -George

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-09 13:16                           ` George Dunlap
@ 2013-10-09 14:37                             ` Andrew Cooper
  0 siblings, 0 replies; 40+ messages in thread
From: Andrew Cooper @ 2013-10-09 14:37 UTC (permalink / raw)
  To: George Dunlap; +Cc: xen-devel, keir.xen, Jan Beulich

On 09/10/13 14:16, George Dunlap wrote:
> On 09/10/13 14:13, Andrew Cooper wrote:
>> On 09/10/13 14:02, George Dunlap wrote:
>>> On 10/08/2013 01:30 PM, Konrad Rzeszutek Wilk wrote:
>>>> These issues sprang up with your patches,  not with the ones that
>>>> Mukesh posted ? Which did get tested for regression and passed with
>>>> flying colours? That means there is a working baseline. Would that
>>>> help?
>>>> Mukesh do you have any ideas what might be amiss?
>>> The issues with the code motion patch are almost certainly mine.  Andy
>>> Cooper ran one of Mukesh's versions through one of the XenRT tests
>>> (probably a nightly) and it came up fine.  I just need to go through
>>> and figure out what I messed up.
>> I ran Mukesh's v10 series through a XenServer BST, which confirmed no
>> glaring function regressions in PV and HVM domains.
>
> I'm not sure anyone outside the XenServer team knows what a "BST"
> might entail. :-)  Is that "Basic Smoke Test"?  "Build Sanity Test"? 
> "Big Stress Test"?
>
>  -George

For those interested, XenRT is the test system for XenServer, and has
several categories.

The first is the BVT - "Basic Verification Test".  This does install,
and running a single test from each of the primary areas (e.g. single PV
vm, single HVM vm, one migrate, one vlan, etc).  This test is performed
automatically on all code submission.

The next is the BST - "Branch Safety Test".  This is a substantially
larger set of basic functional tests.  It is 5-8 hours to run and is run
as-and-when a developer decides they have put in a large chunk of work
and want rather more testing than a BVT, but don't want to commit the
resources of a nightly on something which might crash and burn.

The third is the Nightly test.  This is 2.5K tests taking ~16h and ~150
machines, which is a fairly comprehensive test of XenServer.

Beyond that there are sets of tests run less often, including the
nightly and weekend stress tests.

~Andrew

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
  2013-10-09 13:02                       ` George Dunlap
@ 2013-10-09 17:50                       ` Tim Deegan
  2013-10-09 22:31                         ` Mukesh Rathor
  1 sibling, 1 reply; 40+ messages in thread
From: Tim Deegan @ 2013-10-09 17:50 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: George Dunlap, xen-devel, keir.xen, Jan Beulich

At 08:30 -0400 on 08 Oct (1381221032), Konrad Rzeszutek Wilk wrote:
> George Dunlap <george.dunlap@eu.citrix.com> wrote:
> >If it were just a question of cleaning up those bits, I could probably 
> >have another draft posted sometime this week.  But if we're stepping 
> >back and looking at whether this is the right approach, or whether 
> >something like Tim has suggested -- basically making PVH to be HVM
> >minus 
> >qemu plus a handful of hypercalls, and most of the changes in the
> >domain 
> >builder rather than in Xen -- that will take a bit longer, particularly
> >
> >because it would probably mean me having to understand and modify the 
> >Linux side of things as well.  At this point I'm not really sure what 
> >the best approach is going forward.
> 
> Arrg.  I am not really sure how to express myself here but from the
> start Mukesh has asking for assistance and review of ideas and design
> of this and gotten it and acted on it. Now after two years of going
> this path folks are suggesting a new design?

Sorry for not making the suggestion sooner -- it honestly hadn't
occurred to me.  I had read a number of revisions of the PVH Xen patches
(and many discussions of what the new type of guests should be called)
before thinking that there didn't need to be a new kind of guest at all.

FWIW, I don't think this would be a complete redesign. AFAICT the guest
kernel changes would stay as they are, and most of the toolstack changes
too.  Some of the Xen changes would stay (implementation of setvcpuinfo,
for example) and some would just go away.

Anyway, given that I can't offer to help with the coding (I have
basically no time between now and XenSummit) I don't want to stand in
the way of this.  If it goes in in something like the current form I'll
try and work out a follow-up series that does it the other way.

Cheers,

Tim.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-08  7:43               ` Jan Beulich
@ 2013-10-09 21:59                 ` Mukesh Rathor
  0 siblings, 0 replies; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-09 21:59 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, keir.xen

On Tue, 08 Oct 2013 08:43:43 +0100
"Jan Beulich" <JBeulich@suse.com> wrote:

> >>> On 08.10.13 at 02:52, Mukesh Rathor <mukesh.rathor@oracle.com>
> >>> wrote:
> > On Mon, 30 Sep 2013 07:56:30 +0100
> > "Jan Beulich" <JBeulich@suse.com> wrote:
> > 
> >> >>> On 28.09.13 at 01:03, Mukesh Rathor <mukesh.rathor@oracle.com>
> >> >>> wrote:
> >> > On Fri, 27 Sep 2013 08:01:16 +0100
> >> > "Jan Beulich" <JBeulich@suse.com> wrote:
> > .......
> >> >> >> > @@ -1089,11 +1262,18 @@ int __init construct_dom0(
> >> >> >> >      regs->eip = parms.virt_entry;
> >> >> >> >      regs->esp = vstack_end;
> >> >> >> >      regs->esi = vstartinfo_start;
> >> >> >> > -    regs->eflags = X86_EFLAGS_IF;
> >> >> >> > +    regs->eflags = X86_EFLAGS_IF | 0x2;
> >> >> >> 
> >> >> >> Unrelated change?
> >> >> > 
> >> >> > Nop, we need to make sure the resvd bit is set in eflags
> >> >> > otherwise it won't vmenter (invalid guest state). Should be
> >> >> > harmless for PV, right? Not sure where it does it for PV
> >> >> > before actually scheduling it..
> >> >> 
> >> >> PV doesn't set this anywhere - the hardware doesn't allow the
> >> >> flag to be cleared (writes are ignored). If VMENTER is picky
> >> >> about this, the GUEST_RFLAGS write at the end of
> >> >> vmx_vmenter_helper() should be doing this instead of having to
> >> >> do it here (and obviously in some other place for DomU
> >> >> creation).
> >> > 
> >> > For domU we set it in arch_set_info_guest.
> >> 
> >> Which is bogus too. 15910:ec3b23d8d544 ("hvm: Always keep
> >> canonical copy of RIP/RSP/RFLAGS in guest_cpu_user_regs()") did
> >> this adjustment without really explaining why it can't be done
> >> centrally in just the two places copying regs->eflags into the
> >> VMCS/VMCB spot.
> > 
> > I beg to differ.... such nit picking is equally bogus IMHO. The
> > bit needs to be set once, putting it in vmx_vmenter_helper adds an
> > unnecessary slowdown IMO. 
> 
> An "or" being a measurable slowdown?

Well, unnecessary dirtying of cache line. But, I'll make the change
just to be done with this.

> >> > vmx_vmenter_helper gets
> >> > called on every vmentry, we just need this setting once.
> >> 
> >> Would a debugger update guest state via arch_set_info_guest()?
> >> I doubt it. It would imo be a desirable up front cleanup patch to
> >> move this bogus thing out of arch_set_info_guest() into
> >> vmx_vmenter_helper() (and whatever SVM equivalent, should
> >> SVM too be incapable of dealing with the flag being clear). See
> >> how e.g. hvm_load_cpu_ctxt() already sets the flag? It's really
> >> like being done almost at random...
> > 
> > The debugger would always read eflags, muck with only
> > the bits it needs to, leaving the resvd bit as is, then send it
> > down.
> 
> So you'd expect every debugger to be responsible for setting this
> bit? Pretty odd a requirement, when it can be done centrally in a
> single place, covering all cases.

Like I said, the debuggers are expected to read existing eflags,
change the bits (leaving resvd bit set). But, ir-relevant now
after the above change.

Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes
  2013-10-09 17:50                       ` Tim Deegan
@ 2013-10-09 22:31                         ` Mukesh Rathor
  0 siblings, 0 replies; 40+ messages in thread
From: Mukesh Rathor @ 2013-10-09 22:31 UTC (permalink / raw)
  To: Tim Deegan; +Cc: George Dunlap, xen-devel, keir.xen, Jan Beulich

On Wed, 9 Oct 2013 18:50:40 +0100
Tim Deegan <tim@xen.org> wrote:

> At 08:30 -0400 on 08 Oct (1381221032), Konrad Rzeszutek Wilk wrote:
> > George Dunlap <george.dunlap@eu.citrix.com> wrote:
> > >If it were just a question of cleaning up those bits, I could
> > >probably have another draft posted sometime this week.  But if
> > >we're stepping back and looking at whether this is the right
> > >approach, or whether something like Tim has suggested -- basically
> > >making PVH to be HVM minus 
> > >qemu plus a handful of hypercalls, and most of the changes in the
> > >domain 
> > >builder rather than in Xen -- that will take a bit longer,
> > >particularly
> > >
> > >because it would probably mean me having to understand and modify
> > >the Linux side of things as well.  At this point I'm not really
> > >sure what the best approach is going forward.
> > 
> > Arrg.  I am not really sure how to express myself here but from the
> > start Mukesh has asking for assistance and review of ideas and
> > design of this and gotten it and acted on it. Now after two years
> > of going this path folks are suggesting a new design?
> 
> Sorry for not making the suggestion sooner -- it honestly hadn't
> occurred to me.  I had read a number of revisions of the PVH Xen
> patches (and many discussions of what the new type of guests should
> be called) before thinking that there didn't need to be a new kind of
> guest at all.

The orig series didn't have a new guest type, but I was sorta compelled
to create one. From guest perspective, it's in PV mode with auto
translate. It appears this is now losing sight of the orig goal. The
main problem being solved was 64bit PV syscall overhead. The best way to
solve the sys call overhead was to run in HVM container. I discussed
briefly with Ian P and then Keir on the sidelines of xen summit. Then I
did code walk with Steffano and Ian C at hackathon last year. Looked
like we were all on same page.

Still, if something could be done better, I'm in favor. But, lets
get something done asap so as to not get left behind. Everyone will
have a different opinion on the best approach, and unless some
compromises are made things can just drag on..... 

> FWIW, I don't think this would be a complete redesign. AFAICT the
> guest kernel changes would stay as they are, and most of the
> toolstack changes too.  Some of the Xen changes would stay
> (implementation of setvcpuinfo, for example) and some would just go
> away.

Good idea, but keep in mind that dom0 will need most of the xen 
changes for coming in PVH mode. So, there might not be enough to make
it worthwhile.

thanks
Mukesh

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2013-10-09 22:31 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-25 21:03 [RFC 0 PATCH 0/3]: PVH dom0 construction Mukesh Rathor
2013-09-25 21:03 ` [RFC 0 PATCH 1/3] PVH dom0: create domctl_memory_mapping() function Mukesh Rathor
2013-09-26  7:03   ` Jan Beulich
2013-09-25 21:03 ` [RFC 0 PATCH 2/3] PVH dom0: move some pv specific code to static functions Mukesh Rathor
2013-09-26  7:21   ` Jan Beulich
2013-09-26 23:32     ` Mukesh Rathor
2013-09-25 21:03 ` [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes Mukesh Rathor
2013-09-26  8:02   ` Jan Beulich
2013-09-27  0:17     ` Mukesh Rathor
2013-09-27  6:54       ` Jan Beulich
2013-10-03  0:53         ` Mukesh Rathor
2013-10-04  6:53           ` Jan Beulich
2013-10-04 13:35             ` Konrad Rzeszutek Wilk
2013-10-04 14:05               ` Jan Beulich
2013-10-04 16:02                 ` Konrad Rzeszutek Wilk
2013-10-04 16:07                   ` Jan Beulich
2013-10-04 20:59                     ` Konrad Rzeszutek Wilk
2013-10-05  1:06                       ` Mukesh Rathor
2013-10-07  7:12                         ` Jan Beulich
2013-10-08  0:58             ` Mukesh Rathor
2013-10-08  7:51               ` Jan Beulich
2013-10-08  8:03                 ` Jan Beulich
2013-10-08  9:39                   ` George Dunlap
2013-10-08  9:57                     ` Jan Beulich
2013-10-08 10:01                       ` George Dunlap
2013-10-08 10:19                         ` Lars Kurth
2013-10-08 12:30                     ` Konrad Rzeszutek Wilk
2013-10-09 13:02                       ` George Dunlap
2013-10-09 13:13                         ` Andrew Cooper
2013-10-09 13:16                           ` George Dunlap
2013-10-09 14:37                             ` Andrew Cooper
2013-10-09 17:50                       ` Tim Deegan
2013-10-09 22:31                         ` Mukesh Rathor
2013-09-27  1:55     ` Mukesh Rathor
2013-09-27  7:01       ` Jan Beulich
2013-09-27 23:03         ` Mukesh Rathor
2013-09-30  6:56           ` Jan Beulich
2013-10-08  0:52             ` Mukesh Rathor
2013-10-08  7:43               ` Jan Beulich
2013-10-09 21:59                 ` Mukesh Rathor

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.