From mboxrd@z Thu Jan  1 00:00:00 1970
From: Mukesh Rathor <mukesh.rathor@oracle.com>
Subject: [PATCH 20/20] PVH xen: PVH dom0 creation....
Date: Tue, 14 May 2013 17:52:48 -0700
Message-ID: <1368579168-30829-21-git-send-email-mukesh.rathor@oracle.com>
References: <1368579168-30829-1-git-send-email-mukesh.rathor@oracle.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
In-Reply-To: <1368579168-30829-1-git-send-email-mukesh.rathor@oracle.com>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: Xen-devel@lists.xensource.com
List-Id: xen-devel@lists.xenproject.org

Finally, the hardest part: modify construct_dom0() so that it can boot
dom0 in PVH mode in addition to PV mode. A new command line option,
opt_dom0pvh, causes dom0 to boot in PVH mode when specified.
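For example, dom0 would then be started with something like the
following (assuming the boot parameter introduced earlier in this
series is named "dom0pvh"; the grub entry below is illustrative only,
not part of this patch):

    multiboot /boot/xen.gz dom0pvh dom0_mem=2048M loglvl=all
    module /boot/vmlinuz root=/dev/sda1 ro
    module /boot/initrd.img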
Note that the call to elf_load_binary() is moved down until after the
required PVH setup, so that the same code path can be used for both PV
and PVH.

Changes in V2:
  - Map the entire IO region upfront in the P2M for PVH dom0.

Changes in V3:
  - Fix pvh_map_all_iomem() to make sure IO space is mapped up to 4GB.
  - Remove use of dbg_* functions.

Changes in V5:
  - No need to pass v_start around; update the comment in public/xen.h.

Signed-off-by: Mukesh Rathor <mukesh.rathor@oracle.com>
---
 xen/arch/x86/domain_build.c |  269 ++++++++++++++++++++++++++++++++++---------
 xen/arch/x86/mm/hap/hap.c   |   14 +++
 xen/arch/x86/setup.c        |    6 +-
 xen/include/asm-x86/hap.h   |    1 +
 xen/include/public/xen.h    |    2 +
 5 files changed, 236 insertions(+), 56 deletions(-)
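For reference, the HAP pool sizing in hap_set_pvh_alloc_for_dom0()
below mirrors libxl_get_required_shadow_memory(). A rough worked
example, with illustrative numbers only: a 4-vcpu dom0 given 1048576
pages (4GB at 4kB pages, PAGE_SHIFT == 12) works out to

    memkb     = 1048576 * (4096 / 1024)              = 4194304
    memkb     = 4 * (256 * 4 + 2 * (4194304 / 1024)) = 36864
    num_pages = ((36864 + 1023) / 1024) << (20 - 12) = 9216

i.e. a HAP pool of 9216 pages (36MB) for such a dom0.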
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index c5a0e0c..9083a3b 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -36,6 +36,7 @@
 #include <asm/acpi.h>
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
 
 #include <public/version.h>
@@ -309,6 +310,72 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+/*
+ * Set the 1:1 map for all non-RAM regions for dom0. Thus, dom0 will have
+ * the entire IO region mapped in the EPT/NPT.
+ *
+ * pvh fixme: the following doesn't map MMIO ranges when they sit above the
+ *            highest E820-covered address.
+ */
+static __init void pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start_pfn, end_pfn, end = 0, start = 0;
+    const struct e820entry *entry;
+    unsigned int i, nump;
+    int rc;
+
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        end = entry->addr + entry->size;
+
+        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
+             i == e820.nr_map - 1 )
+        {
+            start_pfn = PFN_DOWN(start);
+            end_pfn = PFN_UP(end);
+
+            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
+                end_pfn = PFN_UP(entry->addr);
+
+            if ( start_pfn < end_pfn )
+            {
+                nump = end_pfn - start_pfn;
+                /* Add pages to the mapping. */
+                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+                BUG_ON(rc);
+            }
+            start = end;
+        }
+    }
+
+    /* If the e820 ended under 4GB, map the remaining space up to 4GB. */
+    if ( end < GB(4) )
+    {
+        start_pfn = PFN_UP(end);
+        end_pfn = (GB(4)) >> PAGE_SHIFT;
+        nump = end_pfn - start_pfn;
+        rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+        BUG_ON(rc);
+    }
+}
+
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                       unsigned long mfn,
+                                       unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) )
+    {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
 /*
  * Copy or zero function for dom0 only during boot. This because
  * raw_copy_to_guest -> copy_to_user_hvm -> __hvm_copy needs curr to
@@ -353,6 +420,7 @@ int __init construct_dom0(
     void *(*bootstrap_map)(const module_t *),
     char *cmdline)
 {
+    char *si_buf = NULL;
     int i, cpu, rc, compatible, compat32, order, machine;
     struct cpu_user_regs *regs;
     unsigned long pfn, mfn;
@@ -361,7 +429,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_paddr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -449,11 +517,19 @@ int __init construct_dom0(
         return -EINVAL;
     }
 
-    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
-         !test_bit(XENFEAT_dom0, parms.f_supported) )
+    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
     {
-        printk("Kernel does not support Dom0 operation\n");
-        return -EINVAL;
+        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
+        {
+            printk("Kernel does not support Dom0 operation\n");
+            return -EINVAL;
+        }
+        if ( is_pvh_domain(d) &&
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
+        {
+            printk("Kernel does not support PVH mode\n");
+            return -EINVAL;
+        }
     }
 
     if ( compat32 )
@@ -518,6 +594,14 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) )
+    {
+        /* Note: this is a guest paddr, as opposed to a maddr. */
+        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -659,16 +743,34 @@ int __init construct_dom0(
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
         l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
     }
-    clear_page(l4tab);
-    init_guest_l4_table(l4tab, d);
-    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
-    if ( is_pv_32on64_domain(d) )
-        v->arch.guest_table_user = v->arch.guest_table;
+    if ( is_pvh_domain(d) )
+    {
+        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] = (vpt_start - v_start);
+
+        /* HAP is required for PVH, and pfns are sequentially mapped there. */
+        pfn = 0;
+    }
+    else
+    {
+        clear_page(l4tab);
+        init_guest_l4_table(l4tab, d);
+        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+        if ( is_pv_32on64_domain(d) )
+            v->arch.guest_table_user = v->arch.guest_table;
+        pfn = alloc_spfn;
+    }
 
     l4tab += l4_table_offset(v_start);
-    pfn = alloc_spfn;
     for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
     {
+        /*
+         * The initrd chunk's mfns are allocated from a separate mfn chunk,
+         * so we need to adjust for them here.
+         */
+        signed long pvh_adj = is_pvh_domain(d)
+                              ? (PFN_UP(initrd_len) - alloc_spfn) << PAGE_SHIFT
+                              : 0;
+
         if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
         {
             maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
@@ -695,16 +797,17 @@ int __init construct_dom0(
                 clear_page(l3tab);
                 if ( count == 0 )
                     l3tab += l3_table_offset(v_start);
-                *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
                 l4tab++;
             }
-            *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+            *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
             l3tab++;
         }
-        *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+        *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
         l2tab++;
-        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
+        if ( is_pvh_domain(d) ||
+             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
             mfn = pfn++;
         else
             mfn = initrd_mfn++;
@@ -712,6 +815,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -740,6 +846,9 @@ int __init construct_dom0(
                COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
     }
 
+    if ( is_pvh_domain(d) )
+        goto pvh_skip_pt_rdonly;
+
     /* Pages that are part of page tables must be read only. */
     l4tab = l4start + l4_table_offset(vpt_start);
     l3start = l3tab = l4e_to_l3e(*l4tab);
@@ -779,6 +888,8 @@ int __init construct_dom0(
         }
     }
 
+pvh_skip_pt_rdonly:
+
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
@@ -802,35 +913,20 @@ int __init construct_dom0(
     write_ptbase(v);
     mapcache_override_current(v);
 
-    /* Copy the OS image and free temporary buffer. */
-    elf.dest = (void*)vkern_start;
-    rc = elf_load_binary(&elf);
-    if ( rc < 0 )
-    {
-        printk("Failed to load the kernel binary\n");
-        return rc;
-    }
-    bootstrap_map(NULL);
-
-    if ( UNSET_ADDR != parms.virt_hypercall )
+    /* Set up start info area. */
+    if ( is_pvh_domain(d) )
     {
-        if ( (parms.virt_hypercall < v_start) ||
-             (parms.virt_hypercall >= v_end) )
+        /* Avoid calling a copy function for every write to vstartinfo_start. */
+        if ( (si_buf = xmalloc_bytes(PAGE_SIZE)) == NULL )
         {
-            mapcache_override_current(NULL);
-            write_ptbase(current);
-            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
-            return -1;
+            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
+            return -ENOMEM;
         }
-        hypercall_page_initialise(
-            d, (void *)(unsigned long)parms.virt_hypercall);
+        si = (start_info_t *)si_buf;
     }
+    else
+        si = (start_info_t *)vstartinfo_start;
 
-    /* Free temporary buffers. */
-    discard_initial_images();
-
-    /* Set up start info area. */
-    si = (start_info_t *)vstartinfo_start;
     clear_page(si);
 
     si->nr_pages = nr_pages;
@@ -847,6 +943,10 @@ int __init construct_dom0(
              elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
 
     count = d->tot_pages;
+
+    if ( is_pvh_domain(d) )
+        goto pvh_skip_guest_p2m_table;
+
     l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
     l3tab = NULL;
     l2tab = NULL;
@@ -973,6 +1073,11 @@ int __init construct_dom0(
     unmap_domain_page(l3tab);
     unmap_domain_page(l4start);
 
+pvh_skip_guest_p2m_table:
+
+    if ( is_pvh_domain(d) )
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -989,11 +1094,8 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
+
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -1009,8 +1111,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -1030,11 +1132,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1042,6 +1140,50 @@ int __init construct_dom0(
                 process_pending_softirqs();
         }
     }
 
+    /* Copy the OS image and free temporary buffer. */
+    elf.dest = (void*)vkern_start;
+    rc = elf_load_binary(&elf);
+    if ( rc < 0 )
+    {
+        printk("Failed to load the kernel binary\n");
+        return rc;
+    }
+    bootstrap_map(NULL);
+
+    if ( UNSET_ADDR != parms.virt_hypercall )
+    {
+        void *addr;
+
+        if ( is_pvh_domain(d) )
+        {
+            if ( (addr = xzalloc_bytes(PAGE_SIZE)) == NULL )
+            {
+                printk("pvh: xzalloc failed for %ld bytes.\n", PAGE_SIZE);
+                return -ENOMEM;
+            }
+        }
+        else
+            addr = (void *)parms.virt_hypercall;
+
+        if ( (parms.virt_hypercall < v_start) ||
+             (parms.virt_hypercall >= v_end) )
+        {
+            mapcache_override_current(NULL);
+            write_ptbase(current);
+            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
+            return -1;
+        }
+        hypercall_page_initialise(d, addr);
+
+        if ( is_pvh_domain(d) )
+        {
+            early_pvh_copy_or_zero(parms.virt_hypercall, addr, PAGE_SIZE);
+            xfree(addr);
+        }
+    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
+
     if ( initrd_len != 0 )
     {
         si->mod_start = vinitrd_start ?: initrd_pfn;
@@ -1057,6 +1199,16 @@ int __init construct_dom0(
         si->console.dom0.info_off  = sizeof(struct start_info);
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
+    if ( is_pvh_domain(d) )
+    {
+        unsigned long mfn = virt_to_mfn(d->shared_info);
+        unsigned long pfn = shared_info_paddr >> PAGE_SHIFT;
+        si->shared_info = shared_info_paddr;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        early_pvh_copy_or_zero(vstartinfo_start, si_buf, PAGE_SIZE);
+        xfree(si_buf);
+    }
 
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
@@ -1088,12 +1240,18 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | 0x2;
 
     if ( opt_dom0_shadow )
-        if ( paging_enable(d, PG_SH_enable) == 0 )
+    {
+        if ( is_pvh_domain(d) )
+        {
+            printk("Invalid option dom0_shadow for PVH\n");
+            return -EINVAL;
+        }
+        if ( paging_enable(d, PG_SH_enable) == 0 )
             paging_update_paging_modes(v);
-
+    }
     if ( supervisor_mode_kernel )
     {
         v->arch.pv_vcpu.kernel_ss &= ~3;
@@ -1170,6 +1328,9 @@ int __init construct_dom0(
 
     BUG_ON(rc != 0);
 
+    if ( is_pvh_domain(d) )
+        pvh_map_all_iomem(d);
+
     iommu_dom0_init(dom0);
 
     return 0;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 5aa0852..674c324 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -580,6 +580,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
+/* Resize the hap table. Copied from: libxl_get_required_shadow_memory(). */
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 6d35d1d..60f4dd8 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -549,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx;
+    unsigned int initrdidx, domcr_flags = 0;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_headroom, *module_map;
@@ -1321,7 +1321,9 @@ void __init __start_xen(unsigned long mbi_p)
         panic("Could not protect TXT memory regions\n");
 
     /* Create initial domain 0. */
-    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
+    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
+    domcr_flags |= DOMCRF_s3_integrity;
+    dom0 = domain_create(0, domcr_flags, 0);
     if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
         panic("Error creating domain 0\n");
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int hap_track_dirty_vram(struct domain *d,
                          XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 3cab74f..28d1e13 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -693,6 +693,8 @@ typedef struct shared_info shared_info_t;
  *      c. list of allocated page frames [mfn_list, nr_pages]
  *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
+ *      d1. struct shared_info_t         [shared_info]
+ *          (present above only for auto-translated guests)
  *      e. bootstrap page tables         [pt_base and CR3 (x86)]
  *      f. bootstrap stack               [register ESP (x86)]
  *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
-- 
1.7.2.3