From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1422972AbXBIK5d (ORCPT ); Fri, 9 Feb 2007 05:57:33 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1422860AbXBIK5d (ORCPT ); Fri, 9 Feb 2007 05:57:33 -0500 Received: from ozlabs.org ([203.10.76.45]:41553 "EHLO ozlabs.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1422972AbXBIK5a (ORCPT ); Fri, 9 Feb 2007 05:57:30 -0500 Subject: [PATCH 6b/10] lguest: the host code (lg.ko) From: Rusty Russell To: lkml - Kernel Mailing List Cc: Andrew Morton , Andi Kleen , virtualization In-Reply-To: <1171018524.2718.58.camel@localhost.localdomain> References: <1171012296.2718.26.camel@localhost.localdomain> <1171012458.2718.30.camel@localhost.localdomain> <1171012693.2718.37.camel@localhost.localdomain> <1171012761.2718.40.camel@localhost.localdomain> <1171018524.2718.58.camel@localhost.localdomain> Content-Type: text/plain Date: Fri, 09 Feb 2007 21:56:30 +1100 Message-Id: <1171018590.2718.60.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.8.1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org This is the host module (lg.ko) which supports lguest: arch/i386/lguest/hypervisor.S: The actual guest <-> host switching code. This is compiled into a C array, which is mapped to 0xFFC01000 in host and guests. arch/i386/lguest/core.c: The core of the hypervisor, which calls into the assembler code which does this actual switch. Also contains helper routines. arch/i386/lguest/hypercalls.c: The entry point for the 19 hypercalls. arch/i386/lguest/interrupts_and_traps.c: Handling of interrupts and traps, except page faults. arch/i386/lguest/io.c: I/O from guest to host, and between guests. arch/i386/lguest/lguest_user.c: /dev/lguest interface for lguest program to launch/control guests. arch/i386/lguest/page_tables.c: Shadow Page table handling: generally we build up the shadow page tables by converting from guest page tables when a fault occurs. arch/i386/lguest/segments.c: Segmentation (GDT) handling: we have to ensure they're trimmed to avoid guest access to the switching code. Signed-off-by: Rusty Russell =================================================================== --- /dev/null +++ b/arch/i386/lguest/core.c @@ -0,0 +1,425 @@ +/* World's simplest hypervisor, to test paravirt_ops and show + * unbelievers that virtualization is the future. Plus, it's fun! */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lg.h" + +/* This is our hypervisor, compiled from hypervisor.S. */ +static char __initdata hypervisor_blob[] = { +#include "hypervisor-blob.c" +}; + +#define MAX_LGUEST_GUESTS \ + ((HYPERVISOR_SIZE-sizeof(hypervisor_blob))/sizeof(struct lguest_state)) + +static struct vm_struct *hypervisor_vma; +static int cpu_had_pge; +static struct { + unsigned long offset; + unsigned short segment; +} lguest_entry; +struct page *hype_pages; /* Contiguous pages. */ +struct lguest lguests[MAX_LGUEST_GUESTS]; +DECLARE_MUTEX(lguest_lock); + +/* IDT entries are at start of hypervisor. */ +const unsigned long *__lguest_default_idt_entries(void) +{ + return (void *)HYPE_ADDR; +} + +/* Next is switch_to_guest */ +static void *__lguest_switch_to_guest(void) +{ + return (void *)HYPE_ADDR + HYPE_DATA_SIZE; +} + +/* Then we use everything else to hold guest state. 
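+ * The single mapping is thus carved up in order: 1k of default IDT entry
+ * stubs at HYPE_ADDR, the switch_to_guest code at HYPE_ADDR +
+ * HYPE_DATA_SIZE, and the per-guest "struct lguest_state" array from the
+ * end of the blob up to HYPERVISOR_SIZE (which is exactly the division
+ * the MAX_LGUEST_GUESTS calculation above performs).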
*/ +struct lguest_state *__lguest_states(void) +{ + return (void *)HYPE_ADDR + sizeof(hypervisor_blob); +} + +static __init int map_hypervisor(void) +{ + unsigned int i; + int err; + struct page *pages[HYPERVISOR_PAGES], **pagep = pages; + + hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, + get_order(HYPERVISOR_SIZE)); + if (!hype_pages) + return -ENOMEM; + + hypervisor_vma = __get_vm_area(HYPERVISOR_SIZE, VM_ALLOC, + HYPE_ADDR, VMALLOC_END); + if (!hypervisor_vma) { + err = -ENOMEM; + printk("lguest: could not map hypervisor pages high\n"); + goto free_pages; + } + + for (i = 0; i < HYPERVISOR_PAGES; i++) + pages[i] = hype_pages + i; + + err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep); + if (err) { + printk("lguest: map_vm_area failed: %i\n", err); + goto free_vma; + } + memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob)); + + /* Setup LGUEST segments on all cpus */ + for_each_possible_cpu(i) { + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + } + + /* Initialize entry point into hypervisor. */ + lguest_entry.offset = (long)__lguest_switch_to_guest(); + lguest_entry.segment = LGUEST_CS; + + printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr); + return 0; + +free_vma: + vunmap(hypervisor_vma->addr); +free_pages: + __free_pages(hype_pages, get_order(HYPERVISOR_SIZE)); + return err; +} + +static __exit void unmap_hypervisor(void) +{ + vunmap(hypervisor_vma->addr); + __free_pages(hype_pages, get_order(HYPERVISOR_SIZE)); +} + +/* IN/OUT insns: enough to get us past boot-time probing. */ +static int emulate_insn(struct lguest *lg) +{ + u8 insn; + unsigned int insnlen = 0, in = 0, shift = 0; + unsigned long physaddr = guest_pa(lg, lg->state->regs.eip); + + /* This only works for addresses in linear mapping... */ + if (lg->state->regs.eip < lg->page_offset) + return 0; + lhread(lg, &insn, physaddr, 1); + + /* Operand size prefix means it's actually for ax. */ + if (insn == 0x66) { + shift = 16; + insnlen = 1; + lhread(lg, &insn, physaddr + insnlen, 1); + } + + switch (insn & 0xFE) { + case 0xE4: /* in ,%al */ + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al, */ + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + insnlen += 1; + break; + default: + return 0; + } + + if (in) { + /* Lower bit tells is whether it's a 16 or 32 bit access */ + if (insn & 0x1) + lg->state->regs.eax = 0xFFFFFFFF; + else + lg->state->regs.eax |= (0xFFFF << shift); + } + lg->state->regs.eip += insnlen; + return 1; +} + +int find_free_guest(void) +{ + unsigned int i; + for (i = 0; i < MAX_LGUEST_GUESTS; i++) + if (!lguests[i].state) + return i; + return -1; +} + +int lguest_address_ok(const struct lguest *lg, unsigned long addr) +{ + return addr / PAGE_SIZE < lg->pfn_limit; +} + +/* Just like get_user, but don't let guest access lguest binary. 
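+ * The guest's "physical" memory is just the bottom of the launcher
+ * process's address space, so these helpers take guest addresses
+ * directly; the lguest_address_ok() check against pfn_limit is what
+ * stops a guest using them to reach the launcher mapped above it.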
*/ +u32 lhread_u32(struct lguest *lg, u32 addr) +{ + u32 val = 0; + + /* Don't let them access lguest_add */ + if (!lguest_address_ok(lg, addr) + || get_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad read address %u", addr); + return val; +} + +void lhwrite_u32(struct lguest *lg, u32 addr, u32 val) +{ + if (!lguest_address_ok(lg, addr) + || put_user(val, (u32 __user *)addr) != 0) + kill_guest(lg, "bad write address %u", addr); +} + +void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes) +{ + if (addr + bytes < addr || !lguest_address_ok(lg, addr+bytes) + || copy_from_user(b, (void __user *)addr, bytes) != 0) { + /* copy_from_user should do this, but as we rely on it... */ + memset(b, 0, bytes); + kill_guest(lg, "bad read address %u len %u", addr, bytes); + } +} + +void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes) +{ + if (addr + bytes < addr + || !lguest_address_ok(lg, addr+bytes) + || copy_to_user((void __user *)addr, b, bytes) != 0) + kill_guest(lg, "bad write address %u len %u", addr, bytes); +} + +/* Saves exporting idt_table from kernel */ +static struct desc_struct *get_idt_table(void) +{ + struct Xgt_desc_struct idt; + + asm("sidt %0":"=m" (idt)); + return (void *)idt.address; +} + +extern asmlinkage void math_state_restore(void); + +static int usermode(struct lguest_regs *regs) +{ + return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; +} + +/* Trap page resets this when it reloads gs. */ +static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs) +{ + u32 eip; + get_user(eip, &lg->lguest_data->gs_gpf_eip); + if (eip == regs->eip) + return 0; + put_user(regs->eip, &lg->lguest_data->gs_gpf_eip); + return 1; +} + +static void set_ts(unsigned int guest_ts) +{ + u32 cr0; + if (guest_ts) { + asm("movl %%cr0,%0":"=r" (cr0)); + if (!(cr0 & 8)) + asm("movl %0,%%cr0": :"r" (cr0|8)); + } +} + +static void run_guest_once(struct lguest *lg) +{ + unsigned int clobber; + + /* Put eflags on stack, lcall does rest. */ + asm volatile("pushf; lcall *lguest_entry" + : "=a"(clobber), "=d"(clobber) + : "0"(lg->state), "1"(get_idt_table()) + : "memory"); +} + +int run_guest(struct lguest *lg, char *__user user) +{ + struct lguest_regs *regs = &lg->state->regs; + + while (!lg->dead) { + unsigned int cr2 = 0; /* Damn gcc */ + + /* Hypercalls first: we might have been out to userspace */ + if (do_async_hcalls(lg)) + goto pending_dma; + + if (regs->trapnum == LGUEST_TRAP_ENTRY) { + /* Only do hypercall once. */ + regs->trapnum = 255; + if (hypercall(lg, regs)) + goto pending_dma; + } + + if (signal_pending(current)) + return -EINTR; + maybe_do_interrupt(lg); + + if (lg->dead) + break; + + if (lg->halted) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + continue; + } + + /* Restore limits on TLS segments if in user mode. */ + if (usermode(regs)) { + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) + lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a + |= lg->tls_limits[i]; + } + + local_irq_disable(); + map_trap_page(lg); + + /* Host state to be restored after the guest returns. */ + asm("sidt %0":"=m"(lg->state->host.idt)); + lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr); + + /* Even if *we* don't want FPU trap, guest might... */ + set_ts(lg->ts); + + run_guest_once(lg); + + /* Save cr2 now if we page-faulted. */ + if (regs->trapnum == 14) + asm("movl %%cr2,%0" :"=r" (cr2)); + else if (regs->trapnum == 7) + math_state_restore(); + local_irq_enable(); + + switch (regs->trapnum) { + case 13: /* We've intercepted a GPF. 
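+			   An error code of zero means it may be one we
+			   caused ourselves: either the guest probing I/O
+			   ports (emulate_insn() above fakes the IN/OUT),
+			   or a %gs reload faulting on the TLS entries the
+			   switcher neutered; new_gfp_eip() lets each such
+			   eip retry once so the trap page can reload %gs.
+			   Anything else is reflected to the guest's own
+			   GPF handler.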
*/ + if (regs->errcode == 0) { + if (emulate_insn(lg)) + continue; + + /* FIXME: If it's reloading %gs in a loop? */ + if (usermode(regs) && new_gfp_eip(lg,regs)) + continue; + } + + if (reflect_trap(lg, &lg->gpf_trap, 1)) + continue; + break; + case 14: /* We've intercepted a page fault. */ + if (demand_page(lg, cr2, regs->errcode & 2)) + continue; + + /* If lguest_data is NULL, this won't hurt. */ + put_user(cr2, &lg->lguest_data->cr2); + if (reflect_trap(lg, &lg->page_trap, 1)) + continue; + kill_guest(lg, "unhandled page fault at %#x" + " (eip=%#x, errcode=%#x)", + cr2, regs->eip, regs->errcode); + break; + case 7: /* We've intercepted a Device Not Available fault. */ + /* If they don't want to know, just absorb it. */ + if (!lg->ts) + continue; + if (reflect_trap(lg, &lg->fpu_trap, 0)) + continue; + kill_guest(lg, "unhandled FPU fault at %#x", + regs->eip); + break; + case 32 ... 255: /* Real interrupt, fall thru */ + cond_resched(); + case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ + continue; + case 6: /* Invalid opcode before they installed handler */ + check_bug_kill(lg); + } + kill_guest(lg,"unhandled trap %i at %#x (err=%i)", + regs->trapnum, regs->eip, regs->errcode); + } + return -ENOENT; + +pending_dma: + put_user(lg->pending_dma, (unsigned long *)user); + put_user(lg->pending_addr, (unsigned long *)user+1); + return sizeof(unsigned long)*2; +} + +#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem) + +static void adjust_pge(void *on) +{ + if (on) + write_cr4(read_cr4() | X86_CR4_PGE); + else + write_cr4(read_cr4() & ~X86_CR4_PGE); +} + +static int __init init(void) +{ + int err; + + if (paravirt_enabled()) + return -EPERM; + + err = map_hypervisor(); + if (err) + return err; + + err = init_pagetables(hype_pages); + if (err) { + unmap_hypervisor(); + return err; + } + lguest_io_init(); + + err = lguest_device_init(); + if (err) { + free_pagetables(); + unmap_hypervisor(); + return err; + } + if (cpu_has_pge) { /* We have a broader idea of "global". */ + cpu_had_pge = 1; + on_each_cpu(adjust_pge, 0, 0, 1); + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + } + return 0; +} + +static void __exit fini(void) +{ + lguest_device_remove(); + free_pagetables(); + unmap_hypervisor(); + if (cpu_had_pge) { + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + on_each_cpu(adjust_pge, (void *)1, 0, 1); + } +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); =================================================================== --- /dev/null +++ b/arch/i386/lguest/hypercalls.c @@ -0,0 +1,199 @@ +/* Actual hypercalls, which allow guests to actually do something. + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "lg.h" + +static void guest_set_stack(struct lguest *lg, + u32 seg, u32 esp, unsigned int pages) +{ + /* You cannot have a stack segment with priv level 0. */ + if ((seg & 0x3) != GUEST_DPL) + kill_guest(lg, "bad stack segment %i", seg); + if (pages > 2) + kill_guest(lg, "bad stack pages %u", pages); + lg->state->tss.ss1 = seg; + lg->state->tss.esp1 = esp; + lg->stack_pages = pages; + pin_stack_pages(lg); +} + +/* Return true if DMA to host userspace now pending. */ +static int do_hcall(struct lguest *lg, struct lguest_regs *regs) +{ + switch (regs->eax) { + case LHCALL_FLUSH_ASYNC: + break; + case LHCALL_LGUEST_INIT: + kill_guest(lg, "already have lguest_data"); + break; + case LHCALL_CRASH: { + char msg[128]; + lhread(lg, msg, regs->edx, sizeof(msg)); + msg[sizeof(msg)-1] = '\0'; + kill_guest(lg, "CRASH: %s", msg); + break; + } + case LHCALL_LOAD_GDT: + load_guest_gdt(lg, regs->edx, regs->ebx); + break; + case LHCALL_NEW_PGTABLE: + guest_new_pagetable(lg, regs->edx); + break; + case LHCALL_FLUSH_TLB: + if (regs->edx) + guest_pagetable_clear_all(lg); + else + guest_pagetable_flush_user(lg); + break; + case LHCALL_LOAD_IDT_ENTRY: + load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_SET_STACK: + guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_TS: + lg->ts = regs->edx; + break; + case LHCALL_TIMER_READ: { + u32 now = jiffies; + mb(); + regs->eax = now - lg->last_timer; + lg->last_timer = now; + break; + } + case LHCALL_TIMER_START: + lg->timer_on = 1; + if (regs->edx != HZ) + kill_guest(lg, "Bad clock speed %i", regs->edx); + lg->last_timer = jiffies; + break; + case LHCALL_HALT: + lg->halted = 1; + break; + case LHCALL_GET_WALLCLOCK: { + struct timeval tv; + do_gettimeofday(&tv); + regs->eax = tv.tv_sec; + break; + } + case LHCALL_BIND_DMA: + regs->eax = bind_dma(lg, regs->edx, regs->ebx, + regs->ecx >> 8, regs->ecx & 0xFF); + break; + case LHCALL_SEND_DMA: + return send_dma(lg, regs->edx, regs->ebx); + case LHCALL_SET_PTE: + guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx); + break; + case LHCALL_SET_UNKNOWN_PTE: + guest_pagetable_clear_all(lg); + break; + case LHCALL_SET_PUD: + guest_set_pud(lg, regs->edx, regs->ebx); + break; + case LHCALL_LOAD_TLS: + guest_load_tls(lg, (struct desc_struct __user*)regs->edx); + break; + default: + kill_guest(lg, "Bad hypercall %i\n", regs->eax); + } + return 0; +} + +#define log(...) \ + do { \ + mm_segment_t oldfs = get_fs(); \ + char buf[100]; \ + sprintf(buf, "lguest:" __VA_ARGS__); \ + set_fs(KERNEL_DS); \ + sys_write(1, buf, strlen(buf)); \ + set_fs(oldfs); \ + } while(0) + +/* We always do queued calls before actual hypercall. 
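+ * The ring discipline implied here: 0xFF in hcall_status[] marks a free
+ * slot.  The guest fills hcalls[n] and flips hcall_status[n] away from
+ * 0xFF; we consume in order from next_hcall and write 0xFF back when
+ * done.  A hypothetical guest-side enqueue, purely to illustrate the
+ * ordering (async_hcall() and hcall() are assumed names, not part of
+ * this patch):
+ *
+ *	static unsigned int next_slot;
+ *	static void async_hcall(u32 eax, u32 edx, u32 ecx, u32 ebx)
+ *	{
+ *		while (lguest_data.hcall_status[next_slot] != 0xFF)
+ *			hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); // full: drain
+ *		lguest_data.hcalls[next_slot].eax = eax;
+ *		lguest_data.hcalls[next_slot].edx = edx;
+ *		lguest_data.hcalls[next_slot].ecx = ecx;
+ *		lguest_data.hcalls[next_slot].ebx = ebx;
+ *		wmb();	// arguments visible before the status flip
+ *		lguest_data.hcall_status[next_slot] = 0;
+ *		next_slot = (next_slot + 1) % LHCALL_RING_SIZE;
+ *	}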
*/ +int do_async_hcalls(struct lguest *lg) +{ + unsigned int i, pending; + u8 st[LHCALL_RING_SIZE]; + + if (!lg->lguest_data) + return 0; + + copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)); + for (i = 0; i < ARRAY_SIZE(st); i++) { + struct lguest_regs regs; + unsigned int n = lg->next_hcall; + + if (st[n] == 0xFF) + break; + + if (++lg->next_hcall == LHCALL_RING_SIZE) + lg->next_hcall = 0; + + get_user(regs.eax, &lg->lguest_data->hcalls[n].eax); + get_user(regs.edx, &lg->lguest_data->hcalls[n].edx); + get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx); + get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx); + pending = do_hcall(lg, ®s); + put_user(0xFF, &lg->lguest_data->hcall_status[n]); + if (pending) + return 1; + } + + set_wakeup_process(lg, NULL); + return 0; +} + +int hypercall(struct lguest *lg, struct lguest_regs *regs) +{ + int pending; + + if (!lg->lguest_data) { + if (regs->eax != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %i before LGUEST_INIT", + regs->eax); + return 0; + } + + lg->lguest_data = (struct lguest_data __user *)regs->edx; + /* We check here so we can simply copy_to_user/from_user */ + if (!lguest_address_ok(lg, (long)lg->lguest_data) + || !lguest_address_ok(lg, (long)(lg->lguest_data+1))){ + kill_guest(lg, "bad guest page %p", lg->lguest_data); + return 0; + } + get_user(lg->noirq_start, &lg->lguest_data->noirq_start); + get_user(lg->noirq_end, &lg->lguest_data->noirq_end); + /* We reserve the top pgd entry. */ + put_user(4U*1024*1024, &lg->lguest_data->reserve_mem); + put_user(lg->guestid, &lg->lguest_data->guestid); + put_user(clocksource_khz2mult(tsc_khz, 22), + &lg->lguest_data->clock_mult); + return 0; + } + pending = do_hcall(lg, regs); + set_wakeup_process(lg, NULL); + return pending; +} =================================================================== --- /dev/null +++ b/arch/i386/lguest/hypervisor.S @@ -0,0 +1,170 @@ +/* This code sits at 0xFFFF1000 to do the low-level guest<->host switch. + Layout is: default_idt_entries (1k), then switch_to_guest entry point. */ +#include +#include +#include "lg.h" + +#define SAVE_REGS \ + /* Save old guest/host state */ \ + pushl %es; \ + pushl %ds; \ + pushl %fs; \ + pushl %eax; \ + pushl %gs; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + +.text +ENTRY(_start) /* ld complains unless _start is defined. */ +/* %eax contains ptr to target guest state, %edx contains host idt. */ +switch_to_guest: + pushl %ss + SAVE_REGS + /* Save old stack, switch to guest's stack. */ + movl %esp, LGUEST_STATE_host_stackptr(%eax) + movl %eax, %esp + /* Guest registers will be at: %esp-$LGUEST_STATE_regs */ + addl $LGUEST_STATE_regs, %esp + /* Switch to guest's GDT, IDT. */ + lgdt LGUEST_STATE_gdt(%eax) + lidt LGUEST_STATE_idt(%eax) + /* Save page table top. */ + movl %cr3, %ebx + movl %ebx, LGUEST_STATE_host_pgdir(%eax) + /* Set host's TSS to available (clear byte 5 bit 2). */ + movl (LGUEST_STATE_host_gdt+2)(%eax), %ebx + andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx) + /* Switch to guest page tables */ + popl %ebx + movl %ebx, %cr3 + /* Switch to guest's TSS. 
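+	   ltr sets the busy bit in the TSS descriptor and faults if it is
+	   already set, which is why each direction of this switch clears
+	   the *other* side's busy bit first (the "clear byte 5 bit 2"
+	   lines above and in SWITCH_TO_HOST).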
*/ + movl $(GDT_ENTRY_TSS*8), %ebx + ltr %bx + /* Restore guest regs */ + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %gs + /* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */ + addl $(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax + movw $0,(%eax) + movw $0,8(%eax) + movw $0,16(%eax) + popl %eax + popl %fs + popl %ds + popl %es + /* Skip error code and trap number */ + addl $8, %esp + iret + +#define SWITCH_TO_HOST \ + SAVE_REGS; \ + /* Save old pgdir */ \ + movl %cr3, %eax; \ + pushl %eax; \ + /* Load lguest ds segment for convenience. */ \ + movl $(LGUEST_DS), %eax; \ + movl %eax, %ds; \ + /* Now figure out who we are */ \ + movl %esp, %eax; \ + subl $LGUEST_STATE_regs, %eax; \ + /* Switch to host page tables (GDT, IDT and stack are in host \ + mem, so need this first) */ \ + movl LGUEST_STATE_host_pgdir(%eax), %ebx; \ + movl %ebx, %cr3; \ + /* Set guest's TSS to available (clear byte 5 bit 2). */ \ + andb $0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\ + /* Switch to host's GDT & IDT. */ \ + lgdt LGUEST_STATE_host_gdt(%eax); \ + lidt LGUEST_STATE_host_idt(%eax); \ + /* Switch to host's stack. */ \ + movl LGUEST_STATE_host_stackptr(%eax), %esp; \ + /* Switch to host's TSS */ \ + movl $(GDT_ENTRY_TSS*8), %eax; \ + ltr %ax; \ + /* Restore host regs */ \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %gs; \ + popl %eax; \ + popl %fs; \ + popl %ds; \ + popl %es; \ + popl %ss + +/* Return to run_guest_once. */ +return_to_host: + SWITCH_TO_HOST + iret + +deliver_to_host: + SWITCH_TO_HOST +decode_idt_and_jmp: + /* Decode IDT and jump to hosts' irq handler. When that does iret, it + * will return to run_guest_once. This is a feature. */ + /* We told gcc we'd clobber edx and eax... */ + movl LGUEST_STATE_trapnum(%eax), %eax + leal (%edx,%eax,8), %eax + movzwl (%eax),%edx + movl 4(%eax), %eax + xorw %ax, %ax + orl %eax, %edx + jmp *%edx + +deliver_to_host_with_errcode: + SWITCH_TO_HOST + pushl LGUEST_STATE_errcode(%eax) + jmp decode_idt_and_jmp + +/* Real hardware interrupts are delivered straight to the host. Others + cause us to return to run_guest_once so it can decide what to do. Note + that some of these are overridden by the guest to deliver directly, and + never enter here (see load_guest_idt_entry). */ +.macro IRQ_STUB N TARGET + .data; .long 1f; .text; 1: + /* Make an error number for most traps, which don't have one. */ + .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) + pushl $0 + .endif + pushl $\N + jmp \TARGET + ALIGN +.endm + +.macro IRQ_STUBS FIRST LAST TARGET + irq=\FIRST + .rept \LAST-\FIRST+1 + IRQ_STUB irq \TARGET + irq=irq+1 + .endr +.endm + +/* We intercept every interrupt, because we may need to switch back to + * host. Unfortunately we can't tell them apart except by entry + * point, so we need 256 entry points. + */ +irq_stubs: +.data +default_idt_entries: +.text + IRQ_STUBS 0 1 return_to_host /* First two traps */ + IRQ_STUB 2 deliver_to_host_with_errcode /* NMI */ + IRQ_STUBS 3 31 return_to_host /* Rest of traps */ + IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */ + IRQ_STUB 128 return_to_host /* System call (overridden) */ + IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */ + +/* Everything after this is used for the lguest_state structs. 
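+   core.c's __lguest_states() points at HYPE_ADDR + sizeof(hypervisor_blob),
+   i.e. the first byte after this file; MAX_LGUEST_GUESTS is simply how
+   many of those structs fit between here and HYPERVISOR_SIZE.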
*/ +ALIGN =================================================================== --- /dev/null +++ b/arch/i386/lguest/interrupts_and_traps.c @@ -0,0 +1,221 @@ +#include +#include "lg.h" + +static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val) +{ + lhwrite_u32(lg, (u32)--(*gstack), val); +} + +int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err) +{ + u32 __user *gstack; + u32 eflags, ss, irq_enable; + struct lguest_regs *regs = &lg->state->regs; + + if (!trap->addr) + return 0; + + /* If they want a ring change, we use new stack and push old ss/esp */ + if ((regs->ss&0x3) != GUEST_DPL) { + gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1); + ss = lg->state->tss.ss1; + push_guest_stack(lg, &gstack, regs->ss); + push_guest_stack(lg, &gstack, regs->esp); + } else { + gstack = (u32 __user *)guest_pa(lg, regs->esp); + ss = regs->ss; + } + + /* We use IF bit in eflags to indicate whether irqs were disabled + (it's always 0, since irqs are enabled when guest is running). */ + eflags = regs->eflags; + get_user(irq_enable, &lg->lguest_data->irq_enabled); + eflags |= (irq_enable & 512); + + push_guest_stack(lg, &gstack, eflags); + push_guest_stack(lg, &gstack, regs->cs); + push_guest_stack(lg, &gstack, regs->eip); + + if (has_err) + push_guest_stack(lg, &gstack, regs->errcode); + + /* Change the real stack so hypervisor returns to trap handler */ + regs->ss = ss; + regs->esp = (u32)gstack + lg->page_offset; + regs->cs = (__KERNEL_CS|GUEST_DPL); + regs->eip = trap->addr; + + /* GS will be neutered on way back to guest. */ + put_user(0, &lg->lguest_data->gs_gpf_eip); + + /* Disable interrupts for an interrupt gate. */ + if (trap->disable_interrupts) + put_user(0, &lg->lguest_data->irq_enabled); + return 1; +} + +void maybe_do_interrupt(struct lguest *lg) +{ + unsigned int irq; + DECLARE_BITMAP(irqs, LGUEST_IRQS); + + if (!lg->lguest_data) + return; + + /* If timer has changed, set timer interrupt. */ + if (lg->timer_on && jiffies != lg->last_timer) + set_bit(0, lg->irqs_pending); + + /* Mask out any interrupts they have blocked. */ + copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs)); + bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS); + + irq = find_first_bit(irqs, LGUEST_IRQS); + if (irq >= LGUEST_IRQS) + return; + + /* If they're halted, we re-enable interrupts. */ + if (lg->halted) { + /* Re-enable interrupts. */ + put_user(512, &lg->lguest_data->irq_enabled); + lg->halted = 0; + } else { + /* Maybe they have interrupts disabled? */ + u32 irq_enabled; + get_user(irq_enabled, &lg->lguest_data->irq_enabled); + if (!irq_enabled) + return; + } + + if (lg->interrupt[irq].addr != 0) { + clear_bit(irq, lg->irqs_pending); + reflect_trap(lg, &lg->interrupt[irq], 0); + } +} + +void check_bug_kill(struct lguest *lg) +{ +#ifdef CONFIG_BUG + u32 eip = lg->state->regs.eip - PAGE_OFFSET; + u16 insn; + + /* This only works for addresses in linear mapping... 
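+	   and assumes the guest uses the same BUG() encoding as the host:
+	   a ud2 instruction (0x0f 0x0b, hence the little-endian 0x0b0f
+	   compare below), followed under CONFIG_DEBUG_BUGVERBOSE by an
+	   inline u16 line number and a u32 pointer to the file name.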
*/ + if (lg->state->regs.eip < PAGE_OFFSET) + return; + lhread(lg, &insn, eip, sizeof(insn)); + if (insn == 0x0b0f) { +#ifdef CONFIG_DEBUG_BUGVERBOSE + u16 l; + u32 f; + char file[128]; + lhread(lg, &l, eip+sizeof(insn), sizeof(l)); + lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f)); + lhread(lg, file, f - PAGE_OFFSET, sizeof(file)); + file[sizeof(file)-1] = 0; + kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l); +#else + kill_guest(lg, "BUG() at %#x", eip); +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + } +#endif /* CONFIG_BUG */ +} + +static void copy_trap(struct lguest *lg, + struct host_trap *trap, + const struct desc_struct *desc) +{ + u8 type = ((desc->b >> 8) & 0xF); + + /* Not present? */ + if (!(desc->b & 0x8000)) { + trap->addr = 0; + return; + } + if (type != 0xE && type != 0xF) + kill_guest(lg, "bad IDT type %i", type); + trap->disable_interrupts = (type == 0xE); + trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000)); +} + +/* FIXME: Put this in hypervisor.S and do something clever with relocs? */ +static u8 tramp[] += { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */ + 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00, + /* movl 0, %ss:lguest_data.gs_gpf_eip */ + 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */ +}; +#define TRAMP_MOVL_TARGET_OFF 7 +#define TRAMP_JMP_TARGET_OFF 16 + +static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr) +{ + u32 addr, off; + + off = sizeof(tramp)*i; + memcpy(lg->trap_page + off, tramp, sizeof(tramp)); + + /* 0 is to be placed in lguest_data.gs_gpf_eip. */ + addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset; + memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4); + + /* Address is relative to where end of jmp will be. */ + addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp)); + memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4); + return (-4*1024*1024) + off; +} + +/* We bounce through the trap page, for two reasons: firstly, we need + the interrupt destination always mapped, to avoid double faults, + secondly we want to reload %gs to make it innocuous on entering kernel. + */ +static void setup_idt(struct lguest *lg, + unsigned int i, + const struct desc_struct *desc) +{ + u8 type = ((desc->b >> 8) & 0xF); + u32 taddr; + + /* Not present? */ + if (!(desc->b & 0x8000)) { + /* FIXME: When we need this, we'll know... */ + if (lg->state->idt_table[i].a & 0x8000) + kill_guest(lg, "removing interrupts not supported"); + return; + } + + /* We could reflect and disable interrupts, but guest can do itself. */ + if (type != 0xF) + kill_guest(lg, "bad direct IDT %i type %i", i, type); + + taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000)); + + lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16) + | (taddr & 0x0000FFFF)); + lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000); +} + +void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high) +{ + struct desc_struct d = { low, high }; + + /* Ignore NMI, doublefault, hypercall, spurious interrupt. */ + if (i == 2 || i == 8 || i == 15 || i == LGUEST_TRAP_ENTRY) + return; + /* FIXME: We should handle debug and int3 */ + else if (i == 1 || i == 3) + return; + /* We intercept page fault, general protection fault and fpu missing */ + else if (i == 13) + copy_trap(lg, &lg->gpf_trap, &d); + else if (i == 14) + copy_trap(lg, &lg->page_trap, &d); + else if (i == 7) + copy_trap(lg, &lg->fpu_trap, &d); + /* Other traps go straight to guest. 
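+	   ("straight" still means via the trampolines setup_idt() builds,
+	   so the entry point stays mapped and %gs is reloaded on the way
+	   in; compare the copy_trap() cases above, where the host wants
+	   first look.)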
*/ + else if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR) + setup_idt(lg, i, &d); + /* A virtual interrupt */ + else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS) + copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d); +} + =================================================================== --- /dev/null +++ b/arch/i386/lguest/io.c @@ -0,0 +1,413 @@ +/* Simple I/O model for guests, based on shared memory. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "lg.h" + +static struct list_head dma_hash[64]; + +/* FIXME: allow multi-page lengths. */ +static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) +{ + unsigned int i; + + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (!dma->len[i]) + return 1; + if (!lguest_address_ok(lg, dma->addr[i])) + goto kill; + if (dma->len[i] > PAGE_SIZE) + goto kill; + /* We could do over a page, but is it worth it? */ + if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) + goto kill; + } + return 1; + +kill: + kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]); + return 0; +} + +static unsigned int hash(const union futex_key *key) +{ + return jhash2((u32*)&key->both.word, + (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + key->both.offset) + % ARRAY_SIZE(dma_hash); +} + +/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */ +static void unlink_dma(struct lguest_dma_info *dmainfo) +{ + BUG_ON(down_trylock(&lguest_lock) == 0); + dmainfo->interrupt = 0; + list_del(&dmainfo->list); + drop_futex_key_refs(&dmainfo->key); +} + +static inline int key_eq(const union futex_key *a, const union futex_key *b) +{ + return (a->both.word == b->both.word + && a->both.ptr == b->both.ptr + && a->both.offset == b->both.offset); +} + +static u32 unbind_dma(struct lguest *lg, + const union futex_key *key, + unsigned long dmas) +{ + int i, ret = 0; + + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { + unlink_dma(&lg->dma[i]); + ret = 1; + break; + } + } + return ret; +} + +u32 bind_dma(struct lguest *lg, + unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt) +{ + unsigned int i; + u32 ret = 0; + union futex_key key; + + if (interrupt >= LGUEST_IRQS) + return 0; + + down(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)addr, &key) != 0) { + kill_guest(lg, "bad dma address %#lx", addr); + goto unlock; + } + get_futex_key_refs(&key); + + if (interrupt == 0) + ret = unbind_dma(lg, &key, dmas); + else { + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt == 0) { + lg->dma[i].dmas = dmas; + lg->dma[i].num_dmas = numdmas; + lg->dma[i].next_dma = 0; + lg->dma[i].key = key; + lg->dma[i].guestid = lg->guestid; + 
lg->dma[i].interrupt = interrupt; + list_add(&lg->dma[i].list, + &dma_hash[hash(&key)]); + ret = 1; + goto unlock; + } + } + } + drop_futex_key_refs(&key); +unlock: + up_read(¤t->mm->mmap_sem); + up(&lguest_lock); + return ret; +} + +/* lhread from another guest */ +static int lhread_other(struct lguest *lg, + void *buf, u32 addr, unsigned bytes) +{ + if (addr + bytes < addr + || !lguest_address_ok(lg, addr+bytes) + || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { + memset(buf, 0, bytes); + kill_guest(lg, "bad address in registered DMA struct"); + return 0; + } + return 1; +} + +/* lhwrite to another guest */ +static int lhwrite_other(struct lguest *lg, u32 addr, + const void *buf, unsigned bytes) +{ + if (addr + bytes < addr + || !lguest_address_ok(lg, addr+bytes) + || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) + != bytes)) { + kill_guest(lg, "bad address writing to registered DMA"); + return 0; + } + return 1; +} + +static u32 copy_data(const struct lguest_dma *src, + const struct lguest_dma *dst, + struct page *pages[]) +{ + unsigned int totlen, si, di, srcoff, dstoff; + void *maddr = NULL; + + totlen = 0; + si = di = 0; + srcoff = dstoff = 0; + while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] + && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { + u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); + + if (!maddr) + maddr = kmap(pages[di]); + + /* FIXME: This is not completely portable, since + archs do different things for copy_to_user_page. */ + if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, + (void *__user)src->addr[si], len) != 0) { + totlen = 0; + break; + } + + totlen += len; + srcoff += len; + dstoff += len; + if (srcoff == src->len[si]) { + si++; + srcoff = 0; + } + if (dstoff == dst->len[di]) { + kunmap(pages[di]); + maddr = NULL; + di++; + dstoff = 0; + } + } + + if (maddr) + kunmap(pages[di]); + + return totlen; +} + +/* Src is us, ie. current. */ +static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, + struct lguest *dstlg, const struct lguest_dma *dst) +{ + int i; + u32 ret; + struct page *pages[LGUEST_MAX_DMA_SECTIONS]; + + if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) + return 0; + + /* First get the destination pages */ + for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { + if (dst->len[i] == 0) + break; + if (get_user_pages(dstlg->tsk, dstlg->mm, + dst->addr[i], 1, 1, 1, pages+i, NULL) + != 1) { + ret = 0; + goto drop_pages; + } + } + + /* Now copy until we run out of src or dst. */ + ret = copy_data(src, dst, pages); + +drop_pages: + while (--i >= 0) + put_page(pages[i]); + return ret; +} + +/* We cache one process to wakeup: helps for batching & wakes outside locks. */ +void set_wakeup_process(struct lguest *lg, struct task_struct *p) +{ + if (p == lg->wake) + return; + + if (lg->wake) { + wake_up_process(lg->wake); + put_task_struct(lg->wake); + } + lg->wake = p; + if (lg->wake) + get_task_struct(lg->wake); +} + +static int dma_transfer(struct lguest *srclg, + unsigned long udma, + struct lguest_dma_info *dst) +{ + struct lguest_dma dst_dma, src_dma; + struct lguest *dstlg; + u32 i, dma = 0; + + dstlg = &lguests[dst->guestid]; + /* Get our dma list. */ + lhread(srclg, &src_dma, udma, sizeof(src_dma)); + + /* We can't deadlock against them dmaing to us, because this + * is all under the lguest_lock. 
*/ + down_read(&dstlg->mm->mmap_sem); + + for (i = 0; i < dst->num_dmas; i++) { + dma = (dst->next_dma + i) % dst->num_dmas; + if (!lhread_other(dstlg, &dst_dma, + dst->dmas + dma * sizeof(struct lguest_dma), + sizeof(dst_dma))) { + goto fail; + } + if (!dst_dma.used_len) + break; + } + if (i != dst->num_dmas) { + unsigned long used_lenp; + unsigned int ret; + + ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); + /* Put used length in src. */ + lhwrite_u32(srclg, + udma+offsetof(struct lguest_dma, used_len), ret); + if (ret == 0 && src_dma.len[0] != 0) + goto fail; + + /* Make sure destination sees contents before length. */ + mb(); + used_lenp = dst->dmas + + dma * sizeof(struct lguest_dma) + + offsetof(struct lguest_dma, used_len); + lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); + dst->next_dma++; + } + up_read(&dstlg->mm->mmap_sem); + + /* Do this last so dst doesn't simply sleep on lock. */ + set_bit(dst->interrupt, dstlg->irqs_pending); + set_wakeup_process(srclg, dstlg->tsk); + return i == dst->num_dmas; + +fail: + up_read(&dstlg->mm->mmap_sem); + return 0; +} + +int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma) +{ + union futex_key key; + int pending = 0, empty = 0; + +again: + down(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)addr, &key) != 0) { + kill_guest(lg, "bad sending DMA address"); + goto unlock; + } + /* Shared mapping? Look for other guests... */ + if (key.shared.offset & 1) { + struct lguest_dma_info *i, *n; + list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) { + if (i->guestid == lg->guestid) + continue; + if (!key_eq(&key, &i->key)) + continue; + + empty += dma_transfer(lg, udma, i); + break; + } + if (empty == 1) { + /* Give any recipients one chance to restock. */ + up_read(¤t->mm->mmap_sem); + up(&lguest_lock); + yield(); + empty++; + goto again; + } + pending = 0; + } else { + /* Private mapping: tell our userspace. */ + lg->dma_is_pending = 1; + lg->pending_dma = udma; + lg->pending_addr = addr; + pending = 1; + } +unlock: + up_read(¤t->mm->mmap_sem); + up(&lguest_lock); + return pending; +} + +void release_all_dma(struct lguest *lg) +{ + unsigned int i; + + BUG_ON(down_trylock(&lguest_lock) == 0); + + down_read(&lg->mm->mmap_sem); + for (i = 0; i < LGUEST_MAX_DMA; i++) { + if (lg->dma[i].interrupt) + unlink_dma(&lg->dma[i]); + } + up_read(&lg->mm->mmap_sem); +} + +/* Userspace wants a dma buffer from this guest. 
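+   This is the LHREQ_GETDMA path from lguest_user.c: find the first
+   lguest_dma registered under this key whose used_len is still zero,
+   and hand its guest address back so the launcher can fill it in the
+   way a sending guest would.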
*/ +unsigned long get_dma_buffer(struct lguest *lg, + unsigned long addr, unsigned long *interrupt) +{ + unsigned long ret = 0; + union futex_key key; + struct lguest_dma_info *i; + + down(&lguest_lock); + down_read(¤t->mm->mmap_sem); + if (get_futex_key((u32 __user *)addr, &key) != 0) { + kill_guest(lg, "bad registered DMA buffer"); + goto unlock; + } + list_for_each_entry(i, &dma_hash[hash(&key)], list) { + if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { + unsigned int j; + for (j = 0; j < i->num_dmas; j++) { + struct lguest_dma dma; + + ret = i->dmas + j * sizeof(struct lguest_dma); + lhread(lg, &dma, ret, sizeof(dma)); + if (dma.used_len == 0) + break; + } + *interrupt = i->interrupt; + break; + } + } +unlock: + up_read(¤t->mm->mmap_sem); + up(&lguest_lock); + return ret; +} + +void lguest_io_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dma_hash); i++) + INIT_LIST_HEAD(&dma_hash[i]); +} =================================================================== --- /dev/null +++ b/arch/i386/lguest/lguest_user.c @@ -0,0 +1,242 @@ +/* Userspace control of the guest, via /dev/lguest. */ +#include +#include +#include +#include "lg.h" + +static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir, + unsigned long start) +{ + struct lguest_state *guest = &__lguest_states()[num]; + unsigned int i; + const long *def = __lguest_default_idt_entries(); + struct lguest_regs *regs; + + guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + guest->gdt.size = GDT_ENTRIES*8-1; + guest->gdt.address = (unsigned long)&guest->gdt_table; + + /* Other guest's IDTs are initialized from default. */ + guest->idt.size = 8 * IDT_ENTRIES; + guest->idt.address = (long)guest->idt_table; + for (i = 0; i < IDT_ENTRIES; i++) { + u32 flags = 0x8e00; + + /* They can't "int" into any of them except hypercall. */ + if (i == LGUEST_TRAP_ENTRY) + flags |= (GUEST_DPL << 13); + + guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF); + guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags; + } + + memset(&guest->tss, 0, sizeof(guest->tss)); + guest->tss.ss0 = LGUEST_DS; + guest->tss.esp0 = (unsigned long)(guest+1); + guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */ + + /* Write out stack in format lguest expects, so we can switch to it. */ + regs = &guest->regs; + regs->cr3 = __pa(pgdir); + regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0; + regs->edi = LGUEST_MAGIC_EDI; + regs->ebp = LGUEST_MAGIC_EBP; + regs->esi = LGUEST_MAGIC_ESI; + regs->gs = regs->fs = 0; + regs->ds = regs->es = __KERNEL_DS|GUEST_DPL; + regs->trapnum = regs->errcode = 0; + regs->eip = start; + regs->cs = __KERNEL_CS|GUEST_DPL; + regs->eflags = 0x202; /* Interrupts enabled. */ + regs->ss = __KERNEL_DS|GUEST_DPL; + + if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table), + &guest->regs, &guest->tss)) + return NULL; + + return guest; +} + +/* + addr */ +static long user_get_dma(struct lguest *lg, const u32 __user *input) +{ + unsigned long addr, udma, irq; + + if (get_user(addr, input) != 0) + return -EFAULT; + udma = get_dma_buffer(lg, addr, &irq); + if (!udma) + return -ENOENT; + + /* We put irq number in udma->used_len. 
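+	   used_len is safe to scribble on, since get_dma_buffer() only
+	   returns buffers whose used_len was still zero; the launcher
+	   presumably reads the irq back out before filling in the real
+	   length.  Roughly (the userspace side is an assumption):
+
+		u32 req[2] = { LHREQ_GETDMA, key_addr };
+		long udma = write(fd, req, sizeof(req));
+		... fetch irq from used_len, fill the buffer,
+		    store the real used_len, then raise that irq
+		    with an LHREQ_IRQ write ...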
*/ + lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); + return udma; +} + +/* + irq */ +static int user_send_irq(struct lguest *lg, const u32 __user *input) +{ + u32 irq; + + if (get_user(irq, input) != 0) + return -EFAULT; + if (irq >= LGUEST_IRQS) + return -EINVAL; + set_bit(irq, lg->irqs_pending); + return 0; +} + +static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return -EINVAL; + + if (lg->dead) { + size_t len; + + if (lg->dead == (void *)-1) + return -ENOMEM; + + len = min(size, strlen(lg->dead)+1); + if (copy_to_user(user, lg->dead, len) != 0) + return -EFAULT; + return len; + } + + if (lg->dma_is_pending) + lg->dma_is_pending = 0; + + return run_guest(lg, user); +} + +/* Take: pfnlimit, pgdir, start, pageoffset. */ +static int initialize(struct file *file, const u32 __user *input) +{ + struct lguest *lg; + int err, i; + u32 args[4]; + + if (file->private_data) + return -EBUSY; + + if (copy_from_user(args, input, sizeof(args)) != 0) + return -EFAULT; + + if (args[1] <= PAGE_SIZE) + return -EINVAL; + + down(&lguest_lock); + i = find_free_guest(); + if (i < 0) { + err = -ENOSPC; + goto unlock; + } + lg = &lguests[i]; + lg->guestid = i; + lg->pfn_limit = args[0]; + lg->page_offset = args[3]; + + lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!lg->trap_page) { + err = -ENOMEM; + goto release_guest; + } + + err = init_guest_pagetable(lg, args[1]); + if (err) + goto free_trap_page; + + lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]); + if (!lg->state) { + err = -ENOEXEC; + goto release_pgtable; + } + up(&lguest_lock); + + lg->tsk = current; + lg->mm = get_task_mm(current); + file->private_data = lg; + return sizeof(args); + +release_pgtable: + free_guest_pagetable(lg); +free_trap_page: + free_page((long)lg->trap_page); +release_guest: + memset(lg, 0, sizeof(*lg)); +unlock: + up(&lguest_lock); + return err; +} + +static ssize_t write(struct file *file, const char __user *input, + size_t size, loff_t *off) +{ + struct lguest *lg = file->private_data; + u32 req; + + if (get_user(req, input) != 0) + return -EFAULT; + input += sizeof(req); + + if (req != LHREQ_INITIALIZE && !lg) + return -EINVAL; + if (lg && lg->dead) + return -ENOENT; + + switch (req) { + case LHREQ_INITIALIZE: + return initialize(file, (const u32 __user *)input); + case LHREQ_GETDMA: + return user_get_dma(lg, (const u32 __user *)input); + case LHREQ_IRQ: + return user_send_irq(lg, (const u32 __user *)input); + default: + return -EINVAL; + } +} + +static int close(struct inode *inode, struct file *file) +{ + struct lguest *lg = file->private_data; + + if (!lg) + return 0; + + down(&lguest_lock); + release_all_dma(lg); + free_page((long)lg->trap_page); + free_guest_pagetable(lg); + mmput(lg->mm); + if (lg->dead != (void *)1) + kfree(lg->dead); + memset(lg->state, 0, sizeof(*lg->state)); + memset(lg, 0, sizeof(*lg)); + up(&lguest_lock); + return 0; +} + +static struct file_operations lguest_fops = { + .owner = THIS_MODULE, + .release = close, + .write = write, + .read = read, +}; +static struct miscdevice lguest_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "lguest", + .fops = &lguest_fops, +}; + +int __init lguest_device_init(void) +{ + return misc_register(&lguest_dev); +} + +void __exit lguest_device_remove(void) +{ + misc_deregister(&lguest_dev); +} =================================================================== --- /dev/null +++ b/arch/i386/lguest/page_tables.c @@ -0,0 +1,374 
@@ +/* Shadow page table operations. + * Copyright (C) Rusty Russell IBm Corporation 2006. + * GPL v2 and any later version */ +#include +#include +#include +#include +#include +#include +#include "lg.h" + +#define PTES_PER_PAGE_SHIFT 10 +#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) +#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1) + +static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL }; +#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu) + +static unsigned vaddr_to_pgd(unsigned long vaddr) +{ + return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); +} + +/* These access the real versions. */ +static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr) +{ + unsigned int index = vaddr_to_pgd(vaddr); + + if (index >= HYPERVISOR_PGD_ENTRY) { + kill_guest(lg, "attempt to access hypervisor pages"); + index = 0; + } + return &lg->pgdirs[i].pgdir[index]; +} + +static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr) +{ + u32 *page = __va(top&PAGE_MASK); + BUG_ON(!(top & _PAGE_PRESENT)); + return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; +} + +/* These access the guest versions. */ +static u32 gtoplev(struct lguest *lg, unsigned long vaddr) +{ + unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); + return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(u32); +} + +static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr) +{ + u32 gpage = (gtop&PAGE_MASK); + BUG_ON(!(gtop & _PAGE_PRESENT)); + return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(u32); +} + +static void release_pte(u32 pte) +{ + if (pte & _PAGE_PRESENT) + put_page(pfn_to_page(pte >> PAGE_SHIFT)); +} + +/* Do a virtual -> physical mapping on a user page. */ +static unsigned long get_pfn(unsigned long virtpfn, int write) +{ + struct vm_area_struct *vma; + struct page *page; + unsigned long ret = -1UL; + + down_read(¤t->mm->mmap_sem); + if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT, + 1, write, 1, &page, &vma) == 1) + ret = page_to_pfn(page); + up_read(¤t->mm->mmap_sem); + return ret; +} + +static u32 check_pgtable_entry(struct lguest *lg, u32 entry) +{ + if ((entry & (_PAGE_PWT|_PAGE_PSE)) + || (entry >> PAGE_SHIFT) >= lg->pfn_limit) + kill_guest(lg, "bad page table entry"); + return entry & ~_PAGE_GLOBAL; +} + +static u32 get_pte(struct lguest *lg, u32 entry, int write) +{ + u32 pfn; + + pfn = get_pfn(entry >> PAGE_SHIFT, write); + if (pfn == -1UL) { + kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT); + return 0; + } + return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1))); +} + +/* FIXME: We hold reference to pages, which prevents them from being + swapped. It'd be nice to have a callback when Linux wants to swap out. */ + +/* We fault pages in, which allows us to update accessed/dirty bits. + * Return NULL or the pte page. */ +static int page_in(struct lguest *lg, u32 vaddr, unsigned flags) +{ + u32 gtop, gpte; + u32 *top, *pte, *ptepage; + u32 val; + + gtop = gtoplev(lg, vaddr); + val = lhread_u32(lg, gtop); + if (!(val & _PAGE_PRESENT)) + return 0; + + top = toplev(lg, lg->pgdidx, vaddr); + if (!(*top & _PAGE_PRESENT)) { + /* Get a PTE page for them. */ + ptepage = (void *)get_zeroed_page(GFP_KERNEL); + /* FIXME: Steal from self in this case? 
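+		   i.e. rather than killing the guest on host OOM we could
+		   release one of this guest's own shadow PTE pages (the
+		   way release_pgd() does) and retry.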
*/ + if (!ptepage) { + kill_guest(lg, "out of memory allocating pte page"); + return 0; + } + val = check_pgtable_entry(lg, val); + *top = (__pa(ptepage) | (val & (PAGE_SIZE-1))); + } else + ptepage = __va(*top & PAGE_MASK); + + gpte = gpteof(lg, val, vaddr); + val = lhread_u32(lg, gpte); + + /* No page, or write to readonly page? */ + if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW))) + return 0; + + pte = pteof(lg, *top, vaddr); + val = check_pgtable_entry(lg, val) | flags; + + /* We're done with the old pte. */ + release_pte(*pte); + + /* We don't make it writable if this isn't a write: later + * write will fault so we can set dirty bit in guest. */ + if (val & _PAGE_DIRTY) + *pte = get_pte(lg, val, 1); + else + *pte = get_pte(lg, val & ~_PAGE_RW, 0); + + /* Now we update dirty/accessed on guest. */ + lhwrite_u32(lg, gpte, val); + return 1; +} + +int demand_page(struct lguest *lg, u32 vaddr, int write) +{ + return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED); +} + +void pin_stack_pages(struct lguest *lg) +{ + unsigned int i; + u32 stack = lg->state->tss.esp1; + + for (i = 0; i < lg->stack_pages; i++) + if (!demand_page(lg, stack - i*PAGE_SIZE, 1)) + kill_guest(lg, "bad stack page %i@%#x", i, stack); +} + +static unsigned int find_pgdir(struct lguest *lg, u32 pgtable) +{ + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].cr3 == pgtable) + break; + return i; +} + +static void release_pgd(struct lguest *lg, u32 *pgd) +{ + if (*pgd & _PAGE_PRESENT) { + unsigned int i; + u32 *ptepage = __va(*pgd & ~(PAGE_SIZE-1)); + for (i = 0; i < PTES_PER_PAGE; i++) + release_pte(ptepage[i]); + free_page((long)ptepage); + *pgd = 0; + } +} + +static void flush_user_mappings(struct lguest *lg, int idx) +{ + unsigned int i; + for (i = 0; i < vaddr_to_pgd(lg->page_offset); i++) + release_pgd(lg, lg->pgdirs[idx].pgdir + i); +} + +void guest_pagetable_flush_user(struct lguest *lg) +{ + flush_user_mappings(lg, lg->pgdidx); +} + +static unsigned int new_pgdir(struct lguest *lg, u32 cr3) +{ + unsigned int next; + + next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs); + if (!lg->pgdirs[next].pgdir) { + lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[next].pgdir) + next = lg->pgdidx; + } + lg->pgdirs[next].cr3 = cr3; + /* Release all the non-kernel mappings. 
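+	   If we evicted a previous pgdir, its kernel entries are still
+	   valid (guest_set_pte keeps kernel mappings in sync across every
+	   shadow pgdir), but the user entries belonged to the old cr3.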
*/ + flush_user_mappings(lg, next); + + return next; +} + +void guest_new_pagetable(struct lguest *lg, u32 pgtable) +{ + int newpgdir; + + newpgdir = find_pgdir(lg, pgtable); + if (newpgdir == ARRAY_SIZE(lg->pgdirs)) + newpgdir = new_pgdir(lg, pgtable); + lg->pgdidx = newpgdir; + lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir); + pin_stack_pages(lg); +} + +static void release_all_pagetables(struct lguest *lg) +{ + unsigned int i, j; + + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++) + release_pgd(lg, lg->pgdirs[i].pgdir + j); +} + +void guest_pagetable_clear_all(struct lguest *lg) +{ + release_all_pagetables(lg); + pin_stack_pages(lg); +} + +static void do_set_pte(struct lguest *lg, int idx, + unsigned long vaddr, u32 val) +{ + u32 *top = toplev(lg, idx, vaddr); + if (*top & _PAGE_PRESENT) { + u32 *pte = pteof(lg, *top, vaddr); + release_pte(*pte); + if (val & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + val = check_pgtable_entry(lg, val); + *pte = get_pte(lg, val, val & _PAGE_DIRTY); + } else + *pte = 0; + } +} + +void guest_set_pte(struct lguest *lg, + unsigned long cr3, unsigned long vaddr, u32 val) +{ + /* Kernel mappings must be changed on all top levels. */ + if (vaddr >= lg->page_offset) { + unsigned int i; + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + if (lg->pgdirs[i].pgdir) + do_set_pte(lg, i, vaddr, val); + } else { + int pgdir = find_pgdir(lg, cr3); + if (pgdir != ARRAY_SIZE(lg->pgdirs)) + do_set_pte(lg, pgdir, vaddr, val); + } +} + +void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx) +{ + int pgdir; + + if (idx >= HYPERVISOR_PGD_ENTRY) + return; + + pgdir = find_pgdir(lg, cr3); + if (pgdir < ARRAY_SIZE(lg->pgdirs)) + release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); +} + +int init_guest_pagetable(struct lguest *lg, u32 pgtable) +{ + /* We assume this in flush_user_mappings, so check now */ + if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY) + return -EINVAL; + lg->pgdidx = 0; + lg->pgdirs[lg->pgdidx].cr3 = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (u32*)get_zeroed_page(GFP_KERNEL); + if (!lg->pgdirs[lg->pgdidx].pgdir) + return -ENOMEM; + return 0; +} + +void free_guest_pagetable(struct lguest *lg) +{ + unsigned int i; + + release_all_pagetables(lg); + for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) + free_page((long)lg->pgdirs[i].pgdir); +} + +/* Caller must be preempt-safe */ +void map_trap_page(struct lguest *lg) +{ + int cpu = smp_processor_id(); + + hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT); + + /* Since hypervisor less that 4MB, we simply mug top pte page. 
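+	   "Mug" = steal: entry 0 of this cpu's PTE page maps the guest's
+	   trap page, entries 1..HYPERVISOR_PAGES map the switcher itself
+	   (see populate_hypervisor_pte_page below), and the whole page is
+	   hung off the reserved top PGD slot here.  It's per-cpu, hence
+	   the preempt-safety requirement above.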
*/ + lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] = + (__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL); +} + +static void free_hypervisor_pte_pages(void) +{ + int i; + + for_each_possible_cpu(i) + free_page((long)hypervisor_pte_page(i)); +} + +static __init int alloc_hypervisor_pte_pages(void) +{ + int i; + + for_each_possible_cpu(i) { + hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!hypervisor_pte_page(i)) { + free_hypervisor_pte_pages(); + return -ENOMEM; + } + } + return 0; +} + +static __init void populate_hypervisor_pte_page(int cpu) +{ + int i; + u32 *pte = hypervisor_pte_page(cpu); + + for (i = 0; i < HYPERVISOR_PAGES; i++) { + /* First entry set dynamically in map_trap_page */ + pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT) + | _PAGE_KERNEL_EXEC); + } +} + +__init int init_pagetables(struct page hype_pages[]) +{ + int ret; + unsigned int i; + + ret = alloc_hypervisor_pte_pages(); + if (ret) + return ret; + + for_each_possible_cpu(i) + populate_hypervisor_pte_page(i); + return 0; +} + +__exit void free_pagetables(void) +{ + free_hypervisor_pte_pages(); +} =================================================================== --- /dev/null +++ b/arch/i386/lguest/segments.c @@ -0,0 +1,171 @@ +#include "lg.h" + +/* Dealing with GDT entries is such a horror, I convert to sanity and back */ +struct decoded_gdt_entry +{ + u32 base, limit; + union { + struct { + unsigned type:4; + unsigned dtype:1; + unsigned dpl:2; + unsigned present:1; + unsigned unused:4; + unsigned avl:1; + unsigned mbz:1; + unsigned def:1; + unsigned page_granularity:1; + }; + u16 raw_attributes; + }; +}; + +static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en) +{ + struct decoded_gdt_entry de; + de.base = ((en->a >> 16) | ((en->b & 0xff) << 16) + | (en->b & 0xFF000000)); + de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000)); + de.raw_attributes = (en->b >> 8); + return de; +} + +static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de) +{ + struct desc_struct en; + en.a = ((de->limit & 0xFFFF) | (de->base << 16)); + en.b = (((de->base >> 16) & 0xFF) + | ((((u32)de->raw_attributes) & 0xF0FF) << 8) + | (de->limit & 0xF0000) + | (de->base & 0xFF000000)); + return en; +} + +static int check_desc(const struct decoded_gdt_entry *dec) +{ + return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0); +} + +static void check_segment(const struct desc_struct *gdt, u32 *segreg) +{ + if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000)) + *segreg = 0; +} + +/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */ +static void check_live_segments(const struct desc_struct *gdt, + struct lguest_regs *regs) +{ + check_segment(gdt, ®s->es); + check_segment(gdt, ®s->ds); + check_segment(gdt, ®s->fs); + check_segment(gdt, ®s->gs); +} + +int fixup_gdt_table(struct desc_struct *gdt, unsigned int num, + struct lguest_regs *regs, struct x86_tss *tss) +{ + unsigned int i; + struct decoded_gdt_entry dec; + + for (i = 0; i < num; i++) { + unsigned long base, length; + + /* We override these ones, so we don't care what they give. */ + if (i == GDT_ENTRY_TSS + || i == GDT_ENTRY_LGUEST_CS + || i == GDT_ENTRY_LGUEST_DS + || i == GDT_ENTRY_DOUBLEFAULT_TSS) + continue; + + dec = decode_gdt_entry(&gdt[i]); + if (!dec.present) + continue; + + if (!check_desc(&dec)) + return 0; + + base = dec.base; + length = dec.limit + 1; + if (dec.page_granularity) { + base *= PAGE_SIZE; + length *= PAGE_SIZE; + } + + /* Unacceptable base? 
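+		   A segment based at or above the switcher can never be
+		   made safe by trimming, so reject it outright; lower
+		   bases merely get their limits clipped just below.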
*/ + if (base >= HYPE_ADDR) + return 0; + + /* Wrap around or segment overlaps hypervisor mem? */ + if (!length + || base + length < base + || base + length > HYPE_ADDR) { + /* Trim to edge of hypervisor. */ + length = HYPE_ADDR - base; + if (dec.page_granularity) + dec.limit = (length / PAGE_SIZE) - 1; + else + dec.limit = length - 1; + } + if (dec.dpl == 0) + dec.dpl = GUEST_DPL; + gdt[i] = encode_gdt_entry(&dec); + } + check_live_segments(gdt, regs); + + /* Now put in hypervisor data and code segments. */ + gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + + /* Finally, TSS entry */ + dec.base = (unsigned long)tss; + dec.limit = sizeof(*tss)-1; + dec.type = 0x9; + dec.dtype = 0; + dec.def = 0; + dec.present = 1; + dec.mbz = 0; + dec.page_granularity = 0; + gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec); + + return 1; +} + +void load_guest_gdt(struct lguest *lg, u32 table, u32 num) +{ + if (num > GDT_ENTRIES) + kill_guest(lg, "too many gdt entries %i", num); + + lhread(lg, lg->state->gdt_table, table, + num * sizeof(lg->state->gdt_table[0])); + if (!fixup_gdt_table(lg->state->gdt_table, num, + &lg->state->regs, &lg->state->tss)) + kill_guest(lg, "bad gdt table"); +} + +/* We don't care about limit here, since we only let them use these in + * usermode (where lack of USER bit in pagetable protects hypervisor mem). + * However, we want to ensure it doesn't fault when loaded, since *we* are + * the ones who will load it in switch_to_guest. + */ +void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls) +{ + unsigned int i; + struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN]; + + lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) { + struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]); + + if (!dec.present) + continue; + + /* We truncate to one byte/page (depending on G bit) to neuter + it, so ensure it's more than 1 page below trap page. */ + tls[i].a &= 0xFFFF0000; + lg->tls_limits[i] = dec.limit; + if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE) + kill_guest(lg, "bad TLS descriptor %i", i); + } + check_live_segments(lg->state->gdt_table, &lg->state->regs); +}