This adds a second entry point to head.S, which is jumped to when booted by Xen. This allows startup under Xen to be easily detected. Because Xen starts the kernel in a fairly sane state, very little setup is needed here; it just needs to jump into xen_start_kernel to init the paravirt_ops structure, and then jump into start_kernel proper. This also makes a few small adjustments to the gdt tables to make them properly suited to Xen. One warty thing in this patch is the requirement to hard-code the location of the Xen entrypoint and hypervisor page, rather than letting the assembler/linker choose an appropriate place. This is because these addresses must be converted into a string at compile time, so the address must be known at compile rather than link time. Subject: [patch 16/21] Xen-paravirt: Add outline of Xen paravirt interface code, plus boot-time init. Create a new arch/i386/xen/ directory for all the Xen-specific paravirt code; I'd expect there would be parallel paravirt-vmi, etc directories. This also contains an initial set of paravirt ops for Xen, mostly ones which can just be implemented with the generic native_* version. At boot time, the global paravirt_ops structure is populated with the Xen pointers in xen_start_kernel, which then jumps to the standard start_kernel. Hook Xen entrypoint into common paravirt entrypoint. Register Xen-specific architecture and memory setup functions. Signed-off-by: Jeremy Fitzhardinge --- arch/i386/Makefile | 3 arch/i386/kernel/cpu/common.c | 3 arch/i386/kernel/entry.S | 77 +++ arch/i386/kernel/head.S | 12 arch/i386/kernel/paravirt.c | 48 +- arch/i386/kernel/vmlinux.lds.S | 1 arch/i386/mm/pgtable.c | 1 arch/i386/xen/Makefile | 2 arch/i386/xen/enlighten.c | 807 +++++++++++++++++++++++++++++++++++++++ arch/i386/xen/events.c | 473 ++++++++++++++++++++++ arch/i386/xen/features.c | 29 + arch/i386/xen/mmu.c | 419 ++++++++++++++++++++ arch/i386/xen/mmu.h | 51 ++ arch/i386/xen/multicalls.c | 62 ++ arch/i386/xen/multicalls.h | 13 arch/i386/xen/setup.c | 95 ++++ arch/i386/xen/time.c | 452 +++++++++++++++++++++ arch/i386/xen/xen-head.S | 29 + arch/i386/xen/xen-ops.h | 20 include/asm-i386/hypercall.h | 21 - include/asm-i386/irq.h | 1 include/asm-i386/paravirt.h | 42 ++ include/asm-i386/pda.h | 11 include/xen/events.h | 28 + include/xen/features.h | 26 + include/xen/page.h | 175 ++++++++ 26 files changed, 2872 insertions(+), 29 deletions(-) =================================================================== --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mcore-$(CONFIG_X86_ES7000) := mach-defau mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default =================================================================== --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include #endif #include +#include #include "cpu.h" @@ -707,6 +708,8 @@ __cpuinit int init_gdt(int cpu, struct t pda->cpu_number = cpu; pda->pcurrent = idle; + paravirt_init_pda(pda, cpu); + return 1; } =================================================================== --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1001,6 +1001,83 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +/* Xen only supports sysenter/sysexit in ring0 guests, + and only if it the guest asks for it. So for now, + this should never be used. */ +ENTRY(xen_sti_sysexit) + CFI_STARTPROC + ud2 + CFI_ENDPROC + +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + CFI_ADJUST_CFA_OFFSET -16 + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET -16 + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" =================================================================== --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -519,6 +519,10 @@ 1: jmp 1b #endif +#ifdef CONFIG_XEN +#include "../xen/xen-head.S" +#endif + /* * Real beginning of normal "text" segment */ @@ -528,7 +532,7 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned" ENTRY(swapper_pg_dir) .fill 1024,4,0 ENTRY(empty_zero_page) @@ -598,7 +602,8 @@ ENTRY(boot_gdt_table) /* * The Global Descriptor Table contains 28 quadwords, per-CPU. */ - .align L1_CACHE_BYTES + .section ".data.page_aligned" + .align PAGE_SIZE_asm ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -647,3 +652,6 @@ ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* 0xf0 - unused */ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + /* Be sure this is zeroed to avoid false validations in Xen */ + .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0 + .previous =================================================================== --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -146,55 +146,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } -static fastcall void native_clts(void) +fastcall void native_clts(void) { asm volatile ("clts"); } -static fastcall unsigned long native_read_cr0(void) +fastcall unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr0(unsigned long val) +fastcall void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } -static fastcall unsigned long native_read_cr2(void) +fastcall unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr2(unsigned long val) +fastcall void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } -static fastcall unsigned long native_read_cr3(void) +fastcall unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr3(unsigned long val) +fastcall void native_write_cr3(unsigned long val) { asm volatile("movl %0,%%cr3": :"r" (val)); } -static fastcall unsigned long native_read_cr4(void) +fastcall unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); return val; } -static fastcall unsigned long native_read_cr4_safe(void) +fastcall unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -207,7 +207,7 @@ static fastcall unsigned long native_rea return val; } -static fastcall void native_write_cr4(unsigned long val) +fastcall void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } @@ -246,12 +246,12 @@ static fastcall void native_halt(void) asm volatile("hlt": : :"memory"); } -static fastcall void native_wbinvd(void) +fastcall void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +fastcall unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; @@ -270,7 +270,7 @@ static fastcall unsigned long long nativ return val; } -static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +fastcall int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -288,14 +288,14 @@ static fastcall int native_write_msr(uns return err; } -static fastcall unsigned long long native_read_tsc(void) +fastcall unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=A" (val)); return val; } -static fastcall unsigned long long native_read_pmc(void) +fastcall unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=A" (val)); @@ -317,17 +317,17 @@ static fastcall void native_load_idt(con asm volatile("lidt %0"::"m" (*dtr)); } -static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=m" (*dtr)); } -static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +fastcall void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=m" (*dtr)); } -static fastcall unsigned long native_store_tr(void) +fastcall unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=r" (tr)); @@ -336,9 +336,9 @@ static fastcall unsigned long native_sto static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) { -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] - C(0); C(1); C(2); -#undef C + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 0] = t->tls_array[0]; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 1] = t->tls_array[1]; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 2] = t->tls_array[2]; } static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) @@ -348,17 +348,17 @@ static inline void native_write_dt_entry lp[1] = entry_high; } -static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } =================================================================== --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -93,6 +93,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } =================================================================== --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -267,6 +267,7 @@ static void pgd_ctor(pgd_t *pgd) swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); } else { + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); =================================================================== --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,2 @@ +obj-y := enlighten.o setup.o events.o time.o \ + features.o mmu.o multicalls.o =================================================================== --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,807 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +extern struct Xgt_desc_struct cpu_gdt_descr; +extern struct i386_pda boot_pda; +extern unsigned long init_pg_tables_end; + +static DEFINE_PER_CPU(unsigned, lazy_mode); + +/* Code defined in entry.S (not a function) */ +extern const char xen_sti_sysexit[]; + +struct start_info *xen_start_info; + +static unsigned xen_patch(u8 type, u16 clobber, void *firstinsn, unsigned len) +{ + /* Xen will require relocations to patch calls and jmps, and + perhaps chunks of inline code */ + return len; +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_init_pda(struct i386_pda *pda, int cpu) +{ + /* Don't re-init boot CPU; we do it once very early in boot, + and then then cpu_init tries to do it again. If so, just + reuse the stuff we already set up. */ + if (cpu == 0 && pda != &boot_pda) { + BUG_ON(boot_pda.xen.vcpu == NULL); + pda->xen = boot_pda.xen; + return; + } + + pda->xen.vcpu = &HYPERVISOR_shared_info->vcpu_info[cpu]; + pda->xen.cr3 = 0; +} + +static fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + if (*eax == 1) + maskedx = ~(1 << X86_FEATURE_APIC); + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static fastcall void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static fastcall unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static fastcall unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + preempt_disable(); + vcpu = read_pda(xen.vcpu); + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + preempt_enable(); + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static fastcall void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + vcpu = read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask = flags; + if (flags == 0) { + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); + } else + preempt_enable_no_resched(); +} + +static fastcall void xen_irq_disable(void) +{ + struct vcpu_info *vcpu; + preempt_disable(); + vcpu = read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static fastcall void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + vcpu = read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask = 0; + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); +} + +static fastcall void xen_safe_halt(void) +{ + stop_hz_timer(); + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); + start_hz_timer(); +} + +static fastcall void xen_halt(void) +{ +#if 0 + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); +#endif +} + +static void xen_set_lazy_mode(int mode) +{ + unsigned *lazy = &get_cpu_var(lazy_mode); + + if (xen_mc_flush()) + BUG(); + + *lazy = mode; + + put_cpu_var(lazy_mode); +} + +static unsigned xen_get_lazy_mode(void) +{ + unsigned ret = get_cpu_var(lazy_mode); + put_cpu_var(lazy_mode); + + return ret; +} + +static fastcall void xen_load_tr_desc(void) +{ + /* do nothing */ +} + +static fastcall unsigned long xen_store_tr(void) +{ + return 0; +} + +static fastcall void xen_set_ldt(const void *addr, unsigned entries) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)addr; + if (addr) + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + op->arg1.linear_addr = arbitrary_virt_to_machine((unsigned long)addr).maddr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU) + xen_mc_flush(); +} + +static fastcall void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long va; + int f; + unsigned size = dtr->size + 1; + unsigned long frames[16]; + + BUG_ON(size > 16*PAGE_SIZE); + + for (va = dtr->address, f = 0; + va < dtr->address + size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + /* This is used very early, so we can't rely on per-cpu data + being set up, so no multicalls */ + if (HYPERVISOR_set_gdt(frames, size/8)) + BUG(); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + xmaddr_t maddr = virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static fastcall void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + unsigned long lp = (unsigned long)dt + entrynum * 8; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +#if 0 +static void unpack_desc(u32 low, u32 high, + unsigned long *base, unsigned long *limit, + unsigned char *type, unsigned char *flags) +{ + *base = (high & 0xff000000) | ((high << 16) & 0x00ff0000) | ((low >> 16) & 0xffff); + *limit = (high & 0x000f0000) | (low & 0xffff); + *type = (high >> 8) & 0xff; + *flags = (high >> 20) & 0xf; +} +#endif + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static fastcall void xen_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + + int cpu = smp_processor_id(); + unsigned long p = (unsigned long)dt + entrynum * 8; + unsigned long start = per_cpu(idt_desc, cpu).address; + unsigned long end = start + per_cpu(idt_desc, cpu).size + 1; + + xen_mc_flush(); + + native_write_idt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static fastcall void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + int cpu = smp_processor_id(); + unsigned in, out, count; + + per_cpu(idt_desc, cpu) = *desc; + + count = desc->size / 8; + BUG_ON(count > 256); + + spin_lock(&lock); + for(in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static fastcall void xen_write_gdt_entry(void *dt, int entry, u32 low, u32 high) +{ + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(virt_to_machine(dt + entry*8).maddr, + (u64)high << 32 | low)) + BUG(); + } +} + +static fastcall void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU) { + if (HYPERVISOR_stack_switch(__KERNEL_DS, thread->esp0)) + BUG(); + } else { + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + } +} + +static fastcall void xen_set_iopl_mask(unsigned mask) +{ +#if 0 + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +#endif +} + +static fastcall void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static fastcall void xen_apic_write(unsigned long reg, unsigned long v) +{ +} + +static fastcall void xen_apic_write_atomic(unsigned long reg, unsigned long v) +{ +} + +static fastcall unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +static fastcall void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_flush_tlb_global(void) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_flush_tlb_single(u32 addr) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall unsigned long xen_read_cr2(void) +{ + return read_pda(xen.vcpu)->arch.cr2; +} + +static fastcall void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +static fastcall unsigned long xen_read_cr3(void) +{ + return read_pda(xen.cr3); +} + +static fastcall void xen_write_cr3(unsigned long cr3) +{ + if (cr3 == read_pda(xen.cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + write_pda(xen.cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); + } +} + +static fastcall void xen_alloc_pt(u32 pfn) +{ + /* XXX pfn isn't necessarily a lowmem page */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_alloc_pd(u32 pfn) +{ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_release_pd(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + /* make sure next person to allocate this page gets a clean + pmd */ + clear_page(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_release_pt(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, + u32 start, u32 count) +{ + xen_alloc_pd(pfn); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + init_mm.pgd = base; + + /* copy top-level of Xen-supplied pagetable into place. For + !PAE we can use this as-is, but for PAE it is a stand-in + while we copy the pmd pages. */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + + /* For PAE, need to allocate new pmds, rather than + share Xen's, since Xen doesn't like pmd's being + shared between address spaces, even though in this + case they're effectively the same address space. */ + for(i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + xen_alloc_pd(PFN_DOWN(__pa(pmd))); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure the zero_page is mapped RO so we + can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + + /* Switch to new pagetable. This is done before + pagetable_init has done anything so that the new pages + added to the table can be prepared properly for Xen. */ + printk("about to switch to new pagetable %p...\n", base); + xen_write_cr3(__pa(base)); + printk("done\n"); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* init_mm has a new pagetable set up - make sure the GDT page + is still read-only in the new pagetable */ + xen_load_gdt(&cpu_gdt_descr); + + if (!xen_feature(XENFEAT_writable_page_tables)) { + /* Create a mapping for the shared info page. + Should be set_fixmap(), but shared_info is a machine + address with no corresponding pseudo-phys address. */ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT); + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + + xen_pgd_pin(base); + + write_pda(xen.vcpu, &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]); +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + .pgd_alignment = PAGE_SIZE, + + .name = "Xen", + .banner = xen_banner, + + .patch = xen_patch, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, + .time_init = xen_time_init, + .init_pda = xen_init_pda, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = native_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void (fastcall *)(void))&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = (void (fastcall *)(void))xen_sti_sysexit, + + .load_tr_desc = xen_load_tr_desc, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + .const_udelay = __const_udelay, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = xen_apic_write, + .apic_write_atomic = xen_apic_write_atomic, + .apic_read = xen_apic_read, + .setup_boot_clock = (void *)native_nop, + .setup_secondary_clock = (void *)native_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb_global, + .flush_tlb_single = xen_flush_tlb_single, + + .pte_update = (void *)native_nop, + .pte_update_defer = (void *)native_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .set_pte = xen_set_pte, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .alloc_pt = xen_alloc_pt, + .alloc_pd = xen_alloc_pd, + .alloc_pd_clone = xen_alloc_pd_clone, + .release_pd = xen_release_pd, + .release_pt = xen_release_pt, + + .pte_val = xen_pte_val, + .pmd_val = xen_pmd_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pmd = xen_make_pmd, + .make_pgd = xen_make_pgd, + + .ptep_get_and_clear = xen_ptep_get_and_clear, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* PAE */ + + .set_lazy_mode = xen_set_lazy_mode, + .startup_ipi_hook = (void *)native_nop, +}; + +/* First C function to be called on Xen boot */ +static asmlinkage void __init xen_start_kernel(void) +{ + u32 low, high; + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + + /* set up the boot-time gdt and segments */ + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + xen_load_gdt(&cpu_gdt_descr); + + /* set up PDA descriptor */ + pack_descriptor(&low, &high, (unsigned)&boot_pda, sizeof(boot_pda)-1, + 0x80 | DESCTYPE_S | 0x02, 0); + + /* Use hypercall directly, because xen_write_gdt_entry can't + * be used until batched multicalls work. */ + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt_table + + GDT_ENTRY_PDA).maddr, + (u64)high << 32 | low)) + BUG(); + + /* set up %fs and init Xen parts of the PDA */ + asm volatile("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); + xen_init_pda(&boot_pda, 0); + boot_pda.xen.cr3 = __pa(pgd); + + paravirt_ops.kernel_rpl = xen_feature(XENFEAT_supervisor_mode_kernel) ? 0 : 1; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + identify_cpu(&new_cpu_data); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} + +paravirt_probe(xen_start_kernel); =================================================================== --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,473 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +static u32 irq_info[NR_IRQS]; + +/* Binding types. */ +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN }; + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return ((type << 24) | (index << 16) | evtchn); +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return (u16)(irq_info[irq]); +} + +static inline unsigned int index_from_irq(int irq) +{ + return (u8)(irq_info[irq] >> 16); +} + +static inline unsigned int type_from_irq(int irq) +{ + return (u8)(irq_info[irq] >> 24); +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); + set_native_irq_info(irq, cpumask_of_cpu(cpu)); + + __clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ + int i; + + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + set_native_irq_info(i, cpumask_of_cpu(0)); + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} +EXPORT_SYMBOL_GPL(mask_evtchn); + +void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = smp_processor_id(); + struct vcpu_info *vcpu_info = read_pda(xen.vcpu); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + return; + } + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of 'hw_resend_irq'. Just + * like a real IO-APIC we 'lose the interrupt edge' if the channel is + * masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; +} +EXPORT_SYMBOL_GPL(unmask_evtchn); + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +static int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/* + Search the CPUs pending events bitmasks. For each one found, map + the event number to an irq, and feed it into do_IRQ() for + handling. + + Xen uses a two-level bitmap to speed searching. The first level is + a bitset of words which contain pending event bits. The second + level is a bitset of pending events themselves. +*/ +asmlinkage fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = read_pda(xen.vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. + */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-virq", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} =================================================================== --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include +#include +#include +#include +#include + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j=0; j<32; j++) + xen_features[i*32+j] = !!(fi.submap & 1< +#include + +#include +#include +#include + +#include +#include + +#include +#include + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if (xen_feature(XENFEAT_writable_page_tables)) + *pte = ptev; + else + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if (xen_feature(XENFEAT_writable_page_tables)) + *pte = ptev; + else + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +fastcall void xen_set_pte(pte_t *ptep, pte_t pte) +{ +#if 1 + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr; + u.val = pte_val_ma(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +#else + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +#endif +} + +fastcall void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +#ifdef CONFIG_X86_PAE +fastcall void xen_set_pud(pmd_t *ptr, pud_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} +#endif + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0) + xen_set_pte(ptep, pteval); +} + +void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ +} + +void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ +} + +#ifdef CONFIG_X86_PAE +void fastcall xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void fastcall xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep) +{ +#if 1 + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +#else + set_64bit((u64 *)ptep, 0); +#endif +} + +void fastcall xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +fastcall unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +fastcall unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +fastcall pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +fastcall pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res; + + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + + return res; +} +#else /* !PAE */ +fastcall unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +fastcall unsigned long xen_pmd_val(pmd_t pmd) +{ + BUG(); + return 0; +} + +fastcall unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +fastcall pmd_t xen_make_pmd(unsigned long pmd) +{ + BUG(); + return __pmd(0); +} + +fastcall pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep) +{ + return __pte_ma(xchg(&(ptep)->pte_low, 0)); +} +#endif /* CONFIG_X86_PAE */ + + + +static void pgd_walk_set_prot(void *pt, pgprot_t flags) +{ + unsigned long pfn = PFN_DOWN(__pa(pt)); + + if (HYPERVISOR_update_va_mapping((unsigned long)pt, + pfn_pte(pfn, flags), 0) < 0) + BUG(); +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g, u, m; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + pgd_walk_set_prot(pud,flags); + + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + pgd_walk_set_prot(pmd,flags); + + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + + /* This can get called before mem_map + is set up, so we assume nothing is + highmem at that point. */ + if (mem_map == NULL || + !PageHighMem(pmd_page(*pmd))) { + pte = pte_offset_kernel(pmd,0); + pgd_walk_set_prot(pte,flags); + } + } + } + } + + if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(PFN_DOWN(__pa(pgd_base)), + flags), + UVMF_TLB_FLUSH) < 0) + BUG(); +} + + +/* This is called just after a mm has been duplicated from its parent, + but it has not been used yet. We need to make sure that its + pagetable is all read-only, and can be pinned. The pagetable itself + needs to map itself as RO; it doesn't matter what the state its in + with respect to any other pagetable. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct mmuext_op op; + + pgd_walk(pgd, PAGE_KERNEL_RO); + +#if defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +/* Release a pagetables pages back as normal RW */ +void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); + + pgd_walk(pgd, PAGE_KERNEL); +} + + +fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + xen_pgd_pin(next->pgd); +} + +fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + xen_pgd_pin(mm->pgd); +} + +fastcall void xen_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + xen_pgd_unpin(mm->pgd); +} =================================================================== --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,51 @@ +#ifndef _XEN_MMU_H + +#include +#include + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void fastcall xen_set_pte(pte_t *ptep, pte_t pteval); +void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval); +void fastcall xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); +void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep); +void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep); + +fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +fastcall void xen_exit_mmap(struct mm_struct *mm); + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep); + +void xen_pgd_pin(pgd_t *pgd); +void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +fastcall unsigned long long xen_pte_val(pte_t); +fastcall unsigned long long xen_pmd_val(pmd_t); +fastcall unsigned long long xen_pgd_val(pgd_t); + +fastcall pte_t xen_make_pte(unsigned long long); +fastcall pmd_t xen_make_pmd(unsigned long long); +fastcall pgd_t xen_make_pgd(unsigned long long); + +fastcall void xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval); +fastcall void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +fastcall void xen_set_pud(pud_t *ptr, pud_t val); +fastcall void xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep); +fastcall void xen_pmd_clear(pmd_t *pmdp); + + +#else +fastcall unsigned long xen_pte_val(pte_t); +fastcall unsigned long xen_pmd_val(pmd_t); +fastcall unsigned long xen_pgd_val(pgd_t); + +fastcall pte_t xen_make_pte(unsigned long); +fastcall pmd_t xen_make_pmd(unsigned long); +fastcall pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ =================================================================== --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,62 @@ +#include + +#include + +#include "multicalls.h" + +#define MC_BATCH 8 +#define MC_ARGS (MC_BATCH * 32 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); + +int xen_mc_flush(void) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + int ret = 0; + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for(i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + put_cpu_var(mc_buffer); + + return ret; +} + +struct multicall_space xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + if (xen_mc_flush()) + BUG(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + put_cpu_var(mc_buffer); + + return ret; +} =================================================================== --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,13 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +struct multicall_space xen_mc_entry(size_t args); +int xen_mc_flush(void); + +#endif /* _XEN_MULTICALLS_H */ =================================================================== --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,95 @@ +/* + * Machine specific setup for xen + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +static __initdata struct shared_info init_shared; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = &init_shared; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +unsigned long *phys_to_machine_mapping; +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16]; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +void __init xen_arch_setup(void) +{ + struct physdevop_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(saved_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; + + vdso_enabled = 1; /* enable by default */ +} =================================================================== --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,452 @@ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */ +static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */ +static int __init __permitted_clock_jitter(char *str) +{ + permitted_clock_jitter = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("permitted_clock_jitter=", __permitted_clock_jitter); + + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +/* Keep track of last time we did processing/updating of jiffies and xtime. */ +static u64 processed_system_time; /* System time (ns) at last processing. */ +static DEFINE_PER_CPU(u64, processed_system_time); + +/* How much CPU time was spent blocked and how much was 'stolen'? */ +static DEFINE_PER_CPU(u64, processed_stolen_time); +static DEFINE_PER_CPU(u64, processed_blocked_time); + +/* Current runstate of each CPU (updated automatically by the hypervisor). */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* Must be signed, as it's compared with s64 quantities which can be -ve. */ +#define NS_PER_TICK (1000000000LL/HZ) + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static void get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &read_pda(xen.vcpu)->time; + dst = &get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } while ((src->version & 1) | (dst->version ^ src->version)); + + put_cpu_var(shadow_time); +} + +static inline int time_values_up_to_date(void) +{ + struct vcpu_time_info *src; + unsigned dstversion; + + src = &read_pda(xen.vcpu)->time; + dstversion = get_cpu_var(shadow_time).version; + put_cpu_var(shadow_time); + + rmb(); + return (dstversion == src->version); +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + + +static void xen_timer_interrupt_hook(void) +{ + s64 delta, delta_cpu, stolen, blocked; + u64 sched_time; + int i, cpu = smp_processor_id(); + unsigned long ticks; + struct shadow_time_info *shadow = &__get_cpu_var(shadow_time); + struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate); + + do { + get_time_values_from_xen(); + + /* Obtain a consistent snapshot of elapsed wallclock cycles. */ + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); + if (0) + printk("tsc_timestamp=%llu system_timestamp=%llu tsc_to_nsec=%u tsc_shift=%d, version=%u, delta=%lld processed_system_time=%lld\n", + shadow->tsc_timestamp, shadow->system_timestamp, + shadow->tsc_to_nsec_mul, shadow->tsc_shift, + shadow->version, delta, processed_system_time); + + delta -= processed_system_time; + delta_cpu -= __get_cpu_var(processed_system_time); + + /* + * Obtain a consistent snapshot of stolen/blocked cycles. We + * can use state_entry_time to detect if we get preempted here. + */ + do { + sched_time = runstate->state_entry_time; + barrier(); + stolen = runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline] - + __get_cpu_var(processed_stolen_time); + blocked = runstate->time[RUNSTATE_blocked] - + __get_cpu_var(processed_blocked_time); + barrier(); + } while (sched_time != runstate->state_entry_time); + } while (!time_values_up_to_date()); + + if ((unlikely(delta < -(s64)permitted_clock_jitter) || + unlikely(delta_cpu < -(s64)permitted_clock_jitter)) + && printk_ratelimit()) { + printk("Timer ISR/%d: Time went backwards: " + "delta=%lld delta_cpu=%lld shadow=%lld " + "off=%lld processed=%lld cpu_processed=%lld\n", + cpu, delta, delta_cpu, shadow->system_timestamp, + (s64)get_nsec_offset(shadow), + processed_system_time, + __get_cpu_var(processed_system_time)); + for (i = 0; i < num_online_cpus(); i++) + printk(" %d: %lld\n", i, + per_cpu(processed_system_time, i)); + } + + /* System-wide jiffy work. */ + ticks = 0; + while(delta > NS_PER_TICK) { + delta -= NS_PER_TICK; + processed_system_time += NS_PER_TICK; + ticks++; + } + do_timer(ticks); + + /* + * Account stolen ticks. + * HACK: Passing NULL to account_steal_time() + * ensures that the ticks are accounted as stolen. + */ + if ((stolen > 0) && (delta_cpu > 0)) { + delta_cpu -= stolen; + if (unlikely(delta_cpu < 0)) + stolen += delta_cpu; /* clamp local-time progress */ + do_div(stolen, NS_PER_TICK); + __get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK; + __get_cpu_var(processed_system_time) += stolen * NS_PER_TICK; + account_steal_time(NULL, (cputime_t)stolen); + } + + /* + * Account blocked ticks. + * HACK: Passing idle_task to account_steal_time() + * ensures that the ticks are accounted as idle/wait. + */ + if ((blocked > 0) && (delta_cpu > 0)) { + delta_cpu -= blocked; + if (unlikely(delta_cpu < 0)) + blocked += delta_cpu; /* clamp local-time progress */ + do_div(blocked, NS_PER_TICK); + __get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK; + __get_cpu_var(processed_system_time) += blocked * NS_PER_TICK; + account_steal_time(idle_task(cpu), (cputime_t)blocked); + } + + update_process_times(user_mode_vm(get_irq_regs())); +} + +static cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + + get_time_values_from_xen(); + + ret = shadow->system_timestamp + get_nsec_offset(shadow); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static void init_cpu_khz(void) +{ + u64 __cpu_khz = 1000000ULL << 32; + struct vcpu_time_info *info; + info = &HYPERVISOR_shared_info->vcpu_info[0].time; + do_div(__cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; +} + +static struct clocksource xen_clocksource = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<time[RUNSTATE_blocked]; + per_cpu(processed_stolen_time, cpu) = + runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline]; +} + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. -arca + */ + write_seqlock(&xtime_lock); + + xen_timer_interrupt_hook(); + + write_sequnlock(&xtime_lock); + + return IRQ_HANDLED; +} + +static void setup_cpu0_timer_irq(void) +{ + printk(KERN_DEBUG "installing Xen timer for CPU 0\n"); + + bind_virq_to_irqhandler( + VIRQ_TIMER, + 0, + xen_timer_interrupt, + SA_INTERRUPT, + "timer0", + NULL); +} + +static __init void xen_late_time_init(void) +{ + setup_cpu0_timer_irq(); +} + +extern void (*late_time_init)(void); +__init void xen_time_init(void) +{ + late_time_init = xen_late_time_init; + + get_time_values_from_xen(); + + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; + + init_cpu_khz(); + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + init_missing_ticks_accounting(0); + + clocksource_register(&xen_clocksource); + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; +} + +/* Convert jiffies to system time. */ +static u64 jiffies_to_st(unsigned long j) +{ + unsigned long seq; + long delta; + u64 st; + + do { + seq = read_seqbegin(&xtime_lock); + delta = j - jiffies; + if (delta < 1) { + /* Triggers in some wrap-around cases, but that's okay: + * we just end up with a shorter timeout. */ + st = processed_system_time + NS_PER_TICK; + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) { + /* Very long timeout means there is no pending timer. + * We indicate this to Xen by passing zero timeout. */ + st = 0; + } else { + st = processed_system_time + delta * (u64)NS_PER_TICK; + } + } while (read_seqretry(&xtime_lock, seq)); + + return st; +} + +/* + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu + * These functions are based on implementations from arch/s390/kernel/time.c + */ +void stop_hz_timer(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned long j; + + cpu_set(cpu, nohz_cpu_mask); + + /* + * See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs + * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a + * value of rcp->cur that matches rdp->quiescbatch and allows us to + * stop the hz timer then the cpumasks created for subsequent values + * of cur in rcu_start_batch are guaranteed to pick up the updated + * nohz_cpu_mask and so will not depend on this cpu. + */ + + smp_mb(); + + /* Leave ourselves in tick mode if rcu or softirq or timer pending. */ + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (j = next_timer_interrupt(), time_before_eq(j, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + j = jiffies + 1; + } + + if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0) + BUG(); +} + +void start_hz_timer(void) +{ + cpu_clear(smp_processor_id(), nohz_cpu_mask); +} + =================================================================== --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,29 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#include +#include +#include + +ENTRY(startup_xen) + movl %esi,xen_start_info + jmp startup_paravirt + +.pushsection ".bss.page_aligned" +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") =================================================================== --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,20 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); + +void stop_hz_timer(void); +void start_hz_timer(void); + +#endif /* XEN_OPS_H */ =================================================================== --- a/include/asm-i386/hypercall.h +++ b/include/asm-i386/hypercall.h @@ -39,9 +39,6 @@ #include #include #include - -#define __STR(x) #x -#define STR(x) __STR(x) extern struct { char _entry[32]; } hypercall_page[]; @@ -413,4 +410,22 @@ MULTI_mmuext_op(struct multicall_entry * mcl->args[2] = (unsigned long)success_count; mcl->args[3] = domid; } + +static inline void +MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) +{ + mcl->op = __HYPERVISOR_set_gdt; + mcl->args[0] = (unsigned long)frames; + mcl->args[1] = entries; +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, + unsigned long ss, unsigned long esp) +{ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = ss; + mcl->args[1] = esp; +} + #endif /* __HYPERCALL_H__ */ =================================================================== --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -43,6 +43,7 @@ extern void fixup_irqs(cpumask_t map); extern void fixup_irqs(cpumask_t map); #endif +fastcall unsigned int do_IRQ(struct pt_regs *regs); void init_IRQ(void); void __init native_init_IRQ(void); =================================================================== --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -31,6 +31,7 @@ struct Xgt_desc_struct; struct Xgt_desc_struct; struct tss_struct; struct mm_struct; +struct i386_pda; struct paravirt_ops { int paravirt_enabled; @@ -53,6 +54,7 @@ struct paravirt_ops void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); + void (*init_pda)(struct i386_pda *, int cpu); void (*pagetable_setup_start)(pgd_t *pgd_base); void (*pagetable_setup_done)(pgd_t *pgd_base); @@ -200,6 +202,30 @@ extern struct paravirt_ops paravirt_ops; void native_pagetable_setup_start(pgd_t *pgd); +/* Non-paravirtualized implementations of various operations for + back-ends which don't need their own version. */ +fastcall void native_clts(void); + +fastcall unsigned long native_read_cr0(void); +fastcall void native_write_cr0(unsigned long val); + +fastcall unsigned long native_read_cr2(void); +fastcall void native_write_cr2(unsigned long val); + +fastcall unsigned long native_read_cr3(void); +fastcall void native_write_cr3(unsigned long val); + +fastcall unsigned long native_read_cr4(void); +fastcall unsigned long native_read_cr4_safe(void); +fastcall void native_write_cr4(unsigned long val); + +fastcall void native_wbinvd(void); + +fastcall unsigned long long native_read_msr(unsigned int msr, int *err); +fastcall int native_write_msr(unsigned int msr, unsigned long long val); +fastcall unsigned long long native_read_tsc(void); +fastcall unsigned long long native_read_pmc(void); + #ifdef CONFIG_X86_PAE fastcall unsigned long long native_pte_val(pte_t); fastcall unsigned long long native_pmd_val(pmd_t); @@ -405,6 +431,19 @@ static inline void paravirt_exit_mmap(st { paravirt_ops.exit_mmap(mm); } + +static inline void paravirt_init_pda(struct i386_pda *pda, int cpu) +{ + if (paravirt_ops.init_pda) + (*paravirt_ops.init_pda)(pda, cpu); +} + +fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high); +fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high); +fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high); +fastcall void native_store_gdt(struct Xgt_desc_struct *dtr); +fastcall void native_store_idt(struct Xgt_desc_struct *dtr); +fastcall unsigned long native_store_tr(void); #define __flush_tlb() paravirt_ops.flush_tlb_user() #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() @@ -699,5 +738,8 @@ static inline void paravirt_exit_mmap(st { } +static inline void paravirt_init_pda(struct i386_pda *pda, int cpu) +{ +} #endif /* CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ =================================================================== --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -16,6 +16,17 @@ struct i386_pda int cpu_number; struct task_struct *pcurrent; /* current process */ struct pt_regs *irq_regs; + +#ifdef CONFIG_PARAVIRT + union { +#ifdef CONFIG_XEN + struct { + struct vcpu_info *vcpu; + unsigned long cr3; + } xen; +#endif /* CONFIG_XEN */ + }; +#endif /* CONFIG_PARAVIRT */ }; extern struct i386_pda *_cpu_pda[]; =================================================================== --- /dev/null +++ b/include/xen/events.h @@ -0,0 +1,28 @@ +#ifndef _XEN_EVENTS_H +#define _XEN_EVENTS_H + +#include + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, + void *dev_id); +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id); + +/* + * Common unbind function for all event sources. Takes IRQ to unbind from. + * Automatically closes the underlying event channel (even for bindings + * made with bind_evtchn_to_irqhandler()). + */ +void unbind_from_irqhandler(unsigned int irq, void *dev_id); + +static inline void notify_remote_via_evtchn(int port) +{ + struct evtchn_send send = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); +} + +extern void notify_remote_via_irq(int irq); +#endif /* _XEN_EVENTS_H */ =================================================================== --- /dev/null +++ b/include/xen/features.h @@ -0,0 +1,26 @@ +/****************************************************************************** + * features.h + * + * Query the features reported by Xen. + * + * Copyright (c) 2006, Ian Campbell + */ + +#ifndef __XEN_FEATURES_H__ +#define __XEN_FEATURES_H__ + +#include + +void xen_setup_features(void); + +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32]; + +static inline int xen_feature(int flag) +{ + switch(flag) { + } + + return xen_features[flag]; +} + +#endif /* __ASM_XEN_FEATURES_H__ */ =================================================================== --- /dev/null +++ b/include/xen/page.h @@ -0,0 +1,175 @@ +#ifndef __XEN_PAGE_H +#define __XEN_PAGE_H + +#include + +#include + +#include + +#ifdef CONFIG_X86_PAE +/* Xen machine address */ +typedef struct xmaddr { + unsigned long long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long long paddr; +} xpaddr_t; +#else +/* Xen machine address */ +typedef struct xmaddr { + unsigned long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long paddr; +} xpaddr_t; +#endif + +#define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) +#define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +extern unsigned long *phys_to_machine_mapping; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + + return phys_to_machine_mapping[(unsigned int)(pfn)] & + ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; +#endif + + pfn = 0; + /* + * The array access can fail (e.g., device space beyond end of RAM). + * In such cases it doesn't matter what we return (we return garbage), + * but we must handle the fault without crashing! + */ + __get_user(pfn, &machine_to_phys_mapping[mfn]); + + return pfn; +} + +static inline xmaddr_t phys_to_machine(xpaddr_t phys) +{ + unsigned offset = phys.paddr & ~PAGE_MASK; + return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); +} + +static inline xpaddr_t machine_to_phys(xmaddr_t machine) +{ + unsigned offset = machine.maddr & ~PAGE_MASK; + return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); + if ((pfn < max_mapnr) + && !xen_feature(XENFEAT_auto_translated_physmap) + && (phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) +#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#ifdef CONFIG_X86_PAE +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\ + (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT))) + +static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + pte_t pte; + + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | (pgprot_val(pgprot) >> 32); + pte.pte_high &= (__supported_pte_mask >> 32); + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); + pte.pte_low &= __supported_pte_mask; + + return pte; +} + +static inline unsigned long long pte_val_ma(pte_t x) +{ + return ((unsigned long long)x.pte_high << 32) | x.pte_low; +} +#define pmd_val_ma(v) ((v).pmd) +#define pud_val_ma(v) ((v).pgd.pgd) +#else /* !X86_PAE */ +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) +#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_val_ma(x) ((x).pte_low) +#define pmd_val_ma(v) ((v).pud.pgd.pgd) +#endif /* CONFIG_X86_PAE */ +#define pgd_val_ma(x) ((x).pgd) + +#define __pte_ma(x) ((pte_t) { (x) } ) + +xmaddr_t arbitrary_virt_to_machine(unsigned long address); +void make_lowmem_page_readonly(void *vaddr); +void make_lowmem_page_readwrite(void *vaddr); + +#endif /* __XEN_PAGE_H */ --