From mboxrd@z Thu Jan 1 00:00:00 1970
From: Jeremy Fitzhardinge
Subject: [patch 16/21] Xen-paravirt: Add code into head.S to handle being booted by Xen
Date: Tue, 13 Feb 2007 14:17:45 -0800
Message-ID: <20070213221830.707197267@goop.org>
References: <20070213221729.772002682@goop.org>
Mime-Version: 1.0
Content-Type: text/plain; charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: inline; filename=xen-core.patch
Sender: virtualization-bounces@lists.osdl.org
Errors-To: virtualization-bounces@lists.osdl.org
To: Andi Kleen
Cc: Andrew Morton, virtualization@lists.osdl.org, xen-devel@lists.xensource.com, Chris Wright, linux-kernel@vger.kernel.org
List-Id: virtualization@lists.linuxfoundation.org

This adds a second entry point to head.S, which is jumped to when booted
by Xen.  This allows startup under Xen to be easily detected.

Because Xen starts the kernel in a fairly sane state, very little setup
is needed here; it just needs to jump into xen_start_kernel to init the
paravirt_ops structure, and then jump into start_kernel proper.

This also makes a few small adjustments to the gdt tables to make them
properly suited to Xen.

One warty thing in this patch is the requirement to hard-code the
location of the Xen entrypoint and hypervisor page, rather than letting
the assembler/linker choose an appropriate place.  This is because these
addresses must be converted into a string at compile time, so the
address must be known at compile rather than link time.

Subject: [patch 16/21] Xen-paravirt: Add outline of Xen paravirt interface code, plus boot-time init.

Create a new arch/i386/xen/ directory for all the Xen-specific paravirt
code; I'd expect there would be parallel paravirt-vmi, etc directories.

This also contains an initial set of paravirt ops for Xen, mostly ones
which can just be implemented with the generic native_* versions.

At boot time, the global paravirt_ops structure is populated with the
Xen pointers in xen_start_kernel, which then jumps to the standard
start_kernel.

Hook the Xen entrypoint into the common paravirt entrypoint.

Register Xen-specific architecture and memory setup functions.
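As a side note on that last wart, here is a minimal sketch of why the
address has to be known at compile time (not taken from the patch; the
macro names, the key name and the address below are made up): the
address can only end up inside a guest-info string via preprocessor
stringification, and the preprocessor runs long before the linker has
assigned any addresses.

#include <stdio.h>

/* Stringify a compile-time constant so it can be pasted into a string
 * literal.  A linker-assigned symbol address cannot be used here,
 * because the preprocessor has no value for it yet. */
#define __STR(x)	#x
#define STR(x)		__STR(x)

/* Hypothetical fixed address for the hypercall page. */
#define XEN_HYPERCALL_PAGE	0xC0101000

/* Adjacent literals concatenate to "HYPERCALL_PAGE=0xC0101000". */
static const char xen_guest_note[] =
	"HYPERCALL_PAGE=" STR(XEN_HYPERCALL_PAGE);

int main(void)
{
	puts(xen_guest_note);
	return 0;
}
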
Signed-off-by: Jeremy Fitzhardinge --- arch/i386/Makefile | 3 = arch/i386/kernel/cpu/common.c | 3 = arch/i386/kernel/entry.S | 77 +++ arch/i386/kernel/head.S | 12 = arch/i386/kernel/paravirt.c | 48 +- arch/i386/kernel/vmlinux.lds.S | 1 = arch/i386/mm/pgtable.c | 1 = arch/i386/xen/Makefile | 2 = arch/i386/xen/enlighten.c | 807 ++++++++++++++++++++++++++++++++++++= +++ arch/i386/xen/events.c | 473 ++++++++++++++++++++++ arch/i386/xen/features.c | 29 + arch/i386/xen/mmu.c | 419 ++++++++++++++++++++ arch/i386/xen/mmu.h | 51 ++ arch/i386/xen/multicalls.c | 62 ++ arch/i386/xen/multicalls.h | 13 = arch/i386/xen/setup.c | 95 ++++ arch/i386/xen/time.c | 452 +++++++++++++++++++++ arch/i386/xen/xen-head.S | 29 + arch/i386/xen/xen-ops.h | 20 = include/asm-i386/hypercall.h | 21 - include/asm-i386/irq.h | 1 = include/asm-i386/paravirt.h | 42 ++ include/asm-i386/pda.h | 11 = include/xen/events.h | 28 + include/xen/features.h | 26 + include/xen/page.h | 175 ++++++++ 26 files changed, 2872 insertions(+), 29 deletions(-) =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mcore-$(CONFIG_X86_ES7000) :=3D mach-defau mcore-$(CONFIG_X86_ES7000) :=3D mach-default core-$(CONFIG_X86_ES7000) :=3D arch/i386/mach-es7000/ = +# Xen paravirtualization support +core-$(CONFIG_XEN) +=3D arch/i386/xen/ + # default subarch .h files mflags-y +=3D -Iinclude/asm-i386/mach-default = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include #endif #include +#include = #include "cpu.h" = @@ -707,6 +708,8 @@ __cpuinit int init_gdt(int cpu, struct t pda->cpu_number =3D cpu; pda->pcurrent =3D idle; = + paravirt_init_pda(pda, cpu); + return 1; } = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1001,6 +1001,83 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) = +#ifdef CONFIG_XEN +/* Xen only supports sysenter/sysexit in ring0 guests, + and only if it the guest asks for it. So for now, + this should never be used. */ +ENTRY(xen_sti_sysexit) + CFI_STARTPROC + ud2 + CFI_ENDPROC + = +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC + = +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hyperca= ll +# to pop the stack frame we end up in an infinite loop of failsafe callbac= ks. +# We distinguish between categories by maintaining a status value in EAX. 
+ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jz 5f + addl $16,%esp # EAX !=3D 0 =3D> Category 2 (Bad IRET) + CFI_ADJUST_CFA_OFFSET -16 + jmp iret_exc +5: addl $16,%esp # EAX =3D=3D 0 =3D> Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET -16 + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + = +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous + = +#endif /* CONFIG_XEN */ + = .section .rodata,"a" #include "syscall_table.S" = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -519,6 +519,10 @@ 1: jmp 1b #endif = +#ifdef CONFIG_XEN +#include "../xen/xen-head.S" +#endif + = /* * Real beginning of normal "text" segment */ @@ -528,7 +532,7 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned" ENTRY(swapper_pg_dir) .fill 1024,4,0 ENTRY(empty_zero_page) @@ -598,7 +602,8 @@ ENTRY(boot_gdt_table) /* * The Global Descriptor Table contains 28 quadwords, per-CPU. */ - .align L1_CACHE_BYTES + .section ".data.page_aligned" + .align PAGE_SIZE_asm ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -647,3 +652,6 @@ ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* 0xf0 - unused */ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ = + /* Be sure this is zeroed to avoid false validations in Xen */ + .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0 + .previous =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -146,55 +146,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } = -static fastcall void native_clts(void) +fastcall void native_clts(void) { asm volatile ("clts"); } = -static fastcall unsigned long native_read_cr0(void) +fastcall unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=3Dr" (val)); return val; } = -static fastcall void native_write_cr0(unsigned long val) +fastcall void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } = -static fastcall unsigned long native_read_cr2(void) +fastcall unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=3Dr" (val)); return val; } = -static fastcall void native_write_cr2(unsigned long val) +fastcall void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } = -static fastcall unsigned long native_read_cr3(void) +fastcall unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=3Dr" (val)); return val; } = -static fastcall void native_write_cr3(unsigned long val) +fastcall void native_write_cr3(unsigned long val) { asm 
volatile("movl %0,%%cr3": :"r" (val)); } = -static fastcall unsigned long native_read_cr4(void) +fastcall unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=3Dr" (val)); return val; } = -static fastcall unsigned long native_read_cr4_safe(void) +fastcall unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -207,7 +207,7 @@ static fastcall unsigned long native_rea return val; } = -static fastcall void native_write_cr4(unsigned long val) +fastcall void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } @@ -246,12 +246,12 @@ static fastcall void native_halt(void) asm volatile("hlt": : :"memory"); } = -static fastcall void native_wbinvd(void) +fastcall void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } = -static fastcall unsigned long long native_read_msr(unsigned int msr, int *= err) +fastcall unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; = @@ -270,7 +270,7 @@ static fastcall unsigned long long nativ return val; } = -static fastcall int native_write_msr(unsigned int msr, unsigned long long = val) +fastcall int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -288,14 +288,14 @@ static fastcall int native_write_msr(uns return err; } = -static fastcall unsigned long long native_read_tsc(void) +fastcall unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=3DA" (val)); return val; } = -static fastcall unsigned long long native_read_pmc(void) +fastcall unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=3DA" (val)); @@ -317,17 +317,17 @@ static fastcall void native_load_idt(con asm volatile("lidt %0"::"m" (*dtr)); } = -static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=3Dm" (*dtr)); } = -static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +fastcall void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=3Dm" (*dtr)); } = -static fastcall unsigned long native_store_tr(void) +fastcall unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=3Dr" (tr)); @@ -336,9 +336,9 @@ static fastcall unsigned long native_sto = static fastcall void native_load_tls(struct thread_struct *t, unsigned int= cpu) { -#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] =3D t->tls_arra= y[i] - C(0); C(1); C(2); -#undef C + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 0] =3D t->tls_array[0]; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 1] =3D t->tls_array[1]; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 2] =3D t->tls_array[2]; } = static inline void native_write_dt_entry(void *dt, int entry, u32 entry_lo= w, u32 entry_high) @@ -348,17 +348,17 @@ static inline void native_write_dt_entry lp[1] =3D entry_high; } = -static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 lo= w, u32 high) +fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 = high) { native_write_dt_entry(dt, entrynum, low, high); } = -static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 lo= w, u32 high) +fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 = high) { native_write_dt_entry(dt, entrynum, low, high); } = -static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 lo= w, u32 high) +fastcall 
void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 = high) { native_write_dt_entry(dt, entrynum, low, high); } =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -93,6 +93,7 @@ SECTIONS = . =3D ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -267,6 +267,7 @@ static void pgd_ctor(pgd_t *pgd) swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); } else { + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,2 @@ +obj-y :=3D enlighten.o setup.o events.o time.o \ + features.o mmu.o multicalls.o =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,807 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +extern struct Xgt_desc_struct cpu_gdt_descr; +extern struct i386_pda boot_pda; +extern unsigned long init_pg_tables_end; + +static DEFINE_PER_CPU(unsigned, lazy_mode); + +/* Code defined in entry.S (not a function) */ +extern const char xen_sti_sysexit[]; + +struct start_info *xen_start_info; + +static unsigned xen_patch(u8 type, u16 clobber, void *firstinsn, unsigned = len) +{ + /* Xen will require relocations to patch calls and jmps, and + perhaps chunks of inline code */ + return len; +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_init_pda(struct i386_pda *pda, int cpu) +{ + /* Don't re-init boot CPU; we do it once very early in boot, + and then then cpu_init tries to do it again. If so, just + reuse the stuff we already set up. 
*/ + if (cpu =3D=3D 0 && pda !=3D &boot_pda) { + BUG_ON(boot_pda.xen.vcpu =3D=3D NULL); + pda->xen =3D boot_pda.xen; + return; + } + + pda->xen.vcpu =3D &HYPERVISOR_shared_info->vcpu_info[cpu]; + pda->xen.cr3 =3D 0; +} + +static fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx =3D ~0; + if (*eax =3D=3D 1) + maskedx =3D ~(1 << X86_FEATURE_APIC); + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=3Da" (*eax), + "=3Db" (*ebx), + "=3Dc" (*ecx), + "=3Dd" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &=3D maskedx; +} + +static fastcall void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static fastcall unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static fastcall unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + preempt_disable(); + vcpu =3D read_pda(xen.vcpu); + /* flag has opposite sense of mask */ + flags =3D !vcpu->evtchn_upcall_mask; + preempt_enable(); + + /* convert to IF type flag = + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static fastcall void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + + /* convert from IF type flag */ + flags =3D !(flags & X86_EFLAGS_IF); + vcpu =3D read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask =3D flags; + if (flags =3D=3D 0) { + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); + } else + preempt_enable_no_resched(); +} + +static fastcall void xen_irq_disable(void) +{ + struct vcpu_info *vcpu; + preempt_disable(); + vcpu =3D read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask =3D 1; + preempt_enable_no_resched(); +} + +static fastcall void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + vcpu =3D read_pda(xen.vcpu); + vcpu->evtchn_upcall_mask =3D 0; + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); +} + +static fastcall void xen_safe_halt(void) +{ + stop_hz_timer(); + /* Blocking includes an implicit local_irq_enable(). 
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) !=3D 0) + BUG(); + start_hz_timer(); +} + +static fastcall void xen_halt(void) +{ +#if 0 + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); +#endif +} + +static void xen_set_lazy_mode(int mode) +{ + unsigned *lazy =3D &get_cpu_var(lazy_mode); + + if (xen_mc_flush()) + BUG(); + + *lazy =3D mode; + + put_cpu_var(lazy_mode); +} + +static unsigned xen_get_lazy_mode(void) +{ + unsigned ret =3D get_cpu_var(lazy_mode); + put_cpu_var(lazy_mode); + + return ret; +} + +static fastcall void xen_load_tr_desc(void) +{ + /* do nothing */ +} + +static fastcall unsigned long xen_store_tr(void) +{ + return 0; +} + +static fastcall void xen_set_ldt(const void *addr, unsigned entries) +{ + struct mmuext_op *op; + struct multicall_space mcs =3D xen_mc_entry(sizeof(*op)); + + op =3D mcs.args; + op->cmd =3D MMUEXT_SET_LDT; + op->arg1.linear_addr =3D (unsigned long)addr; + if (addr) + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + op->arg1.linear_addr =3D arbitrary_virt_to_machine((unsigned long)addr).= maddr; + op->arg2.nr_ents =3D entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU) + xen_mc_flush(); +} + +static fastcall void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long va; + int f; + unsigned size =3D dtr->size + 1; + unsigned long frames[16]; + + BUG_ON(size > 16*PAGE_SIZE); + + for (va =3D dtr->address, f =3D 0; + va < dtr->address + size; + va +=3D PAGE_SIZE, f++) { + frames[f] =3D virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + /* This is used very early, so we can't rely on per-cpu data + being set up, so no multicalls */ + if (HYPERVISOR_set_gdt(frames, size/8)) + BUG(); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + xmaddr_t maddr =3D virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_= MIN+i]); + struct multicall_space mc =3D xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static fastcall void xen_load_tls(struct thread_struct *t, unsigned int cp= u) +{ + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_write_ldt_entry(void *dt, int entrynum, u32 low, = u32 high) +{ + unsigned long lp =3D (unsigned long)dt + entrynum * 8; + xmaddr_t mach_lp =3D virt_to_machine(lp); + u64 entry =3D (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, struct trap_inf= o *info) +{ + u8 type, dpl; + + type =3D (high >> 8) & 0x1f; + dpl =3D (high >> 13) & 3; + + if (type !=3D 0xf && type !=3D 0xe) + return 0; + + info->vector =3D vector; + info->address =3D (high & 0xffff0000) | (low & 0x0000ffff); + info->cs =3D low >> 16; + info->flags =3D dpl; + /* interrupt gates clear IF */ + if (type =3D=3D 0xe) + info->flags |=3D 4; + + return 1; +} + +#if 0 +static void unpack_desc(u32 low, u32 high, + unsigned long *base, unsigned long *limit, + unsigned char *type, unsigned char *flags) +{ + *base =3D (high & 0xff000000) | ((high << 16) & 0x00ff0000) | ((low >> 16= ) & 0xffff); + *limit =3D (high & 0x000f0000) | (low & 0xffff); + *type =3D (high >> 8) & 0xff; + *flags =3D (high >> 20) & 0xf; +} +#endif + +/* Locations of each CPU's IDT */ +static 
DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static fastcall void xen_write_idt_entry(void *dt, int entrynum, u32 low, = u32 high) +{ + + int cpu =3D smp_processor_id(); + unsigned long p =3D (unsigned long)dt + entrynum * 8; + unsigned long start =3D per_cpu(idt_desc, cpu).address; + unsigned long end =3D start + per_cpu(idt_desc, cpu).size + 1; + + xen_mc_flush(); + + native_write_idt_entry(dt, entrynum, low, high); + + if (p >=3D start && (p + 8) <=3D end) { + struct trap_info info[2]; + + info[1].address =3D 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static fastcall void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + int cpu =3D smp_processor_id(); + unsigned in, out, count; + + per_cpu(idt_desc, cpu) =3D *desc; + = + count =3D desc->size / 8; + BUG_ON(count > 256); + + spin_lock(&lock); + for(in =3D out =3D 0; in < count; in++) { + const u32 *entry =3D (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address =3D 0; + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static fastcall void xen_write_gdt_entry(void *dt, int entry, u32 low, u32= high) +{ + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(virt_to_machine(dt + entry*8).maddr, + (u64)high << 32 | low)) + BUG(); + } +} + +static fastcall void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU) { + if (HYPERVISOR_stack_switch(__KERNEL_DS, thread->esp0)) + BUG(); + } else { + struct multicall_space mcs =3D xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + } +} + +static fastcall void xen_set_iopl_mask(unsigned mask) +{ +#if 0 + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl =3D (mask =3D=3D 0) ? 
1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +#endif +} + +static fastcall void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static fastcall void xen_apic_write(unsigned long reg, unsigned long v) +{ +} + +static fastcall void xen_apic_write_atomic(unsigned long reg, unsigned lon= g v) +{ +} + +static fastcall unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +static fastcall void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs =3D xen_mc_entry(sizeof(*op)); + + op =3D mcs.args; + op->cmd =3D MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_flush_tlb_global(void) +{ + struct mmuext_op *op; + struct multicall_space mcs =3D xen_mc_entry(sizeof(*op)); + + op =3D mcs.args; + op->cmd =3D MMUEXT_TLB_FLUSH_ALL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall void xen_flush_tlb_single(u32 addr) +{ + struct mmuext_op *op; + struct multicall_space mcs =3D xen_mc_entry(sizeof(*op)); + + op =3D mcs.args; + op->cmd =3D MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr =3D addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); +} + +static fastcall unsigned long xen_read_cr2(void) +{ + return read_pda(xen.vcpu)->arch.cr2; +} + +static fastcall void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. 
+ */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >>= 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) <<= 20)) + +static fastcall unsigned long xen_read_cr3(void) +{ + return read_pda(xen.cr3); +} + +static fastcall void xen_write_cr3(unsigned long cr3) +{ + if (cr3 =3D=3D read_pda(xen.cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + write_pda(xen.cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs =3D xen_mc_entry(sizeof(*op)); + unsigned long mfn =3D pfn_to_mfn(PFN_DOWN(cr3)); + + op =3D mcs.args; + op->cmd =3D MMUEXT_NEW_BASEPTR; + op->arg1.mfn =3D mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (xen_get_lazy_mode() !=3D PARAVIRT_LAZY_CPU && xen_mc_flush()) + BUG(); + } +} + +static fastcall void xen_alloc_pt(u32 pfn) +{ + /* XXX pfn isn't necessarily a lowmem page */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_alloc_pd(u32 pfn) +{ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_release_pd(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + /* make sure next person to allocate this page gets a clean + pmd */ + clear_page(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_release_pt(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static fastcall void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, + u32 start, u32 count) +{ + xen_alloc_pd(pfn); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd =3D (pgd_t *)xen_start_info->pt_base; + + init_mm.pgd =3D base; + + /* copy top-level of Xen-supplied pagetable into place. For + !PAE we can use this as-is, but for PAE it is a stand-in + while we copy the pmd pages. */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + + /* For PAE, need to allocate new pmds, rather than + share Xen's, since Xen doesn't like pmd's being + shared between address spaces, even though in this + case they're effectively the same address space. */ + for(i =3D 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd =3D (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + xen_alloc_pd(PFN_DOWN(__pa(pmd))); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure the zero_page is mapped RO so we + can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + + /* Switch to new pagetable. This is done before + pagetable_init has done anything so that the new pages + added to the table can be prepared properly for Xen. */ + printk("about to switch to new pagetable %p...\n", base); + xen_write_cr3(__pa(base)); + printk("done\n"); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* init_mm has a new pagetable set up - make sure the GDT page + is still read-only in the new pagetable */ + xen_load_gdt(&cpu_gdt_descr); + + if (!xen_feature(XENFEAT_writable_page_tables)) { + /* Create a mapping for the shared info page. + Should be set_fixmap(), but shared_info is a machine + address with no corresponding pseudo-phys address. 
*/ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + = + HYPERVISOR_shared_info =3D + (struct shared_info *)fix_to_virt(FIX_PARAVIRT); + } else + HYPERVISOR_shared_info =3D + (struct shared_info *)__va(xen_start_info->shared_info); + + xen_pgd_pin(base); + + write_pda(xen.vcpu, &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()= ]); +} + +static const struct paravirt_ops xen_paravirt_ops __initdata =3D { + .paravirt_enabled =3D 1, + .shared_kernel_pmd =3D 0, + .pgd_alignment =3D PAGE_SIZE, + + .name =3D "Xen", + .banner =3D xen_banner, + + .patch =3D xen_patch, + + .memory_setup =3D xen_memory_setup, + .arch_setup =3D xen_arch_setup, + .init_IRQ =3D xen_init_IRQ, + .time_init =3D xen_time_init, + .init_pda =3D xen_init_pda, + + .cpuid =3D xen_cpuid, + + .set_debugreg =3D xen_set_debugreg, + .get_debugreg =3D xen_get_debugreg, + + .clts =3D native_clts, + + .read_cr0 =3D native_read_cr0, + .write_cr0 =3D native_write_cr0, + + .read_cr2 =3D xen_read_cr2, + .write_cr2 =3D native_write_cr2, + + .read_cr3 =3D xen_read_cr3, + .write_cr3 =3D xen_write_cr3, + + .read_cr4 =3D native_read_cr4, + .read_cr4_safe =3D native_read_cr4_safe, + .write_cr4 =3D xen_write_cr4, + + .save_fl =3D xen_save_fl, + .restore_fl =3D xen_restore_fl, + .irq_disable =3D xen_irq_disable, + .irq_enable =3D xen_irq_enable, + .safe_halt =3D xen_safe_halt, + .halt =3D xen_halt, + .wbinvd =3D native_wbinvd, + + .read_msr =3D native_read_msr, + .write_msr =3D native_write_msr, + .read_tsc =3D native_read_tsc, + .read_pmc =3D native_read_pmc, + + .iret =3D (void (fastcall *)(void))&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit =3D (void (fastcall *)(void))xen_sti_sysexit, + + .load_tr_desc =3D xen_load_tr_desc, + .set_ldt =3D xen_set_ldt, + .load_gdt =3D xen_load_gdt, + .load_idt =3D xen_load_idt, + .load_tls =3D xen_load_tls, + + .store_gdt =3D native_store_gdt, + .store_idt =3D native_store_idt, + .store_tr =3D xen_store_tr, + + .write_ldt_entry =3D xen_write_ldt_entry, + .write_gdt_entry =3D xen_write_gdt_entry, + .write_idt_entry =3D xen_write_idt_entry, + .load_esp0 =3D xen_load_esp0, + + .set_iopl_mask =3D xen_set_iopl_mask, + .io_delay =3D xen_io_delay, + .const_udelay =3D __const_udelay, + .set_wallclock =3D xen_set_wallclock, + .get_wallclock =3D xen_get_wallclock, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write =3D xen_apic_write, + .apic_write_atomic =3D xen_apic_write_atomic, + .apic_read =3D xen_apic_read, + .setup_boot_clock =3D (void *)native_nop, + .setup_secondary_clock =3D (void *)native_nop, +#endif + + .flush_tlb_user =3D xen_flush_tlb, + .flush_tlb_kernel =3D xen_flush_tlb_global, + .flush_tlb_single =3D xen_flush_tlb_single, + + .pte_update =3D (void *)native_nop, + .pte_update_defer =3D (void *)native_nop, + + .pagetable_setup_start =3D xen_pagetable_setup_start, + .pagetable_setup_done =3D xen_pagetable_setup_done, + .activate_mm =3D xen_activate_mm, + .dup_mmap =3D xen_dup_mmap, + .exit_mmap =3D xen_exit_mmap, + + .set_pte =3D xen_set_pte, + .set_pte_at =3D xen_set_pte_at, + .set_pmd =3D xen_set_pmd, + + .alloc_pt =3D xen_alloc_pt, + .alloc_pd =3D xen_alloc_pd, + .alloc_pd_clone =3D xen_alloc_pd_clone, + .release_pd =3D xen_release_pd, + .release_pt =3D xen_release_pt, + + .pte_val =3D xen_pte_val, + .pmd_val =3D xen_pmd_val, + .pgd_val =3D xen_pgd_val, + + .make_pte =3D xen_make_pte, + .make_pmd =3D xen_make_pmd, + .make_pgd =3D xen_make_pgd, + + .ptep_get_and_clear =3D xen_ptep_get_and_clear, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic =3D 
xen_set_pte, + .set_pte_present =3D xen_set_pte_at, + .set_pud =3D xen_set_pud, + .pte_clear =3D xen_pte_clear, + .pmd_clear =3D xen_pmd_clear, +#endif /* PAE */ + + .set_lazy_mode =3D xen_set_lazy_mode, + .startup_ipi_hook =3D (void *)native_nop, +}; + +/* First C function to be called on Xen boot */ +static asmlinkage void __init xen_start_kernel(void) +{ + u32 low, high; + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) !=3D 0); + + /* Install Xen paravirt ops */ + paravirt_ops =3D xen_paravirt_ops; + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping =3D (unsigned long *)xen_start_info->mfn_list; + + pgd =3D (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end =3D __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + + /* set up the boot-time gdt and segments */ + init_mm.pgd =3D pgd; /* use the Xen pagetables to start */ + + xen_load_gdt(&cpu_gdt_descr); + + /* set up PDA descriptor */ + pack_descriptor(&low, &high, (unsigned)&boot_pda, sizeof(boot_pda)-1, + 0x80 | DESCTYPE_S | 0x02, 0); + + /* Use hypercall directly, because xen_write_gdt_entry can't + * be used until batched multicalls work. */ + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt_table + + GDT_ENTRY_PDA).maddr, + (u64)high << 32 | low)) + BUG(); + + /* set up %fs and init Xen parts of the PDA */ + asm volatile("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); + xen_init_pda(&boot_pda, 0); + boot_pda.xen.cr3 =3D __pa(pgd); + + paravirt_ops.kernel_rpl =3D xen_feature(XENFEAT_supervisor_mode_kernel) ?= 0 : 1; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math =3D 1; + identify_cpu(&new_cpu_data); + + /* Poke various useful things into boot_params */ + LOADER_TYPE =3D (9 << 4) | 0; + INITRD_START =3D xen_start_info->mod_start ? __pa(xen_start_info->mod_sta= rt) : 0; + INITRD_SIZE =3D xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} + +paravirt_probe(xen_start_kernel); =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,473 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping table= s. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) =3D {[0 ... NR_VIRQS-1] =3D -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel= . */ +static u32 irq_info[NR_IRQS]; + +/* Binding types. */ +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN }; + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] =3D { + [0 ... NR_EVENT_CHANNELS-1] =3D -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_L= ONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. 
*/ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) !=3D 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return ((type << 24) | (index << 16) | evtchn); +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return (u16)(irq_info[irq]); +} + +static inline unsigned int index_from_irq(int irq) +{ + return (u8)(irq_info[irq] >> 16); +} + +static inline unsigned int type_from_irq(int irq) +{ + return (u8)(irq_info[irq] >> 24); +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq =3D evtchn_to_irq[chn]; + + BUG_ON(irq =3D=3D -1); + set_native_irq_info(irq, cpumask_of_cpu(cpu)); + + __clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] =3D cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ + int i; + + /* By default all event channels notify CPU#0. */ + for (i =3D 0; i < NR_IRQS; i++) + set_native_irq_info(i, cpumask_of_cpu(0)); + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s =3D HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s =3D HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via i= rq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn =3D evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +void mask_evtchn(int port) +{ + struct shared_info *s =3D HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} +EXPORT_SYMBOL_GPL(mask_evtchn); + +void unmask_evtchn(int port) +{ + struct shared_info *s =3D HYPERVISOR_shared_info; + unsigned int cpu =3D smp_processor_id(); + struct vcpu_info *vcpu_info =3D read_pda(xen.vcpu); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu !=3D cpu_from_evtchn(port))) { + struct evtchn_unmask unmask =3D { .port =3D port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + return; + } + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of 'hw_resend_irq'. 
Just + * like a real IO-APIC we 'lose the interrupt edge' if the channel is + * masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending =3D 1; +} +EXPORT_SYMBOL_GPL(unmask_evtchn); + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq =3D 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] =3D=3D 0) + break; + + if (irq =3D=3D NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +static int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq =3D evtchn_to_irq[evtchn]; + + if (irq =3D=3D -1) { + irq =3D find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq); + + evtchn_to_irq[evtchn] =3D irq; + irq_info[irq] =3D mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq =3D per_cpu(virq_to_irq, cpu)[virq]; + + if (irq =3D=3D -1) { + bind_virq.virq =3D virq; + bind_virq.vcpu =3D cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) !=3D 0) + BUG(); + evtchn =3D bind_virq.port; + + irq =3D find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq); + + evtchn_to_irq[evtchn] =3D irq; + irq_info[irq] =3D mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] =3D irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn =3D evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] =3D=3D 0)) { + close.port =3D evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) !=3D 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] =3D -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. 
*/ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] =3D -1; + irq_info[irq] =3D IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq =3D bind_evtchn_to_irq(evtchn); + retval =3D request_irq(irq, handler, irqflags, devname, dev_id); + if (retval !=3D 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq =3D bind_virq_to_irq(virq, cpu); + retval =3D request_irq(irq, handler, irqflags, devname, dev_id); + if (retval !=3D 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/* + Search the CPUs pending events bitmasks. For each one found, map + the event number to an irq, and feed it into do_IRQ() for + handling. + + Xen uses a two-level bitmap to speed searching. The first level is + a bitset of words which contain pending event bits. The second + level is a bitset of pending events themselves. +*/ +asmlinkage fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu =3D smp_processor_id(); + struct shared_info *s =3D HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info =3D read_pda(xen.vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending =3D 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words =3D xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words !=3D 0) { + unsigned long pending_bits; + int word_idx =3D __ffs(pending_words); + pending_words &=3D ~(1UL << word_idx); + + while ((pending_bits =3D active_evtchns(cpu, s, word_idx)) !=3D 0) { + int bit_idx =3D __ffs(pending_bits); + int port =3D (word_idx * BITS_PER_LONG) + bit_idx; + int irq =3D evtchn_to_irq[port]; + + if (irq !=3D -1) { + regs->orig_eax =3D ~irq; + do_IRQ(regs); + } + } + } +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn =3D evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port =3D evtchn; + bind_vcpu.vcpu =3D tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a = + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. 
+ */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >=3D 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu =3D first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn =3D evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn =3D evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn =3D evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn =3D evtchn_from_irq(irq); + int ret =3D 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret =3D 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly =3D { + .name =3D "xen-virq", + .mask =3D disable_dynirq, + .unmask =3D enable_dynirq, + .ack =3D ack_dynirq, + .set_affinity =3D set_affinity_irq, + .retrigger =3D retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i =3D 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i =3D 0; i < NR_IRQS; i++) + irq_bindcount[i] =3D 0; + + irq_ctx_init(smp_processor_id()); +} =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/*************************************************************************= ***** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. 
+ */ +#include +#include +#include +#include +#include + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i =3D 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx =3D i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j=3D0; j<32; j++) + xen_features[i*32+j] =3D !!(fi.submap & 1< +#include + +#include +#include +#include + +#include +#include + +#include +#include + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte =3D lookup_address(address); + unsigned offset =3D address & PAGE_MASK; + + BUG_ON(pte =3D=3D NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address =3D (unsigned long)vaddr; + + pte =3D lookup_address(address); + BUG_ON(pte =3D=3D NULL); + + ptev =3D pte_wrprotect(*pte); + + if (xen_feature(XENFEAT_writable_page_tables)) + *pte =3D ptev; + else + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address =3D (unsigned long)vaddr; + + pte =3D lookup_address(address); + BUG_ON(pte =3D=3D NULL); + + ptev =3D pte_mkwrite(*pte); + + if (xen_feature(XENFEAT_writable_page_tables)) + *pte =3D ptev; + else + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +fastcall void xen_set_pte(pte_t *ptep, pte_t pte) +{ +#if 1 + struct mmu_update u; + + u.ptr =3D virt_to_machine(ptep).maddr; + u.val =3D pte_val_ma(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +#else + ptep->pte_high =3D pte.pte_high; + smp_wmb(); + ptep->pte_low =3D pte.pte_low; +#endif +} + +fastcall void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + u.ptr =3D virt_to_machine(ptr).maddr; + u.val =3D pmd_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +#ifdef CONFIG_X86_PAE +fastcall void xen_set_pud(pmd_t *ptr, pud_t val) +{ + struct mmu_update u; + + u.ptr =3D virt_to_machine(ptr).maddr; + u.val =3D pud_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} +#endif + +/* + * Associate a virtual page frame with a given physical page frame = + * and protection flags for that frame. + */ = +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd =3D swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud =3D pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd =3D pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte =3D pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval) +{ + if ((mm !=3D current->mm && mm !=3D &init_mm) || + HYPERVISOR_update_va_mapping(addr, pteval, 0) !=3D 0) + xen_set_pte(ptep, pteval); +} + +void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ +} + +void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *= ptep) +{ +} + +#ifdef CONFIG_X86_PAE +void fastcall xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void fastcall xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep) +{ +#if 1 + ptep->pte_low =3D 0; + smp_wmb(); + ptep->pte_high =3D 0; = +#else + set_64bit((u64 *)ptep, 0); +#endif +} + +void fastcall xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +fastcall unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret =3D 0; + + if (pte.pte_low) { + ret =3D ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret =3D machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +fastcall unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret =3D pmd.pmd; + if (ret) + ret =3D machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret =3D pgd.pgd; + if (ret) + ret =3D machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte =3D phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +fastcall pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd =3D phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +fastcall pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd =3D phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep) +{ + pte_t res; + + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low =3D xchg(&ptep->pte_low, 0); + res.pte_high =3D ptep->pte_high; + ptep->pte_high =3D 0; + + return res; +} +#else /* !PAE */ +fastcall unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret =3D pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret =3D machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +fastcall unsigned long xen_pmd_val(pmd_t pmd) +{ + BUG(); + return 0; +} + +fastcall unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret =3D pgd.pgd; + if (ret) + ret =3D machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +fastcall pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte =3D phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +fastcall pmd_t xen_make_pmd(unsigned long pmd) +{ + BUG(); + return __pmd(0); +} + +fastcall pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd =3D phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep) +{ + return __pte_ma(xchg(&(ptep)->pte_low, 0)); +} +#endif /* CONFIG_X86_PAE */ + + + +static void pgd_walk_set_prot(void *pt, pgprot_t flags) +{ + unsigned long pfn =3D PFN_DOWN(__pa(pt)); + + if (HYPERVISOR_update_va_mapping((unsigned long)pt, + pfn_pte(pfn, flags), 0) < 0) + BUG(); +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd =3D pgd_base; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g, u, m; + + if 
(xen_feature(XENFEAT_auto_translated_physmap)) + return; + + for (g =3D 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud =3D pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + pgd_walk_set_prot(pud,flags); + + for (u =3D 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd =3D pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + pgd_walk_set_prot(pmd,flags); + + for (m =3D 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + + /* This can get called before mem_map + is set up, so we assume nothing is + highmem at that point. */ + if (mem_map =3D=3D NULL || + !PageHighMem(pmd_page(*pmd))) { + pte =3D pte_offset_kernel(pmd,0); + pgd_walk_set_prot(pte,flags); + } + } + } + } + + if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(PFN_DOWN(__pa(pgd_base)), + flags), + UVMF_TLB_FLUSH) < 0) + BUG(); +} + + +/* This is called just after a mm has been duplicated from its parent, + but it has not been used yet. We need to make sure that its + pagetable is all read-only, and can be pinned. The pagetable itself + needs to map itself as RO; it doesn't matter what the state its in + with respect to any other pagetable. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct mmuext_op op; + + pgd_walk(pgd, PAGE_KERNEL_RO); + +#if defined(CONFIG_X86_PAE) + op.cmd =3D MMUEXT_PIN_L3_TABLE; +#else + op.cmd =3D MMUEXT_PIN_L2_TABLE; +#endif + op.arg1.mfn =3D pfn_to_mfn(PFN_DOWN(__pa(pgd))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +/* Release a pagetables pages back as normal RW */ +void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op op; + + op.cmd =3D MMUEXT_UNPIN_TABLE; + op.arg1.mfn =3D pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); + + pgd_walk(pgd, PAGE_KERNEL); +} + + +fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *ne= xt) +{ + xen_pgd_pin(next->pgd); +} + +fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + xen_pgd_pin(mm->pgd); = +} + +fastcall void xen_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk =3D current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
+ */ + if (tsk->active_mm =3D=3D mm) { + tsk->active_mm =3D &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) =3D=3D 0); + } + + task_unlock(tsk); + + xen_pgd_unpin(mm->pgd); +} =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,51 @@ +#ifndef _XEN_MMU_H + +#include +#include + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void fastcall xen_set_pte(pte_t *ptep, pte_t pteval); +void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval); +void fastcall xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); +void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep); +void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *= ptep); + +fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *ne= xt); +fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +fastcall void xen_exit_mmap(struct mm_struct *mm); + +fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep); + +void xen_pgd_pin(pgd_t *pgd); +void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +fastcall unsigned long long xen_pte_val(pte_t); +fastcall unsigned long long xen_pmd_val(pmd_t); +fastcall unsigned long long xen_pgd_val(pgd_t); + +fastcall pte_t xen_make_pte(unsigned long long); +fastcall pmd_t xen_make_pmd(unsigned long long); +fastcall pgd_t xen_make_pgd(unsigned long long); + +fastcall void xen_set_pte_at(struct mm_struct *mm, u32 addr, + pte_t *ptep, pte_t pteval); +fastcall void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +fastcall void xen_set_pud(pud_t *ptr, pud_t val); +fastcall void xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep); +fastcall void xen_pmd_clear(pmd_t *pmdp); + + +#else +fastcall unsigned long xen_pte_val(pte_t); +fastcall unsigned long xen_pmd_val(pmd_t); +fastcall unsigned long xen_pgd_val(pgd_t); + +fastcall pte_t xen_make_pte(unsigned long); +fastcall pmd_t xen_make_pmd(unsigned long); +fastcall pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,62 @@ +#include + +#include + +#include "multicalls.h" + +#define MC_BATCH 8 +#define MC_ARGS (MC_BATCH * 32 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); + +int xen_mc_flush(void) +{ + struct mc_buffer *b =3D &get_cpu_var(mc_buffer); + int ret =3D 0; + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) !=3D 0) + BUG(); + for(i =3D 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx =3D 0; + b->argidx =3D 0; + } else + BUG_ON(b->argidx !=3D 0); + + put_cpu_var(mc_buffer); + + return ret; +} + +struct multicall_space xen_mc_entry(size_t args) +{ + struct mc_buffer *b =3D &get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace =3D (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx =3D=3D MC_BATCH || + (b->argidx + argspace) 
> MC_ARGS) + if (xen_mc_flush()) + BUG(); + + ret.mc =3D &b->entries[b->mcidx]; + b->mcidx++; + ret.args =3D &b->args[b->argidx]; + b->argidx +=3D argspace; + + put_cpu_var(mc_buffer); + + return ret; +} =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,13 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +struct multicall_space xen_mc_entry(size_t args); +int xen_mc_flush(void); + +#endif /* _XEN_MULTICALLS_H */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,95 @@ +/* + * Machine specific setup for xen + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +static __initdata struct shared_info init_shared; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info =3D &init_shared; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +unsigned long *phys_to_machine_mapping; +unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16]; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn =3D xen_start_info->nr_pages; + + e820.nr_map =3D 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &=3D ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |=3D TS_POLLING; + } +} + +void __init xen_arch_setup(void) +{ + struct physdevop_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callb= ack, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl =3D 1; + rc =3D HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl); + if (rc !=3D 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(saved_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 
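+	       /* copy whichever of the two command-line buffer sizes is smaller */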
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle =3D xen_idle; + + vdso_enabled =3D 1; /* enable by default */ +} =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,452 @@ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Permitted clock jitter, in nsecs, beyond which a warning will be printe= d. */ +static unsigned long permitted_clock_jitter =3D 10000000UL; /* 10ms */ +static int __init __permitted_clock_jitter(char *str) +{ + permitted_clock_jitter =3D simple_strtoul(str, NULL, 0); + return 1; +} +__setup("permitted_clock_jitter=3D", __permitted_clock_jitter); + + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +/* Keep track of last time we did processing/updating of jiffies and xtime= . */ +static u64 processed_system_time; /* System time (ns) at last processing= . */ +static DEFINE_PER_CPU(u64, processed_system_time); + +/* How much CPU time was spent blocked and how much was 'stolen'? */ +static DEFINE_PER_CPU(u64, processed_stolen_time); +static DEFINE_PER_CPU(u64, processed_blocked_time); + +/* Current runstate of each CPU (updated automatically by the hypervisor).= */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* Must be signed, as it's compared with s64 quantities which can be -ve. = */ +#define NS_PER_TICK (1000000000LL/HZ) + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static void get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src =3D &read_pda(xen.vcpu)->time; + dst =3D &get_cpu_var(shadow_time); + + do { + dst->version =3D src->version; + rmb(); + dst->tsc_timestamp =3D src->tsc_timestamp; + dst->system_timestamp =3D src->system_time; + dst->tsc_to_nsec_mul =3D src->tsc_to_system_mul; + dst->tsc_shift =3D src->tsc_shift; + rmb(); + } while ((src->version & 1) | (dst->version ^ src->version)); + + put_cpu_var(shadow_time); +} + +static inline int time_values_up_to_date(void) +{ + struct vcpu_time_info *src; + unsigned dstversion; + + src =3D &read_pda(xen.vcpu)->time; + dstversion =3D get_cpu_var(shadow_time).version; + put_cpu_var(shadow_time); + + rmb(); + return (dstversion =3D=3D src->version); +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>=3D -shift; + else + delta <<=3D shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=3DA" (product), "=3Dr" (tmp1), "=3Dr" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=3Da" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! 
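+/*
+ * Whatever the architecture, the value needed at this point is
+ *
+ *	(delta * mul_frac) >> 32
+ *
+ * (delta has already been shifted by 'shift' above), computed with a
+ * full 64x32->96 bit multiply as the inline assembly versions above do.
+ */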
+#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta =3D now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + + +static void xen_timer_interrupt_hook(void) +{ + s64 delta, delta_cpu, stolen, blocked; + u64 sched_time; + int i, cpu =3D smp_processor_id(); + unsigned long ticks; + struct shadow_time_info *shadow =3D &__get_cpu_var(shadow_time); + struct vcpu_runstate_info *runstate =3D &__get_cpu_var(runstate); + + do { + get_time_values_from_xen(); + + /* Obtain a consistent snapshot of elapsed wallclock cycles. */ + delta =3D delta_cpu =3D + shadow->system_timestamp + get_nsec_offset(shadow); + if (0) + printk("tsc_timestamp=3D%llu system_timestamp=3D%llu tsc_to_nsec=3D%u t= sc_shift=3D%d, version=3D%u, delta=3D%lld processed_system_time=3D%lld\n", + shadow->tsc_timestamp, shadow->system_timestamp, + shadow->tsc_to_nsec_mul, shadow->tsc_shift, + shadow->version, delta, processed_system_time); + + delta -=3D processed_system_time; + delta_cpu -=3D __get_cpu_var(processed_system_time); + + /* + * Obtain a consistent snapshot of stolen/blocked cycles. We + * can use state_entry_time to detect if we get preempted here. + */ + do { + sched_time =3D runstate->state_entry_time; + barrier(); + stolen =3D runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline] - + __get_cpu_var(processed_stolen_time); + blocked =3D runstate->time[RUNSTATE_blocked] - + __get_cpu_var(processed_blocked_time); + barrier(); + } while (sched_time !=3D runstate->state_entry_time); + } while (!time_values_up_to_date()); + + if ((unlikely(delta < -(s64)permitted_clock_jitter) || + unlikely(delta_cpu < -(s64)permitted_clock_jitter)) + && printk_ratelimit()) { + printk("Timer ISR/%d: Time went backwards: " + "delta=3D%lld delta_cpu=3D%lld shadow=3D%lld " + "off=3D%lld processed=3D%lld cpu_processed=3D%lld\n", + cpu, delta, delta_cpu, shadow->system_timestamp, + (s64)get_nsec_offset(shadow), + processed_system_time, + __get_cpu_var(processed_system_time)); + for (i =3D 0; i < num_online_cpus(); i++) + printk(" %d: %lld\n", i, + per_cpu(processed_system_time, i)); + } + + /* System-wide jiffy work. */ + ticks =3D 0; + while(delta > NS_PER_TICK) { + delta -=3D NS_PER_TICK; + processed_system_time +=3D NS_PER_TICK; + ticks++; + } + do_timer(ticks); + + /* + * Account stolen ticks. + * HACK: Passing NULL to account_steal_time() + * ensures that the ticks are accounted as stolen. + */ + if ((stolen > 0) && (delta_cpu > 0)) { + delta_cpu -=3D stolen; + if (unlikely(delta_cpu < 0)) + stolen +=3D delta_cpu; /* clamp local-time progress */ + do_div(stolen, NS_PER_TICK); + __get_cpu_var(processed_stolen_time) +=3D stolen * NS_PER_TICK; + __get_cpu_var(processed_system_time) +=3D stolen * NS_PER_TICK; + account_steal_time(NULL, (cputime_t)stolen); + } + + /* + * Account blocked ticks. + * HACK: Passing idle_task to account_steal_time() + * ensures that the ticks are accounted as idle/wait. 
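+	 * Blocked time comes from the hypervisor's RUNSTATE_blocked
+	 * counter, largely time this VCPU spent halted in xen_idle(), so
+	 * charging it to the idle task keeps idle accounting sensible.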
+ */ + if ((blocked > 0) && (delta_cpu > 0)) { + delta_cpu -=3D blocked; + if (unlikely(delta_cpu < 0)) + blocked +=3D delta_cpu; /* clamp local-time progress */ + do_div(blocked, NS_PER_TICK); + __get_cpu_var(processed_blocked_time) +=3D blocked * NS_PER_TICK; + __get_cpu_var(processed_system_time) +=3D blocked * NS_PER_TICK; + account_steal_time(idle_task(cpu), (cputime_t)blocked); + } + + update_process_times(user_mode_vm(get_irq_regs())); +} + +static cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow =3D &get_cpu_var(shadow_time); + cycle_t ret; + + get_time_values_from_xen(); + + ret =3D shadow->system_timestamp + get_nsec_offset(shadow); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s =3D HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version =3D s->wc_version; + rmb(); + now.tv_sec =3D s->wc_sec; + now.tv_nsec =3D s->wc_nsec; + rmb(); + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta =3D xen_clocksource_read(); /* time since system boot */ + delta +=3D now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec =3D do_div(delta, NSEC_PER_SEC); + now.tv_sec =3D delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static void init_cpu_khz(void) +{ + u64 __cpu_khz =3D 1000000ULL << 32; + struct vcpu_time_info *info; + info =3D &HYPERVISOR_shared_info->vcpu_info[0].time; + do_div(__cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz =3D __cpu_khz << -info->tsc_shift; + else + cpu_khz =3D __cpu_khz >> info->tsc_shift; +} + +static struct clocksource xen_clocksource =3D { + .name =3D "xen", + .rating =3D 400, + .read =3D xen_clocksource_read, + .mask =3D ~0, + .mult =3D 1<time[RUNSTATE_blocked]; + per_cpu(processed_stolen_time, cpu) =3D + runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline]; +} + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. 
-arca + */ + write_seqlock(&xtime_lock); + + xen_timer_interrupt_hook(); + + write_sequnlock(&xtime_lock); + + return IRQ_HANDLED; +} + +static void setup_cpu0_timer_irq(void) +{ + printk(KERN_DEBUG "installing Xen timer for CPU 0\n"); + + bind_virq_to_irqhandler( + VIRQ_TIMER, + 0, + xen_timer_interrupt, + SA_INTERRUPT, + "timer0", + NULL); +} + +static __init void xen_late_time_init(void) +{ + setup_cpu0_timer_irq(); +} + +extern void (*late_time_init)(void); +__init void xen_time_init(void) +{ + late_time_init =3D xen_late_time_init; + + get_time_values_from_xen(); + + processed_system_time =3D per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) =3D processed_system_time; + + init_cpu_khz(); + printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + + init_missing_ticks_accounting(0); + + clocksource_register(&xen_clocksource); + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable =3D 0; +} + +/* Convert jiffies to system time. */ +static u64 jiffies_to_st(unsigned long j) +{ + unsigned long seq; + long delta; + u64 st; + + do { + seq =3D read_seqbegin(&xtime_lock); + delta =3D j - jiffies; + if (delta < 1) { + /* Triggers in some wrap-around cases, but that's okay: + * we just end up with a shorter timeout. */ + st =3D processed_system_time + NS_PER_TICK; + } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) !=3D 0) { + /* Very long timeout means there is no pending timer. + * We indicate this to Xen by passing zero timeout. */ + st =3D 0; + } else { + st =3D processed_system_time + delta * (u64)NS_PER_TICK; + } + } while (read_seqretry(&xtime_lock, seq)); + + return st; +} + +/* + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle = cpu + * These functions are based on implementations from arch/s390/kernel/time= .c + */ +void stop_hz_timer(void) +{ + unsigned int cpu =3D smp_processor_id(); + unsigned long j; + + cpu_set(cpu, nohz_cpu_mask); + + /* = + * See matching smp_mb in rcu_start_batch in rcupdate.c. These mbs = + * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a + * value of rcp->cur that matches rdp->quiescbatch and allows us to + * stop the hz timer then the cpumasks created for subsequent values + * of cur in rcu_start_batch are guaranteed to pick up the updated + * nohz_cpu_mask and so will not depend on this cpu. + */ + + smp_mb(); + + /* Leave ourselves in tick mode if rcu or softirq or timer pending. 
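+	   If so, take this cpu straight back out of nohz_cpu_mask and
+	   arm an ordinary one-jiffy timeout below instead.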
*/ + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (j =3D next_timer_interrupt(), time_before_eq(j, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + j =3D jiffies + 1; + } + + if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) !=3D 0) + BUG(); +} + +void start_hz_timer(void) +{ + cpu_clear(smp_processor_id(), nohz_cpu_mask); +} + =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,29 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#include +#include +#include + +ENTRY(startup_xen) + movl %esi,xen_start_info + jmp startup_paravirt + = +.pushsection ".bss.page_aligned" +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "!writable_page_tables|p= ae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,20 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); + +void stop_hz_timer(void); +void start_hz_timer(void); + +#endif /* XEN_OPS_H */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/include/asm-i386/hypercall.h +++ b/include/asm-i386/hypercall.h @@ -39,9 +39,6 @@ #include #include #include - -#define __STR(x) #x -#define STR(x) __STR(x) = extern struct { char _entry[32]; } hypercall_page[]; = @@ -413,4 +410,22 @@ MULTI_mmuext_op(struct multicall_entry * mcl->args[2] =3D (unsigned long)success_count; mcl->args[3] =3D domid; } + +static inline void +MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entr= ies) +{ + mcl->op =3D __HYPERVISOR_set_gdt; + mcl->args[0] =3D (unsigned long)frames; + mcl->args[1] =3D entries; +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, = + unsigned long ss, unsigned long esp) +{ + mcl->op =3D __HYPERVISOR_stack_switch; + mcl->args[0] =3D ss; + mcl->args[1] =3D esp; +} + #endif /* __HYPERCALL_H__ */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -43,6 +43,7 @@ 
extern void fixup_irqs(cpumask_t map); extern void fixup_irqs(cpumask_t map); #endif = +fastcall unsigned int do_IRQ(struct pt_regs *regs); void init_IRQ(void); void __init native_init_IRQ(void); = =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -31,6 +31,7 @@ struct Xgt_desc_struct; struct Xgt_desc_struct; struct tss_struct; struct mm_struct; +struct i386_pda; struct paravirt_ops { int paravirt_enabled; @@ -53,6 +54,7 @@ struct paravirt_ops void (*arch_setup)(void); char *(*memory_setup)(void); void (*init_IRQ)(void); + void (*init_pda)(struct i386_pda *, int cpu); = void (*pagetable_setup_start)(pgd_t *pgd_base); void (*pagetable_setup_done)(pgd_t *pgd_base); @@ -200,6 +202,30 @@ extern struct paravirt_ops paravirt_ops; = void native_pagetable_setup_start(pgd_t *pgd); = +/* Non-paravirtualized implementations of various operations for + back-ends which don't need their own version. */ +fastcall void native_clts(void); + +fastcall unsigned long native_read_cr0(void); +fastcall void native_write_cr0(unsigned long val); + +fastcall unsigned long native_read_cr2(void); +fastcall void native_write_cr2(unsigned long val); + +fastcall unsigned long native_read_cr3(void); +fastcall void native_write_cr3(unsigned long val); + +fastcall unsigned long native_read_cr4(void); +fastcall unsigned long native_read_cr4_safe(void); +fastcall void native_write_cr4(unsigned long val); + +fastcall void native_wbinvd(void); + +fastcall unsigned long long native_read_msr(unsigned int msr, int *err); +fastcall int native_write_msr(unsigned int msr, unsigned long long val); +fastcall unsigned long long native_read_tsc(void); +fastcall unsigned long long native_read_pmc(void); + #ifdef CONFIG_X86_PAE fastcall unsigned long long native_pte_val(pte_t); fastcall unsigned long long native_pmd_val(pmd_t); @@ -405,6 +431,19 @@ static inline void paravirt_exit_mmap(st { paravirt_ops.exit_mmap(mm); } + +static inline void paravirt_init_pda(struct i386_pda *pda, int cpu) +{ + if (paravirt_ops.init_pda) + (*paravirt_ops.init_pda)(pda, cpu); +} + +fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 = high); +fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 = high); +fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 = high); +fastcall void native_store_gdt(struct Xgt_desc_struct *dtr); +fastcall void native_store_idt(struct Xgt_desc_struct *dtr); +fastcall unsigned long native_store_tr(void); = #define __flush_tlb() paravirt_ops.flush_tlb_user() #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel() @@ -699,5 +738,8 @@ static inline void paravirt_exit_mmap(st { } = +static inline void paravirt_init_pda(struct i386_pda *pda, int cpu) +{ +} #endif /* CONFIG_PARAVIRT */ #endif /* __ASM_PARAVIRT_H */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- a/include/asm-i386/pda.h +++ b/include/asm-i386/pda.h @@ -16,6 +16,17 @@ struct i386_pda int cpu_number; struct task_struct *pcurrent; /* current process */ struct pt_regs *irq_regs; + +#ifdef CONFIG_PARAVIRT + union { +#ifdef CONFIG_XEN + struct { + struct vcpu_info *vcpu; + unsigned long cr3; + } xen; +#endif /* CONFIG_XEN 
*/ + }; +#endif /* CONFIG_PARAVIRT */ }; = extern struct i386_pda *_cpu_pda[]; =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/include/xen/events.h @@ -0,0 +1,28 @@ +#ifndef _XEN_EVENTS_H +#define _XEN_EVENTS_H + +#include + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, + void *dev_id); +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id); + +/* + * Common unbind function for all event sources. Takes IRQ to unbind from. + * Automatically closes the underlying event channel (even for bindings + * made with bind_evtchn_to_irqhandler()). + */ +void unbind_from_irqhandler(unsigned int irq, void *dev_id); + +static inline void notify_remote_via_evtchn(int port) +{ + struct evtchn_send send =3D { .port =3D port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); +} + +extern void notify_remote_via_irq(int irq); +#endif /* _XEN_EVENTS_H */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/include/xen/features.h @@ -0,0 +1,26 @@ +/*************************************************************************= ***** + * features.h + * + * Query the features reported by Xen. + * + * Copyright (c) 2006, Ian Campbell + */ + +#ifndef __XEN_FEATURES_H__ +#define __XEN_FEATURES_H__ + +#include + +void xen_setup_features(void); + +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32]; + +static inline int xen_feature(int flag) +{ + switch(flag) { + } + + return xen_features[flag]; +} + +#endif /* __ASM_XEN_FEATURES_H__ */ =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D --- /dev/null +++ b/include/xen/page.h @@ -0,0 +1,175 @@ +#ifndef __XEN_PAGE_H +#define __XEN_PAGE_H + +#include + +#include + +#include + +#ifdef CONFIG_X86_PAE +/* Xen machine address */ +typedef struct xmaddr { + unsigned long long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long long paddr; +} xpaddr_t; +#else +/* Xen machine address */ +typedef struct xmaddr { + unsigned long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long paddr; +} xpaddr_t; +#endif + +#define XMADDR(x) ((xmaddr_t) { .maddr =3D (x) }) +#define XPADDR(x) ((xpaddr_t) { .paddr =3D (x) }) + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +extern unsigned long *phys_to_machine_mapping; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + + return phys_to_machine_mapping[(unsigned int)(pfn)] & + ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + + return (phys_to_machine_mapping[pfn] !=3D INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned 
long mfn)
+{
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return mfn;
+
+#if 0
+	if (unlikely((mfn >> machine_to_phys_order) != 0))
+		return max_mapnr;
+#endif
+
+	pfn = 0;
+	/*
+	 * The array access can fail (e.g., device space beyond end of RAM).
+	 * In such cases it doesn't matter what we return (we return garbage),
+	 * but we must handle the fault without crashing!
+	 */
+	__get_user(pfn, &machine_to_phys_mapping[mfn]);
+
+	return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+	unsigned offset = phys.paddr & ~PAGE_MASK;
+	return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+	unsigned offset = machine.maddr & ~PAGE_MASK;
+	return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
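+/*
+ * Illustrative example (not part of the interface): converting a kernel
+ * virtual address into a machine address for a hypercall argument is just
+ *
+ *	xmaddr_t ma = virt_to_machine(ptr);
+ *
+ * i.e. __pa(ptr) with its frame number pushed through pfn_to_mfn() and
+ * the offset within the page re-attached (see the macros further down).
+ */
+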
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+	extern unsigned long max_mapnr;
+	unsigned long pfn = mfn_to_pfn(mfn);
+	if ((pfn < max_mapnr)
+	    && !xen_feature(XENFEAT_auto_translated_physmap)
+	    && (phys_to_machine_mapping[pfn] != mfn))
+		return max_mapnr; /* force !pfn_valid() */
+	return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+	phys_to_machine_mapping[pfn] = mfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
+		       (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+	pte_t pte;
+
+	pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | (pgprot_val(pgprot) >> 32);
+	pte.pte_high &= (__supported_pte_mask >> 32);
+	pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
+	pte.pte_low &= __supported_pte_mask;
+
+	return pte;
+}
+
+static inline unsigned long long pte_val_ma(pte_t x)
+{
+	return ((unsigned long long)x.pte_high << 32) | x.pte_low;
+}
+#define pmd_val_ma(v) ((v).pmd)
+#define pud_val_ma(v) ((v).pgd.pgd)
+#else  /* !X86_PAE */
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
+#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pte_val_ma(x)	((x).pte_low)
+#define pmd_val_ma(v) ((v).pud.pgd.pgd)
+#endif	/* CONFIG_X86_PAE */
+#define pgd_val_ma(x)	((x).pgd)
+
+#define __pte_ma(x)	((pte_t) { (x) } )
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* __XEN_PAGE_H */

-- =