This adds a second entry point to head.S, which is jumped to when
booted by Xen.  This allows startup under Xen to be easily detected.

Because Xen starts the kernel in a fairly sane state, very little
setup is needed here; it just needs to jump into xen_start_kernel to
init the paravirt_ops structure, and then jump into start_kernel
proper.

This also makes a few small adjustments to the gdt tables to make them
properly suited to Xen.

One warty thing in this patch is the requirement to hard-code the
location of the Xen entrypoint and hypervisor page, rather than
letting the assembler/linker choose an appropriate place.  This is
because these addresses must be converted into a string at compile
time, so the address must be known at compile rather than link time.

Subject: [patch 16/21] Xen-paravirt: Add outline of Xen paravirt interface code, plus boot-time init.

Create a new arch/i386/xen/ directory for all the
Xen-specific paravirt code; I'd expect there would be parallel
paravirt-vmi, etc directories.

This also contains an initial set of paravirt ops for Xen, mostly ones
which can just be implemented with the generic native_* version.  At
boot time, the global paravirt_ops structure is populated with the Xen
pointers in xen_start_kernel, which then jumps to the standard
start_kernel.

Hook Xen entrypoint into common paravirt entrypoint.

Register Xen-specific architecture and memory setup functions.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>

---
 arch/i386/Makefile             |    3 
 arch/i386/kernel/cpu/common.c  |    3 
 arch/i386/kernel/entry.S       |   77 +++
 arch/i386/kernel/head.S        |   12 
 arch/i386/kernel/paravirt.c    |   48 +-
 arch/i386/kernel/vmlinux.lds.S |    1 
 arch/i386/mm/pgtable.c         |    1 
 arch/i386/xen/Makefile         |    2 
 arch/i386/xen/enlighten.c      |  807 +++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/events.c         |  473 ++++++++++++++++++++++
 arch/i386/xen/features.c       |   29 +
 arch/i386/xen/mmu.c            |  419 ++++++++++++++++++++
 arch/i386/xen/mmu.h            |   51 ++
 arch/i386/xen/multicalls.c     |   62 ++
 arch/i386/xen/multicalls.h     |   13 
 arch/i386/xen/setup.c          |   95 ++++
 arch/i386/xen/time.c           |  452 +++++++++++++++++++++
 arch/i386/xen/xen-head.S       |   29 +
 arch/i386/xen/xen-ops.h        |   20 
 include/asm-i386/hypercall.h   |   21 -
 include/asm-i386/irq.h         |    1 
 include/asm-i386/paravirt.h    |   42 ++
 include/asm-i386/pda.h         |   11 
 include/xen/events.h           |   28 +
 include/xen/features.h         |   26 +
 include/xen/page.h             |  175 ++++++++
 26 files changed, 2872 insertions(+), 29 deletions(-)

===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mcore-$(CONFIG_X86_ES7000)	:= mach-defau
 mcore-$(CONFIG_X86_ES7000)	:= mach-default
 core-$(CONFIG_X86_ES7000)	:= arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)		+= arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
===================================================================
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -19,6 +19,7 @@
 #include <mach_apic.h>
 #endif
 #include <asm/pda.h>
+#include <asm/paravirt.h>
 
 #include "cpu.h"
 
@@ -707,6 +708,8 @@ __cpuinit int init_gdt(int cpu, struct t
 	pda->cpu_number = cpu;
 	pda->pcurrent = idle;
 
+	paravirt_init_pda(pda, cpu);
+
 	return 1;
 }
 
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1001,6 +1001,83 @@ ENTRY(kernel_thread_helper)
 	CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+/* Xen only supports sysenter/sysexit in ring0 guests,
+   and only if it the guest asks for it.  So for now,
+   this should never be used. */
+ENTRY(xen_sti_sysexit)
+	CFI_STARTPROC
+	ud2
+	CFI_ENDPROC
+	
+ENTRY(xen_hypervisor_callback)
+	CFI_STARTPROC
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	mov %esp, %eax
+	call xen_evtchn_do_upcall
+	jmp  ret_from_intr
+	CFI_ENDPROC
+	
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	testl %eax,%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	jz 5f
+	addl $16,%esp		# EAX != 0 => Category 2 (Bad IRET)
+	CFI_ADJUST_CFA_OFFSET -16
+	jmp iret_exc
+5:	addl $16,%esp		# EAX == 0 => Category 1 (Bad segment)
+	CFI_ADJUST_CFA_OFFSET -16
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+	
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+.section __ex_table,"a"
+	.align 4
+	.long 1b,6b
+	.long 2b,7b
+	.long 3b,8b
+	.long 4b,9b
+.previous
+		
+#endif	/* CONFIG_XEN */
+	
 .section .rodata,"a"
 #include "syscall_table.S"
 
===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -519,6 +519,10 @@ 1:
 	jmp	1b
 #endif
 
+#ifdef CONFIG_XEN
+#include "../xen/xen-head.S"
+#endif
+	
 /*
  * Real beginning of normal "text" segment
  */
@@ -528,7 +532,7 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned"
 ENTRY(swapper_pg_dir)
 	.fill 1024,4,0
 ENTRY(empty_zero_page)
@@ -598,7 +602,8 @@ ENTRY(boot_gdt_table)
 /*
  * The Global Descriptor Table contains 28 quadwords, per-CPU.
  */
-	.align L1_CACHE_BYTES
+	.section ".data.page_aligned"
+	.align PAGE_SIZE_asm
 ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* NULL descriptor */
 	.quad 0x0000000000000000	/* 0x0b reserved */
@@ -647,3 +652,6 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0xf0 - unused */
 	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
 
+	/* Be sure this is zeroed to avoid false validations in Xen */
+	.fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
+	.previous
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -146,55 +146,55 @@ void init_IRQ(void)
 	paravirt_ops.init_IRQ();
 }
 
-static fastcall void native_clts(void)
+fastcall void native_clts(void)
 {
 	asm volatile ("clts");
 }
 
-static fastcall unsigned long native_read_cr0(void)
+fastcall unsigned long native_read_cr0(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr0(unsigned long val)
+fastcall void native_write_cr0(unsigned long val)
 {
 	asm volatile("movl %0,%%cr0": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr2(void)
+fastcall unsigned long native_read_cr2(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr2(unsigned long val)
+fastcall void native_write_cr2(unsigned long val)
 {
 	asm volatile("movl %0,%%cr2": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr3(void)
+fastcall unsigned long native_read_cr3(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall void native_write_cr3(unsigned long val)
+fastcall void native_write_cr3(unsigned long val)
 {
 	asm volatile("movl %0,%%cr3": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr4(void)
+fastcall unsigned long native_read_cr4(void)
 {
 	unsigned long val;
 	asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
 	return val;
 }
 
-static fastcall unsigned long native_read_cr4_safe(void)
+fastcall unsigned long native_read_cr4_safe(void)
 {
 	unsigned long val;
 	/* This could fault if %cr4 does not exist */
@@ -207,7 +207,7 @@ static fastcall unsigned long native_rea
 	return val;
 }
 
-static fastcall void native_write_cr4(unsigned long val)
+fastcall void native_write_cr4(unsigned long val)
 {
 	asm volatile("movl %0,%%cr4": :"r" (val));
 }
@@ -246,12 +246,12 @@ static fastcall void native_halt(void)
 	asm volatile("hlt": : :"memory");
 }
 
-static fastcall void native_wbinvd(void)
+fastcall void native_wbinvd(void)
 {
 	asm volatile("wbinvd": : :"memory");
 }
 
-static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
 {
 	unsigned long long val;
 
@@ -270,7 +270,7 @@ static fastcall unsigned long long nativ
 	return val;
 }
 
-static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+fastcall int native_write_msr(unsigned int msr, unsigned long long val)
 {
 	int err;
 	asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,14 +288,14 @@ static fastcall int native_write_msr(uns
 	return err;
 }
 
-static fastcall unsigned long long native_read_tsc(void)
+fastcall unsigned long long native_read_tsc(void)
 {
 	unsigned long long val;
 	asm volatile("rdtsc" : "=A" (val));
 	return val;
 }
 
-static fastcall unsigned long long native_read_pmc(void)
+fastcall unsigned long long native_read_pmc(void)
 {
 	unsigned long long val;
 	asm volatile("rdpmc" : "=A" (val));
@@ -317,17 +317,17 @@ static fastcall void native_load_idt(con
 	asm volatile("lidt %0"::"m" (*dtr));
 }
 
-static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
 {
 	asm ("sgdt %0":"=m" (*dtr));
 }
 
-static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
 {
 	asm ("sidt %0":"=m" (*dtr));
 }
 
-static fastcall unsigned long native_store_tr(void)
+fastcall unsigned long native_store_tr(void)
 {
 	unsigned long tr;
 	asm ("str %0":"=r" (tr));
@@ -336,9 +336,9 @@ static fastcall unsigned long native_sto
 
 static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
-	C(0); C(1); C(2);
-#undef C
+	get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 0] = t->tls_array[0];
+	get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 1] = t->tls_array[1];
+	get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 2] = t->tls_array[2];
 }
 
 static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
@@ -348,17 +348,17 @@ static inline void native_write_dt_entry
 	lp[1] = entry_high;
 }
 
-static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
 	native_write_dt_entry(dt, entrynum, low, high);
 }
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -93,6 +93,7 @@ SECTIONS
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+	*(.data.page_aligned)
 	*(.data.idt)
   }
 
===================================================================
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -267,6 +267,7 @@ static void pgd_ctor(pgd_t *pgd)
 					swapper_pg_dir + USER_PTRS_PER_PGD,
 					KERNEL_PGD_PTRS);
 		} else {
+			memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 			spin_lock_irqsave(&pgd_lock, flags);
 			pgd_list_add(pgd);
 			spin_unlock_irqrestore(&pgd_lock, flags);
===================================================================
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,2 @@
+obj-y		:= enlighten.o setup.o events.o time.o \
+			features.o mmu.o multicalls.o
===================================================================
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,807 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/hypercall.h>
+#include <asm/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+extern struct Xgt_desc_struct cpu_gdt_descr;
+extern struct i386_pda boot_pda;
+extern unsigned long init_pg_tables_end;
+
+static DEFINE_PER_CPU(unsigned, lazy_mode);
+
+/* Code defined in entry.S (not a function) */
+extern const char xen_sti_sysexit[];
+
+struct start_info *xen_start_info;
+
+static unsigned xen_patch(u8 type, u16 clobber, void *firstinsn, unsigned len)
+{
+	/* Xen will require relocations to patch calls and jmps, and
+	   perhaps chunks of inline code */
+	return len;
+}
+
+static void __init xen_banner(void)
+{
+	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+	       paravirt_ops.name);
+	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+static void xen_init_pda(struct i386_pda *pda, int cpu)
+{
+	/* Don't re-init boot CPU; we do it once very early in boot,
+	   and then then cpu_init tries to do it again. If so, just
+	   reuse the stuff we already set up. */
+	if (cpu == 0 && pda != &boot_pda) {
+		BUG_ON(boot_pda.xen.vcpu == NULL);
+		pda->xen = boot_pda.xen;
+		return;
+	}
+
+	pda->xen.vcpu = &HYPERVISOR_shared_info->vcpu_info[cpu];
+	pda->xen.cr3 = 0;
+}
+
+static fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+			       unsigned int *ecx, unsigned int *edx)
+{
+	unsigned maskedx = ~0;
+	if (*eax == 1)
+		maskedx = ~(1 << X86_FEATURE_APIC);
+
+	asm(XEN_EMULATE_PREFIX "cpuid"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+	*edx &= maskedx;
+}
+
+static fastcall void xen_set_debugreg(int reg, unsigned long val)
+{
+	HYPERVISOR_set_debugreg(reg, val);
+}
+
+static fastcall unsigned long xen_get_debugreg(int reg)
+{
+	return HYPERVISOR_get_debugreg(reg);
+}
+
+static fastcall unsigned long xen_save_fl(void)
+{
+	struct vcpu_info *vcpu;
+	unsigned long flags;
+
+	preempt_disable();
+	vcpu = read_pda(xen.vcpu);
+	/* flag has opposite sense of mask */
+	flags = !vcpu->evtchn_upcall_mask;
+	preempt_enable();
+
+	/* convert to IF type flag 
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
+static fastcall void xen_restore_fl(unsigned long flags)
+{
+	struct vcpu_info *vcpu;
+
+	preempt_disable();
+
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+	vcpu = read_pda(xen.vcpu);
+	vcpu->evtchn_upcall_mask = flags;
+	if (flags == 0) {
+		barrier(); /* unmask then check (avoid races) */
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			force_evtchn_callback();
+		preempt_enable();
+	} else
+		preempt_enable_no_resched();
+}
+
+static fastcall void xen_irq_disable(void)
+{
+	struct vcpu_info *vcpu;
+	preempt_disable();
+	vcpu = read_pda(xen.vcpu);
+	vcpu->evtchn_upcall_mask = 1;
+	preempt_enable_no_resched();
+}
+
+static fastcall void xen_irq_enable(void)
+{
+	struct vcpu_info *vcpu;
+
+	preempt_disable();
+	vcpu = read_pda(xen.vcpu);
+	vcpu->evtchn_upcall_mask = 0;
+	barrier(); /* unmask then check (avoid races) */
+	if (unlikely(vcpu->evtchn_upcall_pending))
+		force_evtchn_callback();
+	preempt_enable();
+}
+
+static fastcall void xen_safe_halt(void)
+{
+	stop_hz_timer();
+	/* Blocking includes an implicit local_irq_enable(). */
+	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+		BUG();
+	start_hz_timer();
+}
+
+static fastcall void xen_halt(void)
+{
+#if 0
+	if (irqs_disabled())
+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+#endif
+}
+
+static void xen_set_lazy_mode(int mode)
+{
+	unsigned *lazy = &get_cpu_var(lazy_mode);
+
+	if (xen_mc_flush())
+		BUG();
+
+	*lazy = mode;
+
+	put_cpu_var(lazy_mode);
+}
+
+static unsigned xen_get_lazy_mode(void)
+{
+	unsigned ret = get_cpu_var(lazy_mode);
+	put_cpu_var(lazy_mode);
+
+	return ret;
+}
+
+static fastcall void xen_load_tr_desc(void)
+{
+	/* do nothing */
+}
+
+static fastcall unsigned long xen_store_tr(void)
+{
+	return 0;
+}
+
+static fastcall void xen_set_ldt(const void *addr, unsigned entries)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_SET_LDT;
+	op->arg1.linear_addr = (unsigned long)addr;
+	if (addr)
+		/* ldt my be vmalloced, use arbitrary_virt_to_machine */
+		op->arg1.linear_addr = arbitrary_virt_to_machine((unsigned long)addr).maddr;
+	op->arg2.nr_ents = entries;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU)
+		xen_mc_flush();
+}
+
+static fastcall void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+        unsigned long va;
+        int f;
+	unsigned size = dtr->size + 1;
+	unsigned long frames[16];
+
+	BUG_ON(size > 16*PAGE_SIZE);
+
+        for (va = dtr->address, f = 0;
+             va < dtr->address + size;
+             va += PAGE_SIZE, f++) {
+                frames[f] = virt_to_mfn(va);
+		make_lowmem_page_readonly((void *)va);
+        }
+
+	/* This is used very early, so we can't rely on per-cpu data
+	   being set up, so no multicalls */
+	if (HYPERVISOR_set_gdt(frames, size/8))
+		BUG();
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+				unsigned int cpu, unsigned int i)
+{
+	xmaddr_t maddr = virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN+i]);
+	struct multicall_space mc = xen_mc_entry(0);
+
+	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static fastcall void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	load_TLS_descriptor(t, cpu, 0);
+	load_TLS_descriptor(t, cpu, 1);
+	load_TLS_descriptor(t, cpu, 2);
+
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+		BUG();
+}
+
+static fastcall void xen_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+        unsigned long lp = (unsigned long)dt + entrynum * 8;
+        xmaddr_t mach_lp = virt_to_machine(lp);
+	u64 entry = (u64)high << 32 | low;
+
+	xen_mc_flush();
+        if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+		BUG();
+}
+
+static int cvt_gate_to_trap(int vector, u32 low, u32 high, struct trap_info *info)
+{
+	u8 type, dpl;
+
+	type = (high >> 8) & 0x1f;
+	dpl = (high >> 13) & 3;
+
+	if (type != 0xf && type != 0xe)
+		return 0;
+
+	info->vector = vector;
+	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
+	info->cs = low >> 16;
+	info->flags = dpl;
+	/* interrupt gates clear IF */
+	if (type == 0xe)
+		info->flags |= 4;
+
+	return 1;
+}
+
+#if 0
+static void unpack_desc(u32 low, u32 high,
+			unsigned long *base, unsigned long *limit,
+			unsigned char *type, unsigned char *flags)
+{
+	*base = (high & 0xff000000) | ((high << 16) & 0x00ff0000) | ((low >> 16) & 0xffff);
+	*limit = (high & 0x000f0000) | (low & 0xffff);
+	*type = (high >> 8) & 0xff;
+	*flags = (high >> 20) & 0xf;
+}
+#endif
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry.  If the entry is part of the current IDT, then
+   also update Xen. */
+static fastcall void xen_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+
+	int cpu = smp_processor_id();
+	unsigned long p = (unsigned long)dt + entrynum * 8;
+	unsigned long start = per_cpu(idt_desc, cpu).address;
+	unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+
+	xen_mc_flush();
+
+	native_write_idt_entry(dt, entrynum, low, high);
+
+	if (p >= start && (p + 8) <= end) {
+		struct trap_info info[2];
+
+		info[1].address = 0;
+
+		if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+			if (HYPERVISOR_set_trap_table(info))
+				BUG();
+	}
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static fastcall void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+	static DEFINE_SPINLOCK(lock);
+	static struct trap_info traps[257];
+
+	int cpu = smp_processor_id();
+	unsigned in, out, count;
+
+	per_cpu(idt_desc, cpu) = *desc;
+	
+	count = desc->size / 8;
+	BUG_ON(count > 256);
+
+	spin_lock(&lock);
+	for(in = out = 0; in < count; in++) {
+		const u32 *entry = (u32 *)(desc->address + in * 8);
+
+		if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+			out++;
+	}
+	traps[out].address = 0;
+
+	xen_mc_flush();
+	if (HYPERVISOR_set_trap_table(traps))
+		BUG();
+
+	spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
+   they're handled differently. */
+static fastcall void xen_write_gdt_entry(void *dt, int entry, u32 low, u32 high)
+{
+	switch ((high >> 8) & 0xff) {
+	case DESCTYPE_LDT:
+	case DESCTYPE_TSS:
+		/* ignore */
+		break;
+
+	default:
+		xen_mc_flush();
+		if (HYPERVISOR_update_descriptor(virt_to_machine(dt + entry*8).maddr,
+						 (u64)high << 32 | low))
+			BUG();
+	}
+}
+
+static fastcall void xen_load_esp0(struct tss_struct *tss,
+				   struct thread_struct *thread)
+{
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU) {
+		if (HYPERVISOR_stack_switch(__KERNEL_DS, thread->esp0))
+			BUG();
+	} else {
+		struct multicall_space mcs = xen_mc_entry(0);
+		MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
+	}
+}
+
+static fastcall void xen_set_iopl_mask(unsigned mask)
+{
+#if 0
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+#endif
+}
+
+static fastcall void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static fastcall void xen_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall void xen_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+}
+
+static fastcall unsigned long xen_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+
+static fastcall void xen_flush_tlb(void)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+		BUG();
+}
+
+static fastcall void xen_flush_tlb_global(void)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_ALL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+		BUG();
+}
+
+static fastcall void xen_flush_tlb_single(u32 addr)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_INVLPG_LOCAL;
+	op->arg1.linear_addr = addr & PAGE_MASK;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+		BUG();
+}
+
+static fastcall unsigned long xen_read_cr2(void)
+{
+	return read_pda(xen.vcpu)->arch.cr2;
+}
+
+static fastcall void xen_write_cr4(unsigned long cr4)
+{
+	/* never allow TSC to be disabled */
+	native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+static fastcall unsigned long xen_read_cr3(void)
+{
+	return read_pda(xen.cr3);
+}
+
+static fastcall void xen_write_cr3(unsigned long cr3)
+{
+	if (cr3 == read_pda(xen.cr3)) {
+		/* just a simple tlb flush */
+		xen_flush_tlb();
+		return;
+	}
+
+	write_pda(xen.cr3, cr3);
+
+
+	{
+		struct mmuext_op *op;
+		struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+		unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+		op = mcs.args;
+		op->cmd = MMUEXT_NEW_BASEPTR;
+		op->arg1.mfn = mfn;
+
+		MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+		if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+			BUG();
+	}
+}
+
+static fastcall void xen_alloc_pt(u32 pfn)
+{
+	/* XXX pfn isn't necessarily a lowmem page */
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+static fastcall void xen_alloc_pd(u32 pfn)
+{
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+static fastcall void xen_release_pd(u32 pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+	/* make sure next person to allocate this page gets a clean
+	   pmd */
+	clear_page(__va(PFN_PHYS(pfn)));
+}
+
+static fastcall void xen_release_pt(u32 pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static fastcall void xen_alloc_pd_clone(u32 pfn, u32 clonepfn,
+					u32 start, u32 count)
+{
+	xen_alloc_pd(pfn);
+}
+
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+	init_mm.pgd = base;
+
+	/* copy top-level of Xen-supplied pagetable into place.  For
+	   !PAE we can use this as-is, but for PAE it is a stand-in
+	   while we copy the pmd pages. */
+	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+	if (PTRS_PER_PMD > 1) {
+		int i;
+
+		/* For PAE, need to allocate new pmds, rather than
+		   share Xen's, since Xen doesn't like pmd's being
+		   shared between address spaces, even though in this
+		   case they're effectively the same address space. */
+		for(i = 0; i < PTRS_PER_PGD; i++) {
+			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+
+				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+				       PAGE_SIZE);
+
+				xen_alloc_pd(PFN_DOWN(__pa(pmd)));
+
+				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+			} else
+				pgd_clear(&base[i]);
+		}
+	}
+
+	/* make sure the zero_page is mapped RO so we
+	   can use it in pagetables */
+	make_lowmem_page_readonly(empty_zero_page);
+	make_lowmem_page_readonly(base);
+
+	/* Switch to new pagetable.  This is done before
+	   pagetable_init has done anything so that the new pages
+	   added to the table can be prepared properly for Xen.  */
+	printk("about to switch to new pagetable %p...\n", base);
+	xen_write_cr3(__pa(base));
+	printk("done\n");
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+	/* init_mm has a new pagetable set up - make sure the GDT page
+	   is still read-only in the new pagetable */
+	xen_load_gdt(&cpu_gdt_descr);
+
+	if (!xen_feature(XENFEAT_writable_page_tables)) {
+		/* Create a mapping for the shared info page.
+		   Should be set_fixmap(), but shared_info is a machine
+		   address with no corresponding pseudo-phys address. */
+		set_pte_mfn(fix_to_virt(FIX_PARAVIRT),
+			    PFN_DOWN(xen_start_info->shared_info),
+			    PAGE_KERNEL);
+		
+		HYPERVISOR_shared_info =
+			(struct shared_info *)fix_to_virt(FIX_PARAVIRT);
+	} else
+		HYPERVISOR_shared_info =
+			(struct shared_info *)__va(xen_start_info->shared_info);
+
+	xen_pgd_pin(base);
+
+	write_pda(xen.vcpu, &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]);
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = {
+	.paravirt_enabled = 1,
+	.shared_kernel_pmd = 0,
+	.pgd_alignment = PAGE_SIZE,
+
+	.name = "Xen",
+	.banner = xen_banner,
+
+	.patch = xen_patch,
+
+	.memory_setup = xen_memory_setup,
+	.arch_setup = xen_arch_setup,
+	.init_IRQ = xen_init_IRQ,
+	.time_init = xen_time_init,
+	.init_pda = xen_init_pda,
+
+	.cpuid = xen_cpuid,
+
+	.set_debugreg = xen_set_debugreg,
+	.get_debugreg = xen_get_debugreg,
+
+	.clts = native_clts,
+
+	.read_cr0 = native_read_cr0,
+	.write_cr0 = native_write_cr0,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = native_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,
+
+	.read_cr4 = native_read_cr4,
+	.read_cr4_safe = native_read_cr4_safe,
+	.write_cr4 = xen_write_cr4,
+
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+	.wbinvd = native_wbinvd,
+
+	.read_msr = native_read_msr,
+	.write_msr = native_write_msr,
+	.read_tsc = native_read_tsc,
+	.read_pmc = native_read_pmc,
+
+	.iret = (void (fastcall *)(void))&hypercall_page[__HYPERVISOR_iret],
+	.irq_enable_sysexit = (void (fastcall *)(void))xen_sti_sysexit,
+
+	.load_tr_desc = xen_load_tr_desc,
+	.set_ldt = xen_set_ldt,
+	.load_gdt = xen_load_gdt,
+	.load_idt = xen_load_idt,
+	.load_tls = xen_load_tls,
+
+	.store_gdt = native_store_gdt,
+	.store_idt = native_store_idt,
+	.store_tr = xen_store_tr,
+
+	.write_ldt_entry = xen_write_ldt_entry,
+	.write_gdt_entry = xen_write_gdt_entry,
+	.write_idt_entry = xen_write_idt_entry,
+	.load_esp0 = xen_load_esp0,
+
+	.set_iopl_mask = xen_set_iopl_mask,
+	.io_delay = xen_io_delay,
+	.const_udelay = __const_udelay,
+	.set_wallclock = xen_set_wallclock,
+	.get_wallclock = xen_get_wallclock,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	.apic_write = xen_apic_write,
+	.apic_write_atomic = xen_apic_write_atomic,
+	.apic_read = xen_apic_read,
+	.setup_boot_clock = (void *)native_nop,
+	.setup_secondary_clock = (void *)native_nop,
+#endif
+
+	.flush_tlb_user = xen_flush_tlb,
+	.flush_tlb_kernel = xen_flush_tlb_global,
+	.flush_tlb_single = xen_flush_tlb_single,
+
+	.pte_update = (void *)native_nop,
+	.pte_update_defer = (void *)native_nop,
+
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+	.activate_mm = xen_activate_mm,
+	.dup_mmap = xen_dup_mmap,
+	.exit_mmap = xen_exit_mmap,
+
+	.set_pte = xen_set_pte,
+	.set_pte_at = xen_set_pte_at,
+	.set_pmd = xen_set_pmd,
+
+	.alloc_pt = xen_alloc_pt,
+	.alloc_pd = xen_alloc_pd,
+	.alloc_pd_clone = xen_alloc_pd_clone,
+	.release_pd = xen_release_pd,
+	.release_pt = xen_release_pt,
+
+	.pte_val = xen_pte_val,
+	.pmd_val = xen_pmd_val,
+	.pgd_val = xen_pgd_val,
+
+	.make_pte = xen_make_pte,
+	.make_pmd = xen_make_pmd,
+	.make_pgd = xen_make_pgd,
+
+	.ptep_get_and_clear = xen_ptep_get_and_clear,
+
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = xen_set_pte,
+	.set_pte_present = xen_set_pte_at,
+	.set_pud = xen_set_pud,
+	.pte_clear = xen_pte_clear,
+	.pmd_clear = xen_pmd_clear,
+#endif	/* PAE */
+
+	.set_lazy_mode = xen_set_lazy_mode,
+	.startup_ipi_hook = (void *)native_nop,
+};
+
+/* First C function to be called on Xen boot */
+static asmlinkage void __init xen_start_kernel(void)
+{
+	u32 low, high;
+	pgd_t *pgd;
+
+	if (!xen_start_info)
+		return;
+
+	BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+	/* Install Xen paravirt ops */
+	paravirt_ops = xen_paravirt_ops;
+
+	xen_setup_features();
+
+	/* Get mfn list */
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+
+	pgd = (pgd_t *)xen_start_info->pt_base;
+
+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+	/* set up the boot-time gdt and segments */
+	init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+	xen_load_gdt(&cpu_gdt_descr);
+
+	/* set up PDA descriptor */
+	pack_descriptor(&low, &high, (unsigned)&boot_pda, sizeof(boot_pda)-1,
+			0x80 | DESCTYPE_S | 0x02, 0);
+
+	/* Use hypercall directly, because xen_write_gdt_entry can't
+	 * be used until batched multicalls work. */
+	if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt_table +
+							 GDT_ENTRY_PDA).maddr,
+					 (u64)high << 32 | low))
+		BUG();
+
+	/* set up %fs and init Xen parts of the PDA */
+	asm volatile("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+	xen_init_pda(&boot_pda, 0);
+	boot_pda.xen.cr3 = __pa(pgd);
+
+	paravirt_ops.kernel_rpl = xen_feature(XENFEAT_supervisor_mode_kernel) ? 0 : 1;
+
+	/* set the limit of our address space */
+	reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+	/* set up basic CPUID stuff */
+	cpu_detect(&new_cpu_data);
+	new_cpu_data.hard_math = 1;
+	identify_cpu(&new_cpu_data);
+
+	/* Poke various useful things into boot_params */
+	LOADER_TYPE = (9 << 4) | 0;
+	INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0;
+	INITRD_SIZE = xen_start_info->mod_len;
+
+	/* Start the world */
+	start_kernel();
+}
+
+paravirt_probe(xen_start_kernel);
===================================================================
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,473 @@
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> VIRQ mapping. */
+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+static u32 irq_info[NR_IRQS];
+
+/* Binding types. */
+enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+	[0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)	((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+	(void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip;
+
+/* Constructor for packed IRQ information. */
+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+	return ((type << 24) | (index << 16) | evtchn);
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+	return (u16)(irq_info[irq]);
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+	return (u8)(irq_info[irq] >> 16);
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+	return (u8)(irq_info[irq] >> 24);
+}
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+					   struct shared_info *sh,
+					   unsigned int idx)
+{
+	return (sh->evtchn_pending[idx] &
+		cpu_evtchn_mask[cpu][idx] &
+		~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+	int irq = evtchn_to_irq[chn];
+
+	BUG_ON(irq == -1);
+	set_native_irq_info(irq, cpumask_of_cpu(cpu));
+
+	__clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
+	__set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+
+	cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+	int i;
+
+	/* By default all event channels notify CPU#0. */
+	for (i = 0; i < NR_IRQS; i++)
+		set_native_irq_info(i, cpumask_of_cpu(0));
+
+	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+	memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return cpu_evtchn[evtchn];
+}
+
+static inline void clear_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_clear_bit(port, &s->evtchn_pending[0]);
+}
+
+static inline void set_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_set_bit(port, &s->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped.
+ */
+void notify_remote_via_irq(int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+void mask_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	sync_set_bit(port, &s->evtchn_mask[0]);
+}
+EXPORT_SYMBOL_GPL(mask_evtchn);
+
+void unmask_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	unsigned int cpu = smp_processor_id();
+	struct vcpu_info *vcpu_info = read_pda(xen.vcpu);
+
+	BUG_ON(!irqs_disabled());
+
+	/* Slow path (hypercall) if this is a non-local port. */
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_unmask unmask = { .port = port };
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+		return;
+	}
+
+	sync_clear_bit(port, &s->evtchn_mask[0]);
+
+	/*
+	 * The following is basically the equivalent of 'hw_resend_irq'. Just
+	 * like a real IO-APIC we 'lose the interrupt edge' if the channel is
+	 * masked.
+	 */
+	if (sync_test_bit(port, &s->evtchn_pending[0]) &&
+	    !sync_test_and_set_bit(port / BITS_PER_LONG,
+				   &vcpu_info->evtchn_pending_sel))
+		vcpu_info->evtchn_upcall_pending = 1;
+}
+EXPORT_SYMBOL_GPL(unmask_evtchn);
+
+static int find_unbound_irq(void)
+{
+	int irq;
+
+	/* Only allocate from dynirq range */
+	for (irq = 0; irq < NR_IRQS; irq++)
+		if (irq_bindcount[irq] == 0)
+			break;
+
+	if (irq == NR_IRQS)
+		panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+	return irq;
+}
+
+static int bind_evtchn_to_irq(unsigned int evtchn)
+{
+	int irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = evtchn_to_irq[evtchn];
+
+	if (irq == -1) {
+		irq = find_unbound_irq();
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq);
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+	}
+
+	irq_bindcount[irq]++;
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+	struct evtchn_bind_virq bind_virq;
+	int evtchn, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(virq_to_irq, cpu)[virq];
+
+	if (irq == -1) {
+		bind_virq.virq = virq;
+		bind_virq.vcpu = cpu;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						&bind_virq) != 0)
+			BUG();
+		evtchn = bind_virq.port;
+
+		irq = find_unbound_irq();
+
+		dynamic_irq_init(irq);
+		set_irq_chip_and_handler(irq, &xen_dynamic_chip, handle_level_irq);
+
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+		per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+		bind_evtchn_to_cpu(evtchn, cpu);
+	}
+
+	irq_bindcount[irq]++;
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+static void unbind_from_irq(unsigned int irq)
+{
+	struct evtchn_close close;
+	int evtchn = evtchn_from_irq(irq);
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
+		close.port = evtchn;
+		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+			BUG();
+
+		switch (type_from_irq(irq)) {
+		case IRQT_VIRQ:
+			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+				[index_from_irq(irq)] = -1;
+			break;
+		default:
+			break;
+		}
+
+		/* Closed ports are implicitly re-bound to VCPU0. */
+		bind_evtchn_to_cpu(evtchn, 0);
+
+		evtchn_to_irq[evtchn] = -1;
+		irq_info[irq] = IRQ_UNBOUND;
+
+		dynamic_irq_init(irq);
+	}
+
+	spin_unlock(&irq_mapping_update_lock);
+}
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+			      irqreturn_t (*handler)(int, void *),
+			      unsigned long irqflags, const char *devname, void *dev_id)
+{
+	unsigned int irq;
+	int retval;
+
+	irq = bind_evtchn_to_irq(evtchn);
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+			    irqreturn_t (*handler)(int, void *),
+			    unsigned long irqflags, const char *devname, void *dev_id)
+{
+	unsigned int irq;
+	int retval;
+
+	irq = bind_virq_to_irq(virq, cpu);
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+	free_irq(irq, dev_id);
+	unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+/*
+  Search the CPUs pending events bitmasks.  For each one found, map
+  the event number to an irq, and feed it into do_IRQ() for
+  handling.
+
+  Xen uses a two-level bitmap to speed searching.  The first level is
+  a bitset of words which contain pending event bits.  The second
+  level is a bitset of pending events themselves.
+*/
+asmlinkage fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct vcpu_info *vcpu_info = read_pda(xen.vcpu);
+	unsigned long pending_words;
+
+	vcpu_info->evtchn_upcall_pending = 0;
+
+	/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+	pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+	while (pending_words != 0) {
+		unsigned long pending_bits;
+		int word_idx = __ffs(pending_words);
+		pending_words &= ~(1UL << word_idx);
+
+		while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+			int bit_idx = __ffs(pending_bits);
+			int port = (word_idx * BITS_PER_LONG) + bit_idx;
+			int irq = evtchn_to_irq[port];
+
+			if (irq != -1) {
+				regs->orig_eax = ~irq;
+				do_IRQ(regs);
+			}
+		}
+	}
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+	struct evtchn_bind_vcpu bind_vcpu;
+	int evtchn = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(evtchn))
+		return;
+
+	/* Send future instances of this interrupt to other vcpu. */
+	bind_vcpu.port = evtchn;
+	bind_vcpu.vcpu = tcpu;
+
+	/*
+	 * If this fails, it usually just indicates that we're dealing with a 
+	 * virq or IPI channel, which don't actually need to be rebound. Ignore
+	 * it, but don't do the xenlinux-level rebind in that case.
+	 */
+	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
+		bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+	unsigned tcpu = first_cpu(dest);
+	rebind_irq_to_cpu(irq, tcpu);
+}
+
+static void enable_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		unmask_evtchn(evtchn);
+}
+
+static void disable_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		mask_evtchn(evtchn);
+}
+
+static void ack_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+
+	move_native_irq(irq);
+
+	if (VALID_EVTCHN(evtchn))
+		clear_evtchn(evtchn);
+}
+
+static int retrigger_dynirq(unsigned int irq)
+{
+	int evtchn = evtchn_from_irq(irq);
+	int ret = 0;
+
+	if (VALID_EVTCHN(evtchn)) {
+		set_evtchn(evtchn);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+	.name		= "xen-virq",
+	.mask		= disable_dynirq,
+	.unmask		= enable_dynirq,
+	.ack		= ack_dynirq,
+	.set_affinity	= set_affinity_irq,
+	.retrigger	= retrigger_dynirq,
+};
+
+void __init xen_init_IRQ(void)
+{
+	int i;
+
+	init_evtchn_cpu_bindings();
+
+	/* No event channels are 'live' right now. */
+	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+		mask_evtchn(i);
+
+	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+	for (i = 0; i < NR_IRQS; i++)
+		irq_bindcount[i] = 0;
+
+	irq_ctx_init(smp_processor_id());
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void xen_setup_features(void)
+{
+	struct xen_feature_info fi;
+	int i, j;
+
+	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+		fi.submap_idx = i;
+		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+			break;
+		for (j=0; j<32; j++)
+			xen_features[i*32+j] = !!(fi.submap & 1<<j);
+	}
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,419 @@
+//#include <linux/bug.h>
+#include <asm/bug.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+
+#include <asm/hypercall.h>
+#include <asm/paravirt.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+	pte_t *pte = lookup_address(address);
+	unsigned offset = address & PAGE_MASK;
+
+	BUG_ON(pte == NULL);
+
+	return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+	pte_t *pte, ptev;
+	unsigned long address = (unsigned long)vaddr;
+
+	pte = lookup_address(address);
+	BUG_ON(pte == NULL);
+
+	ptev = pte_wrprotect(*pte);
+
+	if (xen_feature(XENFEAT_writable_page_tables))
+		*pte = ptev;
+	else
+		if(HYPERVISOR_update_va_mapping(address, ptev, 0))
+			BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+	pte_t *pte, ptev;
+	unsigned long address = (unsigned long)vaddr;
+
+	pte = lookup_address(address);
+	BUG_ON(pte == NULL);
+
+	ptev = pte_mkwrite(*pte);
+
+	if (xen_feature(XENFEAT_writable_page_tables))
+		*pte = ptev;
+	else
+		if(HYPERVISOR_update_va_mapping(address, ptev, 0))
+			BUG();
+}
+
+
+fastcall void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+#if 1
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptep).maddr;
+	u.val = pte_val_ma(pte);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+#else
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+#endif
+}
+
+fastcall void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pmd_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+#ifdef CONFIG_X86_PAE
+fastcall void xen_set_pud(pmd_t *ptr, pud_t val)
+{
+	struct mmu_update u;
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+#endif
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = swapper_pg_dir + pgd_index(vaddr);
+	if (pgd_none(*pgd)) {
+		BUG();
+		return;
+	}
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		BUG();
+		return;
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		BUG();
+		return;
+	}
+	pte = pte_offset_kernel(pmd, vaddr);
+	/* <mfn,flags> stored as-is, to permit clearing entries */
+	xen_set_pte(pte, mfn_pte(mfn, flags));
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr,
+			     pte_t *ptep, pte_t pteval)
+{
+	if ((mm != current->mm && mm != &init_mm) ||
+	    HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
+		xen_set_pte(ptep, pteval);
+}
+
+void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+}
+
+void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+}
+
+#ifdef CONFIG_X86_PAE
+void fastcall xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	set_64bit((u64 *)ptep, pte_val_ma(pte));
+}
+
+void fastcall xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep)
+{
+#if 1
+	ptep->pte_low = 0;
+	smp_wmb();
+	ptep->pte_high = 0;	
+#else
+	set_64bit((u64 *)ptep, 0);
+#endif
+}
+
+void fastcall xen_pmd_clear(pmd_t *pmdp)
+{
+	xen_set_pmd(pmdp, __pmd(0));
+}
+
+fastcall unsigned long long xen_pte_val(pte_t pte)
+{
+	unsigned long long ret = 0;
+
+	if (pte.pte_low) {
+		ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	}
+
+	return ret;
+}
+
+fastcall unsigned long long xen_pmd_val(pmd_t pmd)
+{
+	unsigned long long ret = pmd.pmd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+fastcall unsigned long long xen_pgd_val(pgd_t pgd)
+{
+	unsigned long long ret = pgd.pgd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+fastcall pte_t xen_make_pte(unsigned long long pte)
+{
+	if (pte & 1)
+		pte = phys_to_machine(XPADDR(pte)).maddr;
+
+	return (pte_t){ pte, pte >> 32 };
+}
+
+fastcall pmd_t xen_make_pmd(unsigned long long pmd)
+{
+	if (pmd & 1)
+		pmd = phys_to_machine(XPADDR(pmd)).maddr;
+
+	return (pmd_t){ pmd };
+}
+
+fastcall pgd_t xen_make_pgd(unsigned long long pgd)
+{
+	if (pgd & _PAGE_PRESENT)
+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+	return (pgd_t){ pgd };
+}
+
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep)
+{
+	pte_t res;
+
+	/* xchg acts as a barrier before the setting of the high bits */
+	res.pte_low = xchg(&ptep->pte_low, 0);
+	res.pte_high = ptep->pte_high;
+	ptep->pte_high = 0;
+
+	return res;
+}
+#else  /* !PAE */
+fastcall unsigned long xen_pte_val(pte_t pte)
+{
+	unsigned long ret = pte.pte_low;
+
+	if (ret & _PAGE_PRESENT)
+		ret = machine_to_phys(XMADDR(ret)).paddr;
+
+	return ret;
+}
+
+fastcall unsigned long xen_pmd_val(pmd_t pmd)
+{
+	BUG();
+	return 0;
+}
+
+fastcall unsigned long xen_pgd_val(pgd_t pgd)
+{
+	unsigned long ret = pgd.pgd;
+	if (ret)
+		ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+	return ret;
+}
+
+fastcall pte_t xen_make_pte(unsigned long pte)
+{
+	if (pte & _PAGE_PRESENT)
+		pte = phys_to_machine(XPADDR(pte)).maddr;
+
+	return (pte_t){ pte };
+}
+
+fastcall pmd_t xen_make_pmd(unsigned long pmd)
+{
+	BUG();
+	return __pmd(0);
+}
+
+fastcall pgd_t xen_make_pgd(unsigned long pgd)
+{
+	if (pgd & _PAGE_PRESENT)
+		pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+	return (pgd_t){ pgd };
+}
+
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep)
+{
+	return __pte_ma(xchg(&(ptep)->pte_low, 0));
+}
+#endif	/* CONFIG_X86_PAE */
+
+
+
+static void pgd_walk_set_prot(void *pt, pgprot_t flags)
+{
+	unsigned long pfn = PFN_DOWN(__pa(pt));
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)pt,
+					 pfn_pte(pfn, flags), 0) < 0)
+		BUG();
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+	pgd_t *pgd = pgd_base;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int    g, u, m;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return;
+
+	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+
+		if (PTRS_PER_PUD > 1) /* not folded */
+			pgd_walk_set_prot(pud,flags);
+
+		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+			if (pud_none(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+
+			if (PTRS_PER_PMD > 1) /* not folded */
+				pgd_walk_set_prot(pmd,flags);
+
+			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+				if (pmd_none(*pmd))
+					continue;
+
+				/* This can get called before mem_map
+				   is set up, so we assume nothing is
+				   highmem at that point. */
+				if (mem_map == NULL ||
+				    !PageHighMem(pmd_page(*pmd))) {
+					pte = pte_offset_kernel(pmd,0);
+					pgd_walk_set_prot(pte,flags);
+				}
+			}
+		}
+	}
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+					 pfn_pte(PFN_DOWN(__pa(pgd_base)),
+						 flags),
+					 UVMF_TLB_FLUSH) < 0)
+		BUG();
+}
+
+
+/* This is called just after a mm has been duplicated from its parent,
+   but it has not been used yet.  We need to make sure that its
+   pagetable is all read-only, and can be pinned. The pagetable itself
+   needs to map itself as RO; it doesn't matter what the state its in
+   with respect to any other pagetable. */
+void xen_pgd_pin(pgd_t *pgd)
+{
+	struct mmuext_op op;
+
+	pgd_walk(pgd, PAGE_KERNEL_RO);
+
+#if defined(CONFIG_X86_PAE)
+	op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+	op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+}
+
+/* Release a pagetables pages back as normal RW */
+void xen_pgd_unpin(pgd_t *pgd)
+{
+	struct mmuext_op op;
+
+	op.cmd = MMUEXT_UNPIN_TABLE;
+	op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
+		BUG();
+
+	pgd_walk(pgd, PAGE_KERNEL);
+}
+
+
+fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+	xen_pgd_pin(next->pgd);
+}
+
+fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	xen_pgd_pin(mm->pgd);	
+}
+
+fastcall void xen_exit_mmap(struct mm_struct *mm)
+{
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+
+	/*
+	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+	 */
+	if (tsk->active_mm == mm) {
+		tsk->active_mm = &init_mm;
+		atomic_inc(&init_mm.mm_count);
+
+		switch_mm(mm, &init_mm, tsk);
+
+		atomic_dec(&mm->mm_count);
+		BUG_ON(atomic_read(&mm->mm_count) == 0);
+	}
+
+	task_unlock(tsk);
+
+	xen_pgd_unpin(mm->pgd);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,51 @@
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void fastcall xen_set_pte(pte_t *ptep, pte_t pteval);
+void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr,
+			     pte_t *ptep, pte_t pteval);
+void fastcall xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep);
+void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep);
+
+fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+fastcall void xen_exit_mmap(struct mm_struct *mm);
+
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep);
+
+void xen_pgd_pin(pgd_t *pgd);
+void xen_pgd_unpin(pgd_t *pgd);
+
+#ifdef CONFIG_X86_PAE
+fastcall unsigned long long xen_pte_val(pte_t);
+fastcall unsigned long long xen_pmd_val(pmd_t);
+fastcall unsigned long long xen_pgd_val(pgd_t);
+
+fastcall pte_t xen_make_pte(unsigned long long);
+fastcall pmd_t xen_make_pmd(unsigned long long);
+fastcall pgd_t xen_make_pgd(unsigned long long);
+
+fastcall void xen_set_pte_at(struct mm_struct *mm, u32 addr,
+			     pte_t *ptep, pte_t pteval);
+fastcall void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+fastcall void xen_set_pud(pud_t *ptr, pud_t val);
+fastcall void xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep);
+fastcall void xen_pmd_clear(pmd_t *pmdp);
+
+
+#else
+fastcall unsigned long xen_pte_val(pte_t);
+fastcall unsigned long xen_pmd_val(pmd_t);
+fastcall unsigned long xen_pgd_val(pgd_t);
+
+fastcall pte_t xen_make_pte(unsigned long);
+fastcall pmd_t xen_make_pmd(unsigned long);
+fastcall pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif	/* _XEN_MMU_H */
===================================================================
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,62 @@
+#include <linux/percpu.h>
+
+#include <asm/hypercall.h>
+
+#include "multicalls.h"
+
+#define MC_BATCH	8
+#define MC_ARGS		(MC_BATCH * 32 / sizeof(u64))
+
+struct mc_buffer {
+	struct multicall_entry entries[MC_BATCH];
+	u64 args[MC_ARGS];
+	unsigned mcidx, argidx;
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+
+int xen_mc_flush(void)
+{
+	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	int ret = 0;
+
+	if (b->mcidx) {
+		int i;
+
+		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+			BUG();
+		for(i = 0; i < b->mcidx; i++)
+			if (b->entries[i].result < 0)
+				ret++;
+		b->mcidx = 0;
+		b->argidx = 0;
+	} else
+		BUG_ON(b->argidx != 0);
+
+	put_cpu_var(mc_buffer);
+
+	return ret;
+}
+
+struct multicall_space xen_mc_entry(size_t args)
+{
+	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct multicall_space ret;
+	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+	BUG_ON(argspace > MC_ARGS);
+
+	if (b->mcidx == MC_BATCH ||
+	    (b->argidx + argspace) > MC_ARGS)
+		if (xen_mc_flush())
+			BUG();
+
+	ret.mc = &b->entries[b->mcidx];
+	b->mcidx++;
+	ret.args = &b->args[b->argidx];
+	b->argidx += argspace;
+
+	put_cpu_var(mc_buffer);
+
+	return ret;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,13 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+struct multicall_space
+{
+	struct multicall_entry *mc;
+	void *args;
+};
+
+struct multicall_space xen_mc_entry(size_t args);
+int xen_mc_flush(void);
+
+#endif /* _XEN_MULTICALLS_H */
===================================================================
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,95 @@
+/*
+ *	Machine specific setup for xen
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <asm/hypercall.h>
+#include <asm/pda.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+static __initdata struct shared_info init_shared;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = &init_shared;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned long *phys_to_machine_mapping;
+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+
+char * __init xen_memory_setup(void)
+{
+	unsigned long max_pfn = xen_start_info->nr_pages;
+
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+	return "Xen";
+}
+
+void xen_idle(void)
+{
+	local_irq_disable();
+
+	if (need_resched())
+		local_irq_enable();
+	else {
+		current_thread_info()->status &= ~TS_POLLING;
+		smp_mb__after_clear_bit();
+		safe_halt();
+		current_thread_info()->status |= TS_POLLING;
+	}
+}
+
+void __init xen_arch_setup(void)
+{
+	struct physdevop_set_iopl set_iopl;
+	int rc;
+
+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+	if (!xen_feature(XENFEAT_auto_translated_physmap))
+		HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
+
+	HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
+				 __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+	set_iopl.iopl = 1;
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl);
+	if (rc != 0)
+		printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+		disable_acpi();
+	}
+#endif
+
+	memcpy(saved_command_line, xen_start_info->cmd_line,
+	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+	pm_idle = xen_idle;
+
+	vdso_enabled = 1;	/* enable by default */
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,452 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/clocksource.h>
+
+#include <asm/hypercall.h>
+#include <asm/arch_hooks.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+static int __init __permitted_clock_jitter(char *str)
+{
+	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
+
+
+/* These are perodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+	u32 tsc_to_nsec_mul;
+	int tsc_shift;
+	u32 version;
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time;   /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
+#define NS_PER_TICK (1000000000LL/HZ)
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ */
+static void get_time_values_from_xen(void)
+{
+	struct vcpu_time_info   *src;
+	struct shadow_time_info *dst;
+
+	src = &read_pda(xen.vcpu)->time;
+	dst = &get_cpu_var(shadow_time);
+
+	do {
+		dst->version = src->version;
+		rmb();
+		dst->tsc_timestamp     = src->tsc_timestamp;
+		dst->system_timestamp  = src->system_time;
+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+		dst->tsc_shift         = src->tsc_shift;
+		rmb();
+	} while ((src->version & 1) | (dst->version ^ src->version));
+
+	put_cpu_var(shadow_time);
+}
+
+static inline int time_values_up_to_date(void)
+{
+	struct vcpu_time_info   *src;
+	unsigned dstversion;
+
+	src = &read_pda(xen.vcpu)->time;
+	dstversion = get_cpu_var(shadow_time).version;
+	put_cpu_var(shadow_time);
+
+	rmb();
+	return (dstversion == src->version);
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+	u64 now, delta;
+	rdtscll(now);
+	delta = now - shadow->tsc_timestamp;
+	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+
+static void xen_timer_interrupt_hook(void)
+{
+	s64 delta, delta_cpu, stolen, blocked;
+	u64 sched_time;
+	int i, cpu = smp_processor_id();
+	unsigned long ticks;
+	struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
+	struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);
+
+	do {
+		get_time_values_from_xen();
+
+		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
+		delta = delta_cpu =
+			shadow->system_timestamp + get_nsec_offset(shadow);
+		if (0)
+			printk("tsc_timestamp=%llu system_timestamp=%llu tsc_to_nsec=%u tsc_shift=%d, version=%u, delta=%lld processed_system_time=%lld\n",
+			       shadow->tsc_timestamp, shadow->system_timestamp,
+			       shadow->tsc_to_nsec_mul, shadow->tsc_shift,
+			       shadow->version, delta, processed_system_time);
+
+		delta     -= processed_system_time;
+		delta_cpu -= __get_cpu_var(processed_system_time);
+
+		/*
+		 * Obtain a consistent snapshot of stolen/blocked cycles. We
+		 * can use state_entry_time to detect if we get preempted here.
+		 */
+		do {
+			sched_time = runstate->state_entry_time;
+			barrier();
+			stolen = runstate->time[RUNSTATE_runnable] +
+				runstate->time[RUNSTATE_offline] -
+				__get_cpu_var(processed_stolen_time);
+			blocked = runstate->time[RUNSTATE_blocked] -
+				__get_cpu_var(processed_blocked_time);
+			barrier();
+		} while (sched_time != runstate->state_entry_time);
+	} while (!time_values_up_to_date());
+
+	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
+	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
+	    && printk_ratelimit()) {
+		printk("Timer ISR/%d: Time went backwards: "
+		       "delta=%lld delta_cpu=%lld shadow=%lld "
+		       "off=%lld processed=%lld cpu_processed=%lld\n",
+		       cpu, delta, delta_cpu, shadow->system_timestamp,
+		       (s64)get_nsec_offset(shadow),
+		       processed_system_time,
+		       __get_cpu_var(processed_system_time));
+		for (i = 0; i < num_online_cpus(); i++)
+			printk(" %d: %lld\n", i,
+			       per_cpu(processed_system_time, i));
+	}
+
+	/* System-wide jiffy work. */
+	ticks = 0;
+	while(delta > NS_PER_TICK) {
+		delta -= NS_PER_TICK;
+		processed_system_time += NS_PER_TICK;
+		ticks++;
+	}
+	do_timer(ticks);
+
+	/*
+	 * Account stolen ticks.
+	 * HACK: Passing NULL to account_steal_time()
+	 * ensures that the ticks are accounted as stolen.
+	 */
+	if ((stolen > 0) && (delta_cpu > 0)) {
+		delta_cpu -= stolen;
+		if (unlikely(delta_cpu < 0))
+			stolen += delta_cpu; /* clamp local-time progress */
+		do_div(stolen, NS_PER_TICK);
+		__get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
+		__get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
+		account_steal_time(NULL, (cputime_t)stolen);
+	}
+
+	/*
+	 * Account blocked ticks.
+	 * HACK: Passing idle_task to account_steal_time()
+	 * ensures that the ticks are accounted as idle/wait.
+	 */
+	if ((blocked > 0) && (delta_cpu > 0)) {
+		delta_cpu -= blocked;
+		if (unlikely(delta_cpu < 0))
+			blocked += delta_cpu; /* clamp local-time progress */
+		do_div(blocked, NS_PER_TICK);
+		__get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
+		__get_cpu_var(processed_system_time)  += blocked * NS_PER_TICK;
+		account_steal_time(idle_task(cpu), (cputime_t)blocked);
+	}
+
+	update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+static cycle_t xen_clocksource_read(void)
+{
+	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+	cycle_t ret;
+
+	get_time_values_from_xen();
+
+	ret = shadow->system_timestamp + get_nsec_offset(shadow);
+
+	put_cpu_var(shadow_time);
+
+	return ret;
+}
+
+static void xen_read_wallclock(struct timespec *ts)
+{
+	const struct shared_info *s = HYPERVISOR_shared_info;
+	u32 version;
+	u64 delta;
+	struct timespec now;
+
+	/* get wallclock at system boot */
+	do {
+		version = s->wc_version;
+		rmb();
+		now.tv_sec  = s->wc_sec;
+		now.tv_nsec = s->wc_nsec;
+		rmb();
+	} while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+	delta = xen_clocksource_read();	/* time since system boot */
+	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+	now.tv_sec = delta;
+
+	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+unsigned long xen_get_wallclock(void)
+{
+	struct timespec ts;
+
+	xen_read_wallclock(&ts);
+
+	return ts.tv_sec;
+}
+
+int xen_set_wallclock(unsigned long now)
+{
+	/* do nothing for domU */
+	return -1;
+}
+
+static void init_cpu_khz(void)
+{
+	u64 __cpu_khz = 1000000ULL << 32;
+	struct vcpu_time_info *info;
+	info = &HYPERVISOR_shared_info->vcpu_info[0].time;
+	do_div(__cpu_khz, info->tsc_to_system_mul);
+	if (info->tsc_shift < 0)
+		cpu_khz = __cpu_khz << -info->tsc_shift;
+	else
+		cpu_khz = __cpu_khz >> info->tsc_shift;
+}
+
+static struct clocksource xen_clocksource = {
+	.name = "xen",
+	.rating = 400,
+	.read = xen_clocksource_read,
+	.mask = ~0,
+	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
+	.shift = XEN_SHIFT,
+	.is_continuous = 1
+};
+
+static void init_missing_ticks_accounting(int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+	memset(runstate, 0, sizeof(*runstate));
+
+	area.addr.v = runstate;
+	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+	per_cpu(processed_blocked_time, cpu) =
+		runstate->time[RUNSTATE_blocked];
+	per_cpu(processed_stolen_time, cpu) =
+		runstate->time[RUNSTATE_runnable] +
+		runstate->time[RUNSTATE_offline];
+}
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+	/*
+	 * Here we are in the timer irq handler. We just have irqs locally
+	 * disabled but we don't know if the timer_bh is running on the other
+	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
+	 * the irq version of write_lock because as just said we have irq
+	 * locally disabled. -arca
+	 */
+	write_seqlock(&xtime_lock);
+
+	xen_timer_interrupt_hook();
+
+	write_sequnlock(&xtime_lock);
+
+	return IRQ_HANDLED;
+}
+
+static void setup_cpu0_timer_irq(void)
+{
+	printk(KERN_DEBUG "installing Xen timer for CPU 0\n");
+
+	bind_virq_to_irqhandler(
+		VIRQ_TIMER,
+		0,
+		xen_timer_interrupt,
+		SA_INTERRUPT,
+		"timer0",
+		NULL);
+}
+
+static __init void xen_late_time_init(void)
+{
+	setup_cpu0_timer_irq();
+}
+
+extern void (*late_time_init)(void);
+__init void xen_time_init(void)
+{
+	late_time_init = xen_late_time_init;
+
+	get_time_values_from_xen();
+
+	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+	per_cpu(processed_system_time, 0) = processed_system_time;
+
+	init_cpu_khz();
+	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
+	       cpu_khz / 1000, cpu_khz % 1000);
+
+	init_missing_ticks_accounting(0);
+
+	clocksource_register(&xen_clocksource);
+
+	/* Set initial system time with full resolution */
+	xen_read_wallclock(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+	tsc_disable = 0;
+}
+
+/* Convert jiffies to system time. */
+static u64 jiffies_to_st(unsigned long j)
+{
+	unsigned long seq;
+	long delta;
+	u64 st;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		delta = j - jiffies;
+		if (delta < 1) {
+			/* Triggers in some wrap-around cases, but that's okay:
+			 * we just end up with a shorter timeout. */
+			st = processed_system_time + NS_PER_TICK;
+		} else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
+			/* Very long timeout means there is no pending timer.
+			 * We indicate this to Xen by passing zero timeout. */
+			st = 0;
+		} else {
+			st = processed_system_time + delta * (u64)NS_PER_TICK;
+		}
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return st;
+}
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ */
+void stop_hz_timer(void)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long j;
+
+	cpu_set(cpu, nohz_cpu_mask);
+
+	/* 
+	 * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs 
+	 * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
+	 * value of rcp->cur that matches rdp->quiescbatch and allows us to
+	 * stop the hz timer then the cpumasks created for subsequent values
+	 * of cur in rcu_start_batch are guaranteed to pick up the updated
+	 * nohz_cpu_mask and so will not depend on this cpu.
+	 */
+
+	smp_mb();
+
+	/* Leave ourselves in tick mode if rcu or softirq or timer pending. */
+	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+	    (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		j = jiffies + 1;
+	}
+
+	if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
+		BUG();
+}
+
+void start_hz_timer(void)
+{
+	cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,29 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+	place in head.S */
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+ENTRY(startup_xen)
+	movl %esi,xen_start_info
+	jmp startup_paravirt
+	
+.pushsection ".bss.page_aligned"
+ENTRY(hypercall_page)
+	.skip 0x1000
+.popsection
+
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "linux")
+	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "2.6")
+	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long,  __PAGE_OFFSET)
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long,  startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long,  hypercall_page)
+	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "yes")
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "no")
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
===================================================================
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,20 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+
+void stop_hz_timer(void);
+void start_hz_timer(void);
+
+#endif /* XEN_OPS_H */
===================================================================
--- a/include/asm-i386/hypercall.h
+++ b/include/asm-i386/hypercall.h
@@ -39,9 +39,6 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
-
-#define __STR(x) #x
-#define STR(x) __STR(x)
 
 extern struct { char _entry[32]; } hypercall_page[];
 
@@ -413,4 +410,22 @@ MULTI_mmuext_op(struct multicall_entry *
 	mcl->args[2] = (unsigned long)success_count;
 	mcl->args[3] = domid;
 }
+
+static inline void
+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+{
+	mcl->op = __HYPERVISOR_set_gdt;
+	mcl->args[0] = (unsigned long)frames;
+	mcl->args[1] = entries;
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl, 
+		   unsigned long ss, unsigned long esp)
+{
+	mcl->op = __HYPERVISOR_stack_switch;
+	mcl->args[0] = ss;
+	mcl->args[1] = esp;
+}
+
 #endif /* __HYPERCALL_H__ */
===================================================================
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -43,6 +43,7 @@ extern void fixup_irqs(cpumask_t map);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+fastcall unsigned int do_IRQ(struct pt_regs *regs);
 void init_IRQ(void);
 void __init native_init_IRQ(void);
 
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -31,6 +31,7 @@ struct Xgt_desc_struct;
 struct Xgt_desc_struct;
 struct tss_struct;
 struct mm_struct;
+struct i386_pda;
 struct paravirt_ops
 {
  	int paravirt_enabled;
@@ -53,6 +54,7 @@ struct paravirt_ops
 	void (*arch_setup)(void);
 	char *(*memory_setup)(void);
 	void (*init_IRQ)(void);
+	void (*init_pda)(struct i386_pda *, int cpu);
 
 	void (*pagetable_setup_start)(pgd_t *pgd_base);
 	void (*pagetable_setup_done)(pgd_t *pgd_base);
@@ -200,6 +202,30 @@ extern struct paravirt_ops paravirt_ops;
 
 void native_pagetable_setup_start(pgd_t *pgd);
 
+/* Non-paravirtualized implementations of various operations for
+   back-ends which don't need their own version. */
+fastcall void native_clts(void);
+
+fastcall unsigned long native_read_cr0(void);
+fastcall void native_write_cr0(unsigned long val);
+
+fastcall unsigned long native_read_cr2(void);
+fastcall void native_write_cr2(unsigned long val);
+
+fastcall unsigned long native_read_cr3(void);
+fastcall void native_write_cr3(unsigned long val);
+
+fastcall unsigned long native_read_cr4(void);
+fastcall unsigned long native_read_cr4_safe(void);
+fastcall void native_write_cr4(unsigned long val);
+
+fastcall void native_wbinvd(void);
+
+fastcall unsigned long long native_read_msr(unsigned int msr, int *err);
+fastcall int native_write_msr(unsigned int msr, unsigned long long val);
+fastcall unsigned long long native_read_tsc(void);
+fastcall unsigned long long native_read_pmc(void);
+
 #ifdef CONFIG_X86_PAE
 fastcall unsigned long long native_pte_val(pte_t);
 fastcall unsigned long long native_pmd_val(pmd_t);
@@ -405,6 +431,19 @@ static inline void paravirt_exit_mmap(st
 {
 	paravirt_ops.exit_mmap(mm);
 }
+
+static inline void paravirt_init_pda(struct i386_pda *pda, int cpu)
+{
+	if (paravirt_ops.init_pda)
+		(*paravirt_ops.init_pda)(pda, cpu);
+}
+
+fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_store_gdt(struct Xgt_desc_struct *dtr);
+fastcall void native_store_idt(struct Xgt_desc_struct *dtr);
+fastcall unsigned long native_store_tr(void);
 
 #define __flush_tlb() paravirt_ops.flush_tlb_user()
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
@@ -699,5 +738,8 @@ static inline void paravirt_exit_mmap(st
 {
 }
 
+static inline void paravirt_init_pda(struct i386_pda *pda, int cpu)
+{
+}
 #endif /* CONFIG_PARAVIRT */
 #endif	/* __ASM_PARAVIRT_H */
===================================================================
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -16,6 +16,17 @@ struct i386_pda
 	int cpu_number;
 	struct task_struct *pcurrent;	/* current process */
 	struct pt_regs *irq_regs;
+
+#ifdef CONFIG_PARAVIRT
+	union {
+#ifdef CONFIG_XEN
+		struct {
+			struct vcpu_info *vcpu;
+			unsigned long cr3;
+		} xen;
+#endif	/* CONFIG_XEN */
+	};
+#endif	/* CONFIG_PARAVIRT */
 };
 
 extern struct i386_pda *_cpu_pda[];
===================================================================
--- /dev/null
+++ b/include/xen/events.h
@@ -0,0 +1,28 @@
+#ifndef _XEN_EVENTS_H
+#define _XEN_EVENTS_H
+
+#include <linux/irq.h>
+
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+			      irqreturn_t (*handler)(int, void *),
+			      unsigned long irqflags, const char *devname,
+			      void *dev_id);
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+			    irqreturn_t (*handler)(int, void *),
+			    unsigned long irqflags, const char *devname, void *dev_id);
+
+/*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (even for bindings
+ * made with bind_evtchn_to_irqhandler()).
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+static inline void notify_remote_via_evtchn(int port)
+{
+	struct evtchn_send send = { .port = port };
+	(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
+}
+
+extern void notify_remote_via_irq(int irq);
+#endif	/* _XEN_EVENTS_H */
===================================================================
--- /dev/null
+++ b/include/xen/features.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * features.h
+ *
+ * Query the features reported by Xen.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __XEN_FEATURES_H__
+#define __XEN_FEATURES_H__
+
+#include <xen/interface/features.h>
+
+void xen_setup_features(void);
+
+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
+
+static inline int xen_feature(int flag)
+{
+	switch(flag) {
+	}
+
+	return xen_features[flag];
+}
+
+#endif /* __ASM_XEN_FEATURES_H__ */
===================================================================
--- /dev/null
+++ b/include/xen/page.h
@@ -0,0 +1,175 @@
+#ifndef __XEN_PAGE_H
+#define __XEN_PAGE_H
+
+#include <linux/pfn.h>
+
+#include <asm/uaccess.h>
+
+#include <xen/features.h>
+
+#ifdef CONFIG_X86_PAE
+/* Xen machine address */
+typedef struct xmaddr {
+	unsigned long long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+	unsigned long long paddr;
+} xpaddr_t;
+#else
+/* Xen machine address */
+typedef struct xmaddr {
+	unsigned long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+	unsigned long paddr;
+} xpaddr_t;
+#endif
+
+#define XMADDR(x)	((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x)	((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY	(~0UL)
+#define FOREIGN_FRAME_BIT	(1UL<<31)
+#define FOREIGN_FRAME(m)	((m) | FOREIGN_FRAME_BIT)
+
+extern unsigned long *phys_to_machine_mapping;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return pfn;
+
+	return phys_to_machine_mapping[(unsigned int)(pfn)] &
+		~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 1;
+
+	return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return mfn;
+
+#if 0
+	if (unlikely((mfn >> machine_to_phys_order) != 0))
+		return max_mapnr;
+#endif
+
+	pfn = 0;
+	/*
+	 * The array access can fail (e.g., device space beyond end of RAM).
+	 * In such cases it doesn't matter what we return (we return garbage),
+	 * but we must handle the fault without crashing!
+	 */
+	__get_user(pfn, &machine_to_phys_mapping[mfn]);
+
+	return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+	unsigned offset = phys.paddr & ~PAGE_MASK;
+	return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+	unsigned offset = machine.maddr & ~PAGE_MASK;
+	return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+	extern unsigned long max_mapnr;
+	unsigned long pfn = mfn_to_pfn(mfn);
+	if ((pfn < max_mapnr)
+	    && !xen_feature(XENFEAT_auto_translated_physmap)
+	    && (phys_to_machine_mapping[pfn] != mfn))
+		return max_mapnr; /* force !pfn_valid() */
+	return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (xen_feature(XENFEAT_auto_translated_physmap)) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+	phys_to_machine_mapping[pfn] = mfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
+                       (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+	pte_t pte;
+
+	pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | (pgprot_val(pgprot) >> 32);
+	pte.pte_high &= (__supported_pte_mask >> 32);
+	pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
+	pte.pte_low &= __supported_pte_mask;
+
+	return pte;
+}
+
+static inline unsigned long long pte_val_ma(pte_t x)
+{
+	return ((unsigned long long)x.pte_high << 32) | x.pte_low;
+}
+#define pmd_val_ma(v) ((v).pmd)
+#define pud_val_ma(v) ((v).pgd.pgd)
+#else  /* !X86_PAE */
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
+#define mfn_pte(pfn, prot)	__pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pte_val_ma(x)	((x).pte_low)
+#define pmd_val_ma(v)	((v).pud.pgd.pgd)
+#endif	/* CONFIG_X86_PAE */
+#define pgd_val_ma(x)	((x).pgd)
+
+#define __pte_ma(x)	((pte_t) { (x) } )
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* __XEN_PAGE_H */

--