All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0 of 7]  lguest host code
@ 2007-02-09 14:59 Rusty Russell
  2007-02-09 15:03 ` [PATCH 1 of 7] lguest: Move mce_disabled to asm/mce.h so lguest can use it Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 14:59 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

These seven patches replace [PATCH 6/10], or in the second send, patches
6a, 6b, 6c and 6d.

There are more patches because Andi hassled me about extern decls, so I
had to expose some things.  Tiny patches, but worth separating because
you never know when such changes will break something.  Compiles and
runs here...

Thanks,
Rusty.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1 of 7]  lguest: Move mce_disabled to asm/mce.h so lguest can use it.
  2007-02-09 14:59 [PATCH 0 of 7] lguest host code Rusty Russell
@ 2007-02-09 15:03 ` Rusty Russell
  2007-02-09 15:03   ` [PATCH 2 of 7] lguest: Rename cpu_gdt_descr and remove extern declaration from smpboot.c Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:03 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
 #include <linux/init.h>
+#include <asm/mce.h>
 
 void amd_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_
 /* Call the installed machine check handler for this CPU setup. */
 extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
 
-extern int mce_disabled;
 extern int nr_mce_banks;
 
===================================================================
--- a/include/asm-i386/mce.h
+++ b/include/asm-i386/mce.h
@@ -3,3 +3,5 @@ extern void mcheck_init(struct cpuinfo_x
 #else
 #define mcheck_init(c) do {} while(0)
 #endif
+
+extern int mce_disabled;



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2 of 7]  lguest: Rename cpu_gdt_descr and remove extern declaration from smpboot.c
  2007-02-09 15:03 ` [PATCH 1 of 7] lguest: Move mce_disabled to asm/mce.h so lguest can use it Rusty Russell
@ 2007-02-09 15:03   ` Rusty Russell
  2007-02-09 15:04     ` [PATCH 3 of 7] lguest: Remove extern declaration from mm/discontig.c, put in header Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:03 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

When I implemented the DECLARE_PER_CPU(var) macros, I was careful that
people couldn't use "var" in a non-percpu context, by prepending
percpu__.  I never considered that this would allow them to overload
the same name for a per-cpu and a non-percpu variable.

It is only one of many horrors in the i386 boot code, but let's rename
the non-percpu cpu_gdt_descr to early_gdt_descr (not boot_gdt_descr,
that's something else...)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -309,7 +309,7 @@ 2:	movl %cr0,%eax
 
 	call check_x87
 	call setup_pda
-	lgdt cpu_gdt_descr
+	lgdt early_gdt_descr
 	lidt idt_descr
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
@@ -365,7 +365,7 @@ setup_pda:
 	movl start_pda, %eax
 
 	/* slot the PDA address into the GDT */
-	mov cpu_gdt_descr+2, %ecx
+	mov early_gdt_descr+2, %ecx
 	mov %ax, (__KERNEL_PDA+0+2)(%ecx)		/* base & 0x0000ffff */
 	shr $16, %eax
 	mov %al, (__KERNEL_PDA+4+0)(%ecx)		/* base & 0x00ff0000 */
@@ -588,7 +588,7 @@ idt_descr:
 
 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
-ENTRY(cpu_gdt_descr)
+ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
 	.long cpu_gdt_table
 
===================================================================
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -619,7 +619,6 @@ extern struct {
 	unsigned short ss;
 } stack_start;
 extern struct i386_pda *start_pda;
-extern struct Xgt_desc_struct cpu_gdt_descr;
 
 #ifdef CONFIG_NUMA
 
===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -22,7 +22,7 @@ struct Xgt_desc_struct {
 
 extern struct Xgt_desc_struct idt_descr;
 DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-
+extern struct Xgt_desc_struct early_gdt_descr;
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 3 of 7]  lguest: Remove extern declaration from mm/discontig.c, put in header.
  2007-02-09 15:03   ` [PATCH 2 of 7] lguest: Rename cpu_gdt_descr and remove extern declaration from smpboot.c Rusty Russell
@ 2007-02-09 15:04     ` Rusty Russell
  2007-02-09 15:09       ` [PATCH 4 of 7] lguest: Config and headers Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:04 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

lguest wants it too, but Andi won't let me put an extern decl in.
Times, they are a'changin.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -101,7 +101,6 @@ extern void add_one_highpage_init(struct
 extern void add_one_highpage_init(struct page *, int, int);
 
 extern struct e820map e820;
-extern unsigned long init_pg_tables_end;
 extern unsigned long highend_pfn, highstart_pfn;
 extern unsigned long max_low_pfn;
 extern unsigned long totalram_pages;
===================================================================
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -77,6 +77,8 @@ void __init add_memory_region(unsigned l
 void __init add_memory_region(unsigned long long start,
 			      unsigned long long size, int type);
 
+extern unsigned long init_pg_tables_end;
+
 #endif /* __ASSEMBLY__ */
 
 #endif  /*  __KERNEL__  */



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 4 of 7]  lguest: Config and headers
  2007-02-09 15:04     ` [PATCH 3 of 7] lguest: Remove extern declaration from mm/discontig.c, put in header Rusty Russell
@ 2007-02-09 15:09       ` Rusty Russell
  2007-02-09 15:14         ` [PATCH 5 of 7] lguest: the host code (lg.ko) Rusty Russell
  2007-02-09 18:15         ` [PATCH 4 of 7] lguest: Config and headers James Morris
  0 siblings, 2 replies; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:09 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

[ This is the previous 6a, with the following Andi-inspired changes:

1) use HYPERVISOR_MAP_ORDER instead of HYPERVISOR_SIZE for clarity
2) mutex instead of a semaphore ]

Unfortunately, we don't have the build infrastructure for "private"
asm-offsets.h files, so there's a not-so-neat include in
arch/i386/kernel/asm-offsets.c.

The four headers are:
asm/lguest.h:
	Things the guest needs to know (hypercall numbers, etc).
asm/lguest_device.h:
	Things lguest devices need to know (lguest bus registration)
asm/lguest_user.h:
	Things that the lguest userspace utility needs (/dev/lguest
	and some devices)
arch/i386/lguest/lg.h:
	Internal header for the lg module (which consists of 8 files).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -226,6 +226,27 @@ config ES7000_CLUSTERED_APIC
 	depends on SMP && X86_ES7000 && MPENTIUMIII
 
 source "arch/i386/Kconfig.cpu"
+
+config LGUEST
+	tristate "Linux hypervisor example code"
+	depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE
+	select LGUEST_GUEST
+	select HVC_DRIVER
+	---help---
+	  This is a very simple module which allows you to run
+	  multiple instances of the same Linux kernel, using the
+	  "lguest" command found in the Documentation/lguest directory.
+	  Note that "lguest" is pronounced to rhyme with "fell quest",
+	  not "rustyvisor".  See Documentation/lguest/lguest.txt.
+
+	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
+
+config LGUEST_GUEST
+	bool
+	help
+	  The guest needs code built-in, even if the host has lguest
+	  support as a module.  The drivers are tiny, so we build them
+	  in too.
 
 config HPET_TIMER
 	bool "HPET Timer Support"
===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -16,6 +16,10 @@
 #include <asm/thread_info.h>
 #include <asm/elf.h>
 #include <asm/pda.h>
+#ifdef CONFIG_LGUEST_GUEST
+#include <asm/lguest.h>
+#include "../lguest/lg.h"
+#endif
 
 #define DEFINE(sym, val) \
         asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -111,4 +115,19 @@ void foo(void)
 	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
 	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
 #endif
+
+#ifdef CONFIG_LGUEST_GUEST
+	BLANK();
+	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_STATE_host_stackptr, lguest_state, host.stackptr);
+	OFFSET(LGUEST_STATE_host_pgdir, lguest_state, host.pgdir);
+	OFFSET(LGUEST_STATE_host_gdt, lguest_state, host.gdt);
+	OFFSET(LGUEST_STATE_host_idt, lguest_state, host.idt);
+	OFFSET(LGUEST_STATE_regs, lguest_state, regs);
+	OFFSET(LGUEST_STATE_gdt, lguest_state, gdt);
+	OFFSET(LGUEST_STATE_idt, lguest_state, idt);
+	OFFSET(LGUEST_STATE_gdt_table, lguest_state, gdt_table);
+	OFFSET(LGUEST_STATE_trapnum, lguest_state, regs.trapnum);
+	OFFSET(LGUEST_STATE_errcode, lguest_state, regs.errcode);
+#endif
 }
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lg.h
@@ -0,0 +1,253 @@
+#ifndef _LGUEST_H
+#define _LGUEST_H
+
+#include <asm/desc.h>
+/* 64k ought to be enough for anybody! */
+#define HYPERVISOR_MAP_ORDER 16
+#define HYPERVISOR_PAGES ((1 << HYPERVISOR_MAP_ORDER)/PAGE_SIZE)
+
+#define GDT_ENTRY_LGUEST_CS	10
+#define GDT_ENTRY_LGUEST_DS	11
+#define LGUEST_CS		(GDT_ENTRY_LGUEST_CS * 8)
+#define LGUEST_DS		(GDT_ENTRY_LGUEST_DS * 8)
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <linux/binfmts.h>
+#include <linux/futex.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/semaphore.h>
+#include "irq_vectors.h"
+
+#define GUEST_DPL 1
+
+struct lguest_regs
+{
+	/* Manually saved part. */
+	u32 cr3;
+	u32 ebx, ecx, edx;
+	u32 esi, edi, ebp;
+	u32 gs;
+	u32 eax;
+	u32 fs, ds, es;
+	u32 trapnum, errcode;
+	/* Trap pushed part */
+	u32 eip;
+	u32 cs;
+	u32 eflags;
+	u32 esp;
+	u32 ss;
+};
+
+__exit void free_pagetables(void);
+__init int init_pagetables(struct page *hype_pages);
+
+/* Full 4G segment descriptors, suitable for CS and DS. */
+#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) 
+#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) 
+
+/* Simplified version of IDT. */
+struct host_trap
+{
+	unsigned long addr;
+	int disable_interrupts;
+};
+
+struct lguest_dma_info
+{
+	struct list_head list;
+	union futex_key key;
+	unsigned long dmas;
+	u16 next_dma;
+	u16 num_dmas;
+	u16 guestid;
+	u8 interrupt; 	/* 0 when not registered */
+};
+
+struct pgdir
+{
+	u32 cr3;
+	u32 *pgdir;
+};
+
+/* The private info the thread maintains about the guest. */
+struct lguest
+{
+	struct lguest_state *state;
+	struct lguest_data __user *lguest_data;
+	struct task_struct *tsk;
+	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
+	u16 guestid;
+	u32 pfn_limit;
+	u32 page_offset;
+	u32 cr2;
+	int timer_on;
+	int halted;
+	int ts;
+	u32 gpf_eip;
+	u32 last_timer;
+	u32 next_hcall;
+	u16 tls_limits[GDT_ENTRY_TLS_ENTRIES];
+
+	/* We keep a small number of these. */
+	u32 pgdidx;
+	struct pgdir pgdirs[4];
+	void *trap_page;
+
+	/* Cached wakeup: we hold a reference to this task. */
+	struct task_struct *wake;
+
+	unsigned long noirq_start, noirq_end;
+	int dma_is_pending;
+	unsigned long pending_dma; /* struct lguest_dma */
+	unsigned long pending_addr; /* address they're sending to */
+
+	unsigned int stack_pages;
+
+	struct lguest_dma_info dma[LGUEST_MAX_DMA];
+
+	/* Dead? */
+	const char *dead;
+
+	/* We intercept page fault (demand shadow paging & cr2 saving)
+	   protection fault (in/out emulation, TLS handling) and
+	   device not available (TS handling). */
+	struct host_trap page_trap, gpf_trap, fpu_trap;
+
+	/* Virtual interrupts */
+	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
+	struct host_trap interrupt[LGUEST_IRQS];
+};
+
+extern struct page *hype_pages; /* Contiguous pages. */
+extern struct lguest lguests[];
+extern struct mutex lguest_lock;
+
+/* core.c: */
+/* Entry points in hypervisor */
+const unsigned long *__lguest_default_idt_entries(void);
+struct lguest_state *__lguest_states(void);
+/* Guest memory accessors: on a bad address they mark the guest dead. */
+u32 lhread_u32(struct lguest *lg, u32 addr);
+/* Address first, then value, exactly as the definition in core.c takes them
+ * (both are u32, so the compiler cannot catch a swapped call). */
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val);
+void lhread(struct lguest *lg, void *buf, u32 addr, unsigned bytes);
+void lhwrite(struct lguest *lg, u32 addr, const void *buf, unsigned bytes);
+/* Non-zero if addr lies on a page frame below the guest's pfn_limit. */
+int lguest_address_ok(const struct lguest *lg, unsigned long addr);
+int run_guest(struct lguest *lg, char *__user user);
+/* Index of an unused lguests[] slot, or -1 if there is none. */
+int find_free_guest(void);
+
+/* interrupts_and_traps.c: */
+void maybe_do_interrupt(struct lguest *lg);
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err);
+void check_bug_kill(struct lguest *lg);
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
+
+/* segments.c: */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num);
+void guest_load_tls(struct lguest *lg,
+		    const struct desc_struct __user *tls_array);
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable);
+void free_guest_pagetable(struct lguest *lg);
+void guest_new_pagetable(struct lguest *lg, u32 pgtable);
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 i);
+void guest_pagetable_clear_all(struct lguest *lg);
+void guest_pagetable_flush_user(struct lguest *lg);
+void guest_set_pte(struct lguest *lg, unsigned long cr3,
+		   unsigned long vaddr, u32 val);
+void map_trap_page(struct lguest *info);
+int demand_page(struct lguest *info, u32 cr2, int write);
+void pin_stack_pages(struct lguest *lg);
+
+int lguest_device_init(void);
+void lguest_device_remove(void);
+void lguest_io_init(void);
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long udma, u16 numdmas,u8 interrupt);
+int send_dma(struct lguest *info, unsigned long addr,
+	     unsigned long udma);
+void release_all_dma(struct lguest *lg);
+unsigned long get_dma_buffer(struct lguest *lg, unsigned long addr,
+			     unsigned long *interrupt);
+
+void set_wakeup_process(struct lguest *lg, struct task_struct *p);
+int do_async_hcalls(struct lguest *info);
+int hypercall(struct lguest *info, struct lguest_regs *regs);
+
+/* Mark the guest dead, recording a printf-style reason in (lg)->dead.
+ * Only the first call has any effect: once ->dead is set it is left alone.
+ * If kasprintf() cannot allocate the message, the non-NULL sentinel
+ * (void *)1 is stored instead, so ->dead is not always a valid string. */
+#define kill_guest(lg, fmt...)					\
+do {								\
+	if (!(lg)->dead) {					\
+		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);	\
+		if (!(lg)->dead)				\
+			(lg)->dead = (void *)1;			\
+	}							\
+} while(0)
+
+/* Convert a guest virtual address to a guest physical one by subtracting
+ * the guest's page_offset. */
+static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
+{
+	unsigned long offset = lg->page_offset;
+
+	return vaddr - offset;
+}
+
+/* Hardware-defined TSS structure. */
+struct x86_tss
+{
+	unsigned short	back_link,__blh;
+	unsigned long	esp0;
+	unsigned short	ss0,__ss0pad;
+	unsigned long	esp1;
+	unsigned short	ss1,__ss1pad;
+	unsigned long	esp2;
+	unsigned short	ss2,__ss2pad;
+	unsigned long	cr3;
+	unsigned long	eip;
+	unsigned long	eflags;
+	unsigned long	eax,ecx,edx,ebx;
+	unsigned long	esp; /* We actually use this one to save esp. */
+	unsigned long	ebp;
+	unsigned long	esi;
+	unsigned long	edi;
+	unsigned short	es, __espad;
+	unsigned short	cs, __cspad;
+	unsigned short	ss, __sspad;
+	unsigned short	ds, __dspad;
+	unsigned short	fs, __fspad;
+	unsigned short	gs, __gspad;
+	unsigned short	ldt, __ldtpad;
+	unsigned short	trace, io_bitmap_base;
+};
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss);
+
+struct lguest_host_state
+{
+	struct Xgt_desc_struct	gdt;
+	struct Xgt_desc_struct	idt;
+	unsigned long		pgdir;
+	unsigned long		stackptr;
+};
+
+/* This sits in the high-mapped shim. */
+struct lguest_state
+{
+	/* Task state segment (hardware-defined layout, see struct x86_tss). */
+	struct x86_tss tss;
+
+	/* Global descriptor table: its descriptor, then the table itself. */
+	struct Xgt_desc_struct gdt;
+	struct desc_struct gdt_table[GDT_ENTRIES];
+
+	/* Interrupt descriptor table: its descriptor, then the table itself. */
+	struct Xgt_desc_struct idt;
+	struct desc_struct idt_table[IDT_ENTRIES];
+
+	/* Host state we store while the guest runs. */
+	struct lguest_host_state host;
+
+	/* This is the stack on which we push our regs. */
+	struct lguest_regs regs;
+};
+#endif	/* __ASSEMBLY__ */
+#endif	/* _LGUEST_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest.h
@@ -0,0 +1,86 @@
+/* Things the lguest guest needs to know. */
+#ifndef _ASM_LGUEST_H
+#define _ASM_LGUEST_H
+
+#define LGUEST_MAGIC_EBP 0x4C687970
+#define LGUEST_MAGIC_EDI 0x652D4D65
+#define LGUEST_MAGIC_ESI 0xFFFFFFFF
+
+#define LHCALL_FLUSH_ASYNC	0
+#define LHCALL_LGUEST_INIT	1
+#define LHCALL_CRASH		2
+#define LHCALL_LOAD_GDT		3
+#define LHCALL_NEW_PGTABLE	4
+#define LHCALL_FLUSH_TLB	5
+#define LHCALL_LOAD_IDT_ENTRY	6
+#define LHCALL_SET_STACK	7
+#define LHCALL_TS		8
+#define LHCALL_TIMER_READ	9
+#define LHCALL_TIMER_START	10
+#define LHCALL_HALT		11
+#define LHCALL_GET_WALLCLOCK	12
+#define LHCALL_BIND_DMA		13
+#define LHCALL_SEND_DMA		14
+#define LHCALL_SET_PTE		15
+#define LHCALL_SET_UNKNOWN_PTE	16
+#define LHCALL_SET_PUD		17
+#define LHCALL_LOAD_TLS		18
+
+#define LGUEST_TRAP_ENTRY 0x1F
+
+/* Make a synchronous hypercall by raising software interrupt
+ * LGUEST_TRAP_ENTRY.  The call number goes in eax and the three arguments
+ * in edx, ebx and ecx respectively; whatever is left in eax afterwards is
+ * returned.  The "memory" clobber stops the compiler caching memory values
+ * across the call. */
+static inline unsigned long
+hcall(unsigned long call,
+      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+		     : "=a"(call)
+		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 
+		     : "memory");
+	return call;
+}
+
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+#define LGUEST_IRQS 32
+
+#define LHCALL_RING_SIZE 64
+struct hcall_ring
+{
+	u32 eax, edx, ebx, ecx;
+};
+
+/* All the good stuff happens here: guest registers it with LGUEST_INIT */
+struct lguest_data
+{
+/* Fields which change during running: */
+	/* 512 == enabled (same as eflags) */
+	unsigned int irq_enabled;
+	/* Blocked interrupts. */
+	DECLARE_BITMAP(interrupts, LGUEST_IRQS); 
+
+	/* Last (userspace) address we got a GPF & reloaded gs. */
+	unsigned int gs_gpf_eip;
+
+	/* Virtual address of page fault. */
+	unsigned long cr2;
+
+	/* Async hypercall ring.  0xFF == done, 0 == pending. */
+	u8 hcall_status[LHCALL_RING_SIZE];
+	struct hcall_ring hcalls[LHCALL_RING_SIZE];
+			
+/* Fields initialized by the hypervisor at boot: */
+	/* Memory not to try to access */
+	unsigned long reserve_mem;
+	/* ID of this guest (used by network driver to set ethernet address) */
+	u16 guestid;
+	/* Multiplier for TSC clock. */
+	u32 clock_mult;
+
+/* Fields initialized by the guest at boot: */
+	/* Instruction range to suppress interrupts even if enabled */
+	unsigned long noirq_start, noirq_end;
+};
+extern struct lguest_data lguest_data;
+extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */
+#endif	/* _ASM_LGUEST_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_device.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_LGUEST_DEVICE_H
+#define _ASM_LGUEST_DEVICE_H
+/* Everything you need to know about lguest devices. */
+#include <linux/device.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+
+struct lguest_device {
+	/* Unique busid, and index into lguest_page->devices[] */
+	/* By convention, each device can use irq index+1 if it wants to. */
+	unsigned int index;
+
+	struct device dev;
+
+	/* Driver can hang data off here. */
+	void *private;
+};
+
+struct lguest_driver {
+	const char *name;
+	struct module *owner;
+	u16 device_type;
+	int (*probe)(struct lguest_device *dev);
+	void (*remove)(struct lguest_device *dev);
+
+	struct device_driver drv;
+};
+
+extern int register_lguest_driver(struct lguest_driver *drv);
+extern void unregister_lguest_driver(struct lguest_driver *drv);
+#endif /* _ASM_LGUEST_DEVICE_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/lguest_user.h
@@ -0,0 +1,86 @@
+#ifndef _ASM_LGUEST_USER
+#define _ASM_LGUEST_USER
+/* Everything the "lguest" userspace program needs to know. */
+/* They can register up to 32 arrays of lguest_dma. */
+#define LGUEST_MAX_DMA		32
+/* At most we can dma 16 lguest_dma in one op. */
+#define LGUEST_MAX_DMA_SECTIONS	16
+
+/* How many devices?  Assume each one wants up to two dma arrays per device. */
+#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
+
+struct lguest_dma
+{
+	/* 0 if free to be used, filled by hypervisor. */
+ 	u32 used_len;
+	u32 addr[LGUEST_MAX_DMA_SECTIONS];
+	u16 len[LGUEST_MAX_DMA_SECTIONS];
+};
+
+/* This is found at address 0. */
+struct lguest_boot_info
+{
+	u32 max_pfn;
+	u32 initrd_size;
+	char cmdline[256];
+};
+
+struct lguest_block_page
+{
+	/* 0 is a read, 1 is a write. */
+	int type;
+	u32 sector; 	/* Offset in device = sector * 512. */
+	u32 bytes;	/* Length expected to be read/written in bytes */
+	/* 0 = pending, 1 = done, 2 = done, error */
+	int result;
+	u32 num_sectors; /* Disk length = num_sectors * 512 */
+};
+
+/* There is a shared page of these. */
+struct lguest_net
+{
+	union {
+		unsigned char mac[6];
+		struct {
+			u8 promisc;
+			u8 pad;
+			u16 guestid;
+		};
+	};
+};
+
+/* lguest_device_desc->type */
+#define LGUEST_DEVICE_T_CONSOLE	1
+#define LGUEST_DEVICE_T_NET	2
+#define LGUEST_DEVICE_T_BLOCK	3
+
+/* lguest_device_desc->status.  256 and above are device specific. */
+#define LGUEST_DEVICE_S_ACKNOWLEDGE	1 /* We have seen device. */
+#define LGUEST_DEVICE_S_DRIVER		2 /* We have found a driver */
+#define LGUEST_DEVICE_S_DRIVER_OK	4 /* Driver says OK! */
+#define LGUEST_DEVICE_S_REMOVED		8 /* Device has gone away. */
+#define LGUEST_DEVICE_S_REMOVED_ACK	16 /* Driver has been told. */
+#define LGUEST_DEVICE_S_FAILED		128 /* Something actually failed */
+
+#define LGUEST_NET_F_NOCSUM		0x4000 /* Don't bother checksumming */
+#define LGUEST_DEVICE_F_RANDOMNESS	0x8000 /* IRQ is fairly random */
+
+/* We have a page of these descriptors in the lguest_device page. */
+struct lguest_device_desc {
+	u16 type;
+	u16 features;
+	u16 status;
+	u16 num_pages;
+	u32 pfn;
+};
+
+/* Write command first word is a request. */
+enum lguest_req
+{
+	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
+	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
+	LHREQ_IRQ, /* + irq */
+};
+
+
+#endif /* _ASM_LGUEST_USER */



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 5 of 7]  lguest: the host code (lg.ko).
  2007-02-09 15:09       ` [PATCH 4 of 7] lguest: Config and headers Rusty Russell
@ 2007-02-09 15:14         ` Rusty Russell
  2007-02-09 15:17           ` [PATCH 6 of 7] lguest: Guest code Rusty Russell
  2007-02-09 18:15         ` [PATCH 4 of 7] lguest: Config and headers James Morris
  1 sibling, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:14 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

[ This is patch 6b, with the following Andifications:

1) lguest_entry is now marked used: after the other changes, gcc
   optimized it out.
2) Use header to define math_state_restore, not manual extern.
3) try_to_freeze() inside run loop.
4) cpu_hotplug locks on init and exit code as we frob PGE on all cpus.
5) Remove the log() macro which so amused lkml.
6) switch in IDT interpreting code instead of if chaining. ]

This is the host module (lg.ko) which supports lguest:

arch/i386/lguest/hypervisor.S:
	The actual guest <-> host switching code.  This is compiled into
	a C array, which is mapped to 0xFFC01000 in host and guests.

arch/i386/lguest/core.c:
	The core of the hypervisor, which calls into the assembler
	code which does the actual switch.  Also contains helper
	routines.

arch/i386/lguest/hypercalls.c:
	The entry point for the 19 hypercalls.

arch/i386/lguest/interrupts_and_traps.c:
	Handling of interrupts and traps, except page faults.

arch/i386/lguest/io.c:
	I/O from guest to host, and between guests.

arch/i386/lguest/lguest_user.c:
	/dev/lguest interface for lguest program to launch/control guests.

arch/i386/lguest/page_tables.c:
	Shadow Page table handling: generally we build up the shadow
	page tables by converting from guest page tables when a fault occurs.

arch/i386/lguest/segments.c:
	Segmentation (GDT) handling: we have to ensure they're trimmed
	to avoid guest access to the switching code.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- /dev/null
+++ b/arch/i386/lguest/core.c
@@ -0,0 +1,432 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future.  Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/cpu.h>
+#include <linux/freezer.h>
+#include <asm/lguest.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include <asm/i387.h>
+#include "lg.h"
+
+/* This is our hypervisor, compiled from hypervisor.S. */
+static char __initdata hypervisor_blob[] = {
+#include "hypervisor-blob.c"
+};
+
+#define MAX_LGUEST_GUESTS						\
+	(((1 << HYPERVISOR_MAP_ORDER) - sizeof(hypervisor_blob))	\
+	 / sizeof(struct lguest_state))
+
+static struct vm_struct *hypervisor_vma;
+static int cpu_had_pge;
+static struct {
+	unsigned long offset;
+	unsigned short segment;
+} lguest_entry __attribute_used__;
+struct page *hype_pages; /* Contiguous pages. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+DEFINE_MUTEX(lguest_lock);
+
+/* The default IDT entries live at the very beginning of the hypervisor,
+ * i.e. at HYPE_ADDR itself. */
+const unsigned long *__lguest_default_idt_entries(void)
+{
+	const unsigned long *entries = (void *)HYPE_ADDR;
+
+	return entries;
+}
+
+/* switch_to_guest comes next: HYPE_DATA_SIZE bytes past HYPE_ADDR. */
+static void *__lguest_switch_to_guest(void)
+{
+	char *base = (char *)HYPE_ADDR;
+
+	return base + HYPE_DATA_SIZE;
+}
+
+/* Everything after the hypervisor blob is used to hold guest state. */
+struct lguest_state *__lguest_states(void)
+{
+	char *base = (char *)HYPE_ADDR;
+
+	return (struct lguest_state *)(base + sizeof(hypervisor_blob));
+}
+
+static __init int map_hypervisor(void)
+{
+	unsigned int i;
+	int err;
+	struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
+
+	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_MAP_ORDER);
+	if (!hype_pages)
+		return -ENOMEM;
+
+	hypervisor_vma = __get_vm_area(1 << HYPERVISOR_MAP_ORDER, VM_ALLOC,
+				       HYPE_ADDR, VMALLOC_END);
+	if (!hypervisor_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map hypervisor pages high\n");
+		goto free_pages;
+	}
+
+	for (i = 0; i < HYPERVISOR_PAGES; i++)
+		pages[i] = hype_pages + i;
+
+	err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
+	if (err) {
+		printk("lguest: map_vm_area failed: %i\n", err);
+		goto free_vma;
+	}
+	memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
+
+	/* Setup LGUEST segments on all cpus */
+	for_each_possible_cpu(i) {
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+		get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+	}
+
+	/* Initialize entry point into hypervisor. */
+	lguest_entry.offset = (long)__lguest_switch_to_guest();
+	lguest_entry.segment = LGUEST_CS;
+
+	printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr);
+	return 0;
+
+free_vma:
+	vunmap(hypervisor_vma->addr);
+free_pages:
+	__free_pages(hype_pages, HYPERVISOR_MAP_ORDER);
+	return err;
+}
+
+static __exit void unmap_hypervisor(void)
+{
+	vunmap(hypervisor_vma->addr);
+	__free_pages(hype_pages, HYPERVISOR_MAP_ORDER);
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing.
+ *
+ * Reads the instruction byte at the guest's eip (skipping one 0x66 prefix
+ * if present) and, if it is one of the four IN/OUT opcode pairs handled in
+ * the switch below, advances the guest's eip past it.  An IN additionally
+ * sets bits of the guest's eax to ones; an OUT is simply dropped.
+ *
+ * Returns 1 if the instruction was emulated, 0 if it was not (unknown
+ * opcode, or eip below the guest's page_offset). */
+static int emulate_insn(struct lguest *lg)
+{
+	u8 insn;
+	unsigned int insnlen = 0, in = 0, shift = 0;
+	unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < lg->page_offset)
+		return 0;
+	lhread(lg, &insn, physaddr, 1);
+
+	/* Operand size prefix means it's actually for ax. */
+	if (insn == 0x66) {
+		shift = 16;
+		insnlen = 1;
+		lhread(lg, &insn, physaddr + insnlen, 1);
+	}
+
+	/* Mask off the low bit: each opcode pair differs only in that bit. */
+	switch (insn & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		insnlen += 1;
+		break;
+	default:
+		return 0;
+	}
+
+	if (in) {
+		/* Lower bit tells us whether it's a 16 or 32 bit access */
+		/* NOTE(review): a clear low bit normally selects the byte-wide
+		 * form of these opcodes -- confirm the masks below are the
+		 * intended ones. */
+		if (insn & 0x1)
+			lg->state->regs.eax = 0xFFFFFFFF;
+		else
+			lg->state->regs.eax |= (0xFFFF << shift);
+	}
+	lg->state->regs.eip += insnlen;
+	return 1;
+}
+
+/* Find the first lguests[] slot that has no state attached.
+ * Returns its index, or -1 when every slot is in use. */
+int find_free_guest(void)
+{
+	unsigned int slot = 0;
+
+	while (slot < MAX_LGUEST_GUESTS) {
+		if (lguests[slot].state == NULL)
+			return slot;
+		slot++;
+	}
+	return -1;
+}
+
+/* Non-zero when addr falls on a page frame below the guest's pfn_limit. */
+int lguest_address_ok(const struct lguest *lg, unsigned long addr)
+{
+	unsigned long pfn = addr / PAGE_SIZE;
+
+	return pfn < lg->pfn_limit;
+}
+
+/* get_user() for the guest: an address that fails lguest_address_ok() is
+ * refused without being touched.  A refused or faulting read kills the
+ * guest. */
+u32 lhread_u32(struct lguest *lg, u32 addr)
+{
+	u32 val = 0;
+	int failed;
+
+	/* Range check first; || short-circuits so bad addresses are never read */
+	failed = !lguest_address_ok(lg, addr)
+		|| get_user(val, (u32 __user *)addr) != 0;
+	if (failed)
+		kill_guest(lg, "bad read address %u", addr);
+	return val;
+}
+
+/* put_user() for the guest: store val at addr, killing the guest if the
+ * address fails lguest_address_ok() or the store faults. */
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+	int failed;
+
+	/* Range check first; || short-circuits so bad addresses are never written */
+	failed = !lguest_address_ok(lg, addr)
+		|| put_user(val, (u32 __user *)addr) != 0;
+	if (failed)
+		kill_guest(lg, "bad write address %u", addr);
+}
+
+/* Copy bytes from guest address addr into b.  If addr + bytes wraps, the
+ * end address fails lguest_address_ok(), or the copy faults, b is zeroed
+ * and the guest is killed. */
+void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+	u32 end = addr + bytes;
+	int ok = end >= addr
+		&& lguest_address_ok(lg, end)
+		&& copy_from_user(b, (void __user *)addr, bytes) == 0;
+
+	if (!ok) {
+		/* copy_from_user should do this, but as we rely on it... */
+		memset(b, 0, bytes);
+		kill_guest(lg, "bad read address %u len %u", addr, bytes);
+	}
+}
+
+/* Copy bytes from b to guest address addr.  The guest is killed if
+ * addr + bytes wraps, the end address fails lguest_address_ok(), or the
+ * copy faults. */
+void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+	u32 end = addr + bytes;
+	int ok = end >= addr
+		&& lguest_address_ok(lg, end)
+		&& copy_to_user((void __user *)addr, b, bytes) == 0;
+
+	if (!ok)
+		kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+/* Saves exporting idt_table from kernel: read the IDT register with sidt
+ * on the current cpu and return the table address it reports. */
+static struct desc_struct *get_idt_table(void)
+{
+	struct Xgt_desc_struct idt;
+
+	asm("sidt %0":"=m" (idt));
+	return (void *)idt.address;
+}
+
+/* True if the privilege bits of the saved cs selector equal USER_RPL. */
+static int usermode(struct lguest_regs *regs)
+{
+	u32 rpl = regs->cs & SEGMENT_RPL_MASK;
+
+	return rpl == USER_RPL;
+}
+
+/* Trap page resets this when it reloads gs.
+ *
+ * Returns 0 if the gs_gpf_eip recorded in the guest's lguest_data already
+ * equals regs->eip; otherwise records regs->eip there and returns 1.
+ * NOTE(review): the get_user()/put_user() results are ignored -- confirm
+ * that a faulting lguest_data pointer is acceptable here. */
+static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs)
+{
+	u32 eip;
+	get_user(eip, &lg->lguest_data->gs_gpf_eip);
+	if (eip == regs->eip)
+		return 0;
+	put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
+	return 1;
+}
+
+/* If guest_ts is non-zero, make sure bit 3 (value 8, the TS flag) of cr0 is
+ * set, writing cr0 only when the bit is not set already.  Does nothing
+ * when guest_ts is zero: the bit is never cleared here. */
+static void set_ts(unsigned int guest_ts)
+{
+	u32 cr0;
+	if (guest_ts) {
+		asm("movl %%cr0,%0":"=r" (cr0));
+		if (!(cr0 & 8))
+			asm("movl %0,%%cr0": :"r" (cr0|8));
+	}
+}
+
+/* Enter the hypervisor once through the far-call target stored in
+ * lguest_entry.  eax carries lg->state and edx the host IDT table address
+ * on the way in; both registers are treated as overwritten on return. */
+static void run_guest_once(struct lguest *lg)
+{
+	unsigned int clobber;
+
+	/* Put eflags on stack, lcall does rest. */
+	asm volatile("pushf; lcall *lguest_entry"
+		     : "=a"(clobber), "=d"(clobber)
+		     : "0"(lg->state), "1"(get_idt_table())
+		     : "memory");
+}
+
+/* Run the guest until something needs the caller's attention.
+ *
+ * Each iteration services queued and synchronous hypercalls, delivers a
+ * pending virtual interrupt, switches into the guest once and then handles
+ * the trap that brought it back.
+ *
+ * Returns:
+ *   2*sizeof(unsigned long) after storing lg->pending_dma and
+ *	lg->pending_addr at `user` when a hypercall reports pending DMA;
+ *   -EFAULT if that store faults;
+ *   -EINTR if a signal is pending for the current task;
+ *   -ENOENT once the guest is dead.
+ */
+int run_guest(struct lguest *lg, char *__user user)
+{
+	struct lguest_regs *regs = &lg->state->regs;
+
+	while (!lg->dead) {
+		unsigned int cr2 = 0; /* Damn gcc */
+
+		/* Hypercalls first: we might have been out to userspace */
+		if (do_async_hcalls(lg))
+			goto pending_dma;
+
+		if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+			/* Only do hypercall once. */
+			regs->trapnum = 255;
+			if (hypercall(lg, regs))
+				goto pending_dma;
+		}
+
+		if (signal_pending(current))
+			return -EINTR;
+		maybe_do_interrupt(lg);
+
+		try_to_freeze();
+
+		if (lg->dead)
+			break;
+
+		if (lg->halted) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			continue;
+		}
+
+		/* Restore limits on TLS segments if in user mode. */
+		if (usermode(regs)) {
+			unsigned int i;
+			for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
+				lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
+					|= lg->tls_limits[i];
+		}
+
+		local_irq_disable();
+		map_trap_page(lg);
+
+		/* Host state to be restored after the guest returns. */
+		asm("sidt %0":"=m"(lg->state->host.idt));
+		lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
+
+		/* Even if *we* don't want FPU trap, guest might... */
+		set_ts(lg->ts);
+
+		run_guest_once(lg);
+
+		/* Save cr2 now if we page-faulted. */
+		if (regs->trapnum == 14)
+			asm("movl %%cr2,%0" :"=r" (cr2));
+		else if (regs->trapnum == 7)
+			math_state_restore();
+		local_irq_enable();
+
+		switch (regs->trapnum) {
+		case 13: /* We've intercepted a GPF. */
+			if (regs->errcode == 0) {
+				if (emulate_insn(lg))
+					continue;
+
+				/* FIXME: If it's reloading %gs in a loop? */
+				if (usermode(regs) && new_gfp_eip(lg,regs))
+					continue;
+			}
+
+			if (reflect_trap(lg, &lg->gpf_trap, 1))
+				continue;
+			break;
+		case 14: /* We've intercepted a page fault. */
+			if (demand_page(lg, cr2, regs->errcode & 2))
+				continue;
+
+			/* If lguest_data is NULL, this won't hurt. */
+			put_user(cr2, &lg->lguest_data->cr2);
+			if (reflect_trap(lg, &lg->page_trap, 1))
+				continue;
+			kill_guest(lg, "unhandled page fault at %#x"
+				   " (eip=%#x, errcode=%#x)",
+				   cr2, regs->eip, regs->errcode);
+			break;
+		case 7: /* We've intercepted a Device Not Available fault. */
+			/* If they don't want to know, just absorb it. */
+			if (!lg->ts)
+				continue;
+			if (reflect_trap(lg, &lg->fpu_trap, 0))
+				continue;
+			kill_guest(lg, "unhandled FPU fault at %#x",
+				   regs->eip);
+			break;
+		case 32 ... 255: /* Real interrupt, fall thru */
+			cond_resched();
+		case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+			continue;
+		case 6: /* Invalid opcode before they installed handler */
+			check_bug_kill(lg);
+			/* Falls out of the switch to the generic kill below. */
+		}
+		kill_guest(lg,"unhandled trap %i at %#x (err=%i)",
+			   regs->trapnum, regs->eip, regs->errcode);
+	}
+	return -ENOENT;
+
+pending_dma:
+	/* Don't claim success if the caller's buffer could not be written. */
+	if (put_user(lg->pending_dma, (unsigned long *)user)
+	    || put_user(lg->pending_addr, (unsigned long *)user+1))
+		return -EFAULT;
+	return sizeof(unsigned long)*2;
+}
+
+/* Size in bytes of member `elem` of struct lguest_state (no instance needed). */
+#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
+
+/* Set X86_CR4_PGE in %cr4 when `on` is non-NULL, clear it otherwise. */
+static void adjust_pge(void *on)
+{
+	unsigned long cr4 = read_cr4();
+
+	if (on)
+		cr4 |= X86_CR4_PGE;
+	else
+		cr4 &= ~X86_CR4_PGE;
+	write_cr4(cr4);
+}
+ 
+/* Module load.  Refuses to run when paravirt_enabled(); otherwise maps the
+ * hypervisor, builds the page tables, initialises I/O and the device, and
+ * finally turns PGE off on every CPU if the CPU has it.  Each failure undoes
+ * the steps already taken and returns the error. */
+static int __init init(void)
+{
+	int err;
+
+	if (paravirt_enabled())
+		return -EPERM;
+
+	err = map_hypervisor();
+	if (err)
+		goto out;
+
+	err = init_pagetables(hype_pages);
+	if (err)
+		goto unmap;
+	lguest_io_init();
+
+	err = lguest_device_init();
+	if (err)
+		goto free_pgtables;
+
+	lock_cpu_hotplug();
+	if (cpu_has_pge) { /* We have a broader idea of "global". */
+		cpu_had_pge = 1;
+		on_each_cpu(adjust_pge, NULL, 0, 1);
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+	}
+	unlock_cpu_hotplug();
+	return 0;
+
+free_pgtables:
+	free_pagetables();
+unmap:
+	unmap_hypervisor();
+out:
+	return err;
+}
+
+/* Module unload: remove the device, free the page tables, unmap the
+ * hypervisor, then (if init found PGE) restore the PGE feature bit and
+ * re-enable PGE in %cr4 on every CPU. */
+static void __exit fini(void)
+{
+	lguest_device_remove();
+	free_pagetables();
+	unmap_hypervisor();
+	lock_cpu_hotplug();
+	if (cpu_had_pge) {
+		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		/* Non-NULL argument asks adjust_pge to turn PGE on. */
+		on_each_cpu(adjust_pge, (void *)1, 0, 1);
+	}
+	unlock_cpu_hotplug();
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypercalls.c
@@ -0,0 +1,189 @@
+/*  Actual hypercalls, which allow guests to actually do something.
+    Copyright (C) 2006 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/clocksource.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+/* Record the stack the guest wants in the ss1/esp1 slots of its TSS, plus
+ * the number of stack pages to keep pinned.  A bad segment or page count
+ * kills the guest and leaves the previous settings untouched, so the
+ * rejected values are never stored or handed to pin_stack_pages(). */
+static void guest_set_stack(struct lguest *lg,
+			    u32 seg, u32 esp, unsigned int pages)
+{
+	/* You cannot have a stack segment with priv level 0. */
+	if ((seg & 0x3) != GUEST_DPL) {
+		kill_guest(lg, "bad stack segment %i", seg);
+		return;
+	}
+	if (pages > 2) {
+		kill_guest(lg, "bad stack pages %u", pages);
+		return;
+	}
+	lg->state->tss.ss1 = seg;
+	lg->state->tss.esp1 = esp;
+	lg->stack_pages = pages;
+	pin_stack_pages(lg);
+}
+
+/* Dispatch one hypercall.  regs->eax selects the call; regs->edx, regs->ebx
+ * and regs->ecx carry its arguments, and some calls place a result back in
+ * regs->eax.  An unknown call number kills the guest.
+ *
+ * Return true if DMA to host userspace now pending. */
+static int do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+	switch (regs->eax) {
+	case LHCALL_FLUSH_ASYNC:
+		break;
+	case LHCALL_LGUEST_INIT:
+		kill_guest(lg, "already have lguest_data");
+		break;
+	case LHCALL_CRASH: {
+		char msg[128];
+		lhread(lg, msg, regs->edx, sizeof(msg));
+		/* Guest data is untrusted: force termination. */
+		msg[sizeof(msg)-1] = '\0';
+		kill_guest(lg, "CRASH: %s", msg);
+		break;
+	}
+	case LHCALL_LOAD_GDT:
+		load_guest_gdt(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_NEW_PGTABLE:
+		guest_new_pagetable(lg, regs->edx);
+		break;
+	case LHCALL_FLUSH_TLB:
+		if (regs->edx)
+			guest_pagetable_clear_all(lg);
+		else
+			guest_pagetable_flush_user(lg);
+		break;
+	case LHCALL_LOAD_IDT_ENTRY:
+		load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_STACK:
+		guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_TS:
+		lg->ts = regs->edx;
+		break;
+	case LHCALL_TIMER_READ: {
+		/* Result: jiffies elapsed since the previous read/start. */
+		u32 now = jiffies;
+		mb();
+		regs->eax = now - lg->last_timer;
+		lg->last_timer = now;
+		break;
+	}
+	case LHCALL_TIMER_START:
+		lg->timer_on = 1;
+		if (regs->edx != HZ)
+			kill_guest(lg, "Bad clock speed %i", regs->edx);
+		lg->last_timer = jiffies;
+		break;
+	case LHCALL_HALT:
+		lg->halted = 1;
+		break;
+	case LHCALL_GET_WALLCLOCK: {
+		/* Result: current time of day in whole seconds. */
+		struct timeval tv;
+		do_gettimeofday(&tv);
+		regs->eax = tv.tv_sec;
+		break;
+	}
+	case LHCALL_BIND_DMA:
+		/* ecx packs the DMA count (upper bits) and interrupt (low byte). */
+		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+				     regs->ecx >> 8, regs->ecx & 0xFF);
+		break;
+	case LHCALL_SEND_DMA:
+		return send_dma(lg, regs->edx, regs->ebx);
+	case LHCALL_SET_PTE:
+		guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx);
+		break;
+	case LHCALL_SET_UNKNOWN_PTE:
+		guest_pagetable_clear_all(lg);
+		break;
+	case LHCALL_SET_PUD:
+		guest_set_pud(lg, regs->edx, regs->ebx);
+		break;
+	case LHCALL_LOAD_TLS:
+		guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+		break;
+	default:
+		kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+	}
+	return 0;
+}
+
+/* We always do queued calls before actual hypercall.
+ *
+ * Walks the hypercall ring in lguest_data starting at lg->next_hcall, using
+ * a snapshot of the status bytes taken once on entry.  A status of 0xFF
+ * ends the walk; after a call has run, 0xFF is written back to its status
+ * byte.  Results a call stores in regs.eax go to a local copy and are not
+ * passed back to the guest.
+ *
+ * Returns 1 as soon as a call reports pending DMA, otherwise 0 (also when
+ * the guest has not registered lguest_data yet). */
+int do_async_hcalls(struct lguest *lg)
+{
+	unsigned int i, pending;
+	u8 st[LHCALL_RING_SIZE];
+
+	if (!lg->lguest_data)
+		return 0;
+
+	copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+	for (i = 0; i < ARRAY_SIZE(st); i++) {
+		struct lguest_regs regs;
+		unsigned int n = lg->next_hcall;
+
+		if (st[n] == 0xFF)
+			break;
+
+		/* Advance the ring position, wrapping at the end. */
+		if (++lg->next_hcall == LHCALL_RING_SIZE)
+			lg->next_hcall = 0;
+
+		get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+		get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+		get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+		get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+		pending = do_hcall(lg, &regs);
+		put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+		if (pending)
+			return 1;
+	}
+
+	set_wakeup_process(lg, NULL);
+	return 0;
+}
+
+/* Handle the synchronous hypercall described by `regs`.
+ *
+ * Once lguest_data is registered this just runs do_hcall() and returns its
+ * result.  Before that the only call accepted is LHCALL_LGUEST_INIT, which
+ * records the guest's lguest_data pointer (from regs->edx), validates both
+ * ends of it, reads the noirq range and fills in a few host-provided
+ * fields; that path always returns 0. */
+int hypercall(struct lguest *lg, struct lguest_regs *regs)
+{
+	int pending;
+
+	if (lg->lguest_data) {
+		pending = do_hcall(lg, regs);
+		set_wakeup_process(lg, NULL);
+		return pending;
+	}
+
+	if (regs->eax != LHCALL_LGUEST_INIT) {
+		kill_guest(lg, "hypercall %i before LGUEST_INIT",
+			   regs->eax);
+		return 0;
+	}
+
+	lg->lguest_data = (struct lguest_data __user *)regs->edx;
+	/* We check here so we can simply copy_to_user/from_user */
+	if (!lguest_address_ok(lg, (long)lg->lguest_data)
+	    || !lguest_address_ok(lg, (long)(lg->lguest_data+1))) {
+		kill_guest(lg, "bad guest page %p", lg->lguest_data);
+		return 0;
+	}
+
+	get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+	get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+	/* We reserve the top pgd entry. */
+	put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+	put_user(lg->guestid, &lg->lguest_data->guestid);
+	put_user(clocksource_khz2mult(tsc_khz, 22),
+		 &lg->lguest_data->clock_mult);
+	return 0;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypervisor.S
@@ -0,0 +1,170 @@
+/* This code sits at 0xFFFF1000 to do the low-level guest<->host switch.
+   Layout is: default_idt_entries (1k), then switch_to_guest entry point. */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+#define SAVE_REGS				\
+	/* Save old guest/host state */		\
+	pushl	%es;				\
+	pushl	%ds;				\
+	pushl	%fs;				\
+	pushl	%eax;				\
+	pushl	%gs;				\
+	pushl	%ebp;				\
+	pushl	%edi;				\
+	pushl	%esi;				\
+	pushl	%edx;				\
+	pushl	%ecx;				\
+	pushl	%ebx;				\
+
+.text
+ENTRY(_start) /* ld complains unless _start is defined. */
+/* %eax contains ptr to target guest state, %edx contains host idt. */
+switch_to_guest:
+	pushl	%ss
+	SAVE_REGS
+	/* Save old stack, switch to guest's stack. */
+	movl	%esp, LGUEST_STATE_host_stackptr(%eax)
+	movl	%eax, %esp
+	/* Point %esp at the saved guest registers, which start
+	   LGUEST_STATE_regs bytes into the state. */
+	addl	$LGUEST_STATE_regs, %esp
+	/* Switch to guest's GDT, IDT. */
+	lgdt	LGUEST_STATE_gdt(%eax)
+	lidt	LGUEST_STATE_idt(%eax)
+	/* Save page table top. */
+	movl	%cr3, %ebx
+	movl	%ebx, LGUEST_STATE_host_pgdir(%eax)
+	/* Set host's TSS to available: mask 0xFD clears bit 1 (the busy
+	   flag) of descriptor byte 5. */
+	movl	(LGUEST_STATE_host_gdt+2)(%eax), %ebx
+	andb	$0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+	/* Switch to guest page tables (first saved word is the guest cr3) */
+	popl	%ebx
+	movl	%ebx, %cr3
+	/* Switch to guest's TSS. */
+	movl	$(GDT_ENTRY_TSS*8), %ebx
+	ltr	%bx
+	/* Restore guest regs */
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%gs
+	/* Now we've loaded gs, neuter the TLS entries down to 1 byte/page
+	   by zeroing the low 16 bits of each of the three descriptors. */
+	addl	$(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
+	movw	$0,(%eax)
+	movw	$0,8(%eax)
+	movw	$0,16(%eax)
+	popl	%eax
+	popl	%fs
+	popl	%ds
+	popl	%es
+	/* Skip error code and trap number */
+	addl	$8, %esp
+	iret
+
+/* Common guest->host switch: saves the guest registers, then restores the
+   host's page tables, GDT, IDT, stack, TSS and registers. */
+#define SWITCH_TO_HOST							\
+	SAVE_REGS;							\
+	/* Save old pgdir */						\
+	movl	%cr3, %eax;						\
+	pushl	%eax;							\
+	/* Load lguest ds segment for convenience. */			\
+	movl	$(LGUEST_DS), %eax;					\
+	movl	%eax, %ds;						\
+	/* Now figure out who we are */					\
+	movl	%esp, %eax;						\
+	subl	$LGUEST_STATE_regs, %eax;				\
+	/* Switch to host page tables (GDT, IDT and stack are in host   \
+	   mem, so need this first) */					\
+	movl	LGUEST_STATE_host_pgdir(%eax), %ebx;			\
+	movl	%ebx, %cr3;						\
+	/* Set guest's TSS to available (0xFD clears busy bit 1). */	\
+	andb	$0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\
+	/* Switch to host's GDT & IDT. */				\
+	lgdt	LGUEST_STATE_host_gdt(%eax);				\
+	lidt	LGUEST_STATE_host_idt(%eax);				\
+	/* Switch to host's stack. */					\
+	movl	LGUEST_STATE_host_stackptr(%eax), %esp;			\
+	/* Switch to host's TSS */					\
+	movl	$(GDT_ENTRY_TSS*8), %eax;				\
+	ltr	%ax;							\
+	/* Restore host regs */						\
+	popl	%ebx;							\
+	popl	%ecx;							\
+	popl	%edx;							\
+	popl	%esi;							\
+	popl	%edi;							\
+	popl	%ebp;							\
+	popl	%gs;							\
+	popl	%eax;							\
+	popl	%fs;							\
+	popl	%ds;							\
+	popl	%es;							\
+	popl	%ss
+	
+/* Return to run_guest_once. */
+return_to_host:
+	SWITCH_TO_HOST
+	iret
+
+/* Switch back to the host, then hand the trap to the host's own handler. */
+deliver_to_host:
+	SWITCH_TO_HOST
+decode_idt_and_jmp:
+	/* Decode IDT and jump to hosts' irq handler.  When that does iret, it
+	 * will return to run_guest_once.  This is a feature. */
+	/* We told gcc we'd clobber edx and eax... */
+	/* %eax: trap number, then the address of its 8-byte IDT entry
+	 * (%edx holds the IDT base). */
+	movl	LGUEST_STATE_trapnum(%eax), %eax
+	leal	(%edx,%eax,8), %eax
+	/* Handler address = low 16 bits from the entry's first word,
+	 * high 16 bits from the top half of its second dword. */
+	movzwl	(%eax),%edx
+	movl	4(%eax), %eax
+	xorw	%ax, %ax
+	orl	%eax, %edx
+	jmp	*%edx
+
+/* As deliver_to_host, but push the saved error code for the handler first. */
+deliver_to_host_with_errcode:
+	SWITCH_TO_HOST
+	pushl	LGUEST_STATE_errcode(%eax)
+	jmp decode_idt_and_jmp
+
+/* Real hardware interrupts are delivered straight to the host.  Others
+   cause us to return to run_guest_once so it can decide what to do.  Note
+   that some of these are overridden by the guest to deliver directly, and
+   never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+	/* Record this stub's address as a .long in .data, then emit the
+	   stub itself in .text at local label 1. */
+	.data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+	pushl	$0
+ .endif
+	/* Push the trap number, then go to the common handler. */
+	pushl	$\N
+	jmp	\TARGET
+	ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+	IRQ_STUB irq \TARGET
+  irq=irq+1
+ .endr
+.endm
+	
+/* We intercept every interrupt, because we may need to switch back to
+ * host.  Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+default_idt_entries:	
+.text
+	IRQ_STUBS 0 1 return_to_host		/* First two traps */
+	IRQ_STUB 2 deliver_to_host_with_errcode	/* NMI */
+	IRQ_STUBS 3 31 return_to_host		/* Rest of traps */
+	IRQ_STUBS 32 127 deliver_to_host	/* Real interrupts */
+	IRQ_STUB 128 return_to_host		/* System call (overridden) */
+	IRQ_STUBS 129 255 deliver_to_host	/* Other real interrupts */
+
+/* Everything after this is used for the lguest_state structs. */
+ALIGN
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/interrupts_and_traps.c
@@ -0,0 +1,230 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+/* Move the guest stack pointer *gstack down one 32-bit slot and store val
+ * in that slot via lhwrite_u32(). */
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+	u32 __user *slot = *gstack - 1;
+
+	*gstack = slot;
+	lhwrite_u32(lg, (u32)slot, val);
+}
+
+/* Redirect the guest to its own handler for `trap`.
+ *
+ * Builds an interrupt-style frame on the guest stack (old ss/esp when the
+ * stack is switched, then eflags, cs, eip and, if has_err, the error code)
+ * and rewrites the saved registers so the guest resumes at trap->addr.
+ *
+ * Returns 0 without touching anything when no handler address is set,
+ * 1 otherwise. */
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+{
+	u32 __user *gstack;
+	u32 eflags, ss, irq_enable;
+	struct lguest_regs *regs = &lg->state->regs;
+
+	if (!trap->addr)
+		return 0;
+
+	/* If they want a ring change, we use new stack and push old ss/esp */
+	if ((regs->ss&0x3) != GUEST_DPL) {
+		gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
+		ss = lg->state->tss.ss1;
+		push_guest_stack(lg, &gstack, regs->ss);
+		push_guest_stack(lg, &gstack, regs->esp);
+	} else {
+		gstack = (u32 __user *)guest_pa(lg, regs->esp);
+		ss = regs->ss;
+	}
+
+	/* We use IF bit in eflags to indicate whether irqs were disabled
+	   (it's always 0, since irqs are enabled when guest is running). */
+	eflags = regs->eflags;
+	get_user(irq_enable, &lg->lguest_data->irq_enabled);
+	/* 512 is the IF bit position in eflags. */
+	eflags |= (irq_enable & 512);
+
+	push_guest_stack(lg, &gstack, eflags);
+	push_guest_stack(lg, &gstack, regs->cs);
+	push_guest_stack(lg, &gstack, regs->eip);
+
+	if (has_err)
+		push_guest_stack(lg, &gstack, regs->errcode);
+
+	/* Change the real stack so hypervisor returns to trap handler */
+	regs->ss = ss;
+	/* gstack was converted with guest_pa(); add page_offset back. */
+	regs->esp = (u32)gstack + lg->page_offset;
+	regs->cs = (__KERNEL_CS|GUEST_DPL);
+	regs->eip = trap->addr;
+
+	/* GS will be neutered on way back to guest. */
+	put_user(0, &lg->lguest_data->gs_gpf_eip);
+
+	/* Disable interrupts for an interrupt gate. */
+	if (trap->disable_interrupts)
+		put_user(0, &lg->lguest_data->irq_enabled);
+	return 1;
+}
+
+/* Deliver at most one pending virtual interrupt to the guest.
+ *
+ * Does nothing until the guest has registered lguest_data.  The lowest
+ * numbered pending interrupt that the guest has not blocked is chosen; it
+ * is delivered through reflect_trap() only if the guest has a handler for
+ * it and either has interrupts enabled or is halted (in which case it is
+ * un-halted and interrupts are re-enabled). */
+void maybe_do_interrupt(struct lguest *lg)
+{
+	unsigned int irq;
+	DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+	if (!lg->lguest_data)
+		return;
+
+	/* If timer has changed, set timer interrupt. */
+	if (lg->timer_on && jiffies != lg->last_timer)
+		set_bit(0, lg->irqs_pending);
+
+	/* Mask out any interrupts they have blocked. */
+	copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs));
+	bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS);
+
+	irq = find_first_bit(irqs, LGUEST_IRQS);
+	if (irq >= LGUEST_IRQS)
+		return;
+
+	/* If they're halted, we re-enable interrupts. */
+	if (lg->halted) {
+		/* Re-enable interrupts. */
+		put_user(512, &lg->lguest_data->irq_enabled);
+		lg->halted = 0;
+	} else {
+		/* Maybe they have interrupts disabled? */
+		u32 irq_enabled;
+		get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+		if (!irq_enabled)
+			return;
+	}
+
+	/* Without a handler the interrupt simply stays pending. */
+	if (lg->interrupt[irq].addr != 0) {
+		clear_bit(irq, lg->irqs_pending);
+		reflect_trap(lg, &lg->interrupt[irq], 0);
+	}
+}
+
+/* If the guest's current instruction is the 16-bit value 0x0b0f, kill the
+ * guest with a "BUG()" message; with CONFIG_DEBUG_BUGVERBOSE the line number
+ * and file name stored after the instruction are read and reported too.
+ * Compiles to nothing without CONFIG_BUG. */
+void check_bug_kill(struct lguest *lg)
+{
+#ifdef CONFIG_BUG
+	u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+	u16 insn;
+
+	/* This only works for addresses in linear mapping... */
+	if (lg->state->regs.eip < PAGE_OFFSET)
+		return;
+	lhread(lg, &insn, eip, sizeof(insn));
+	if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+		u16 l;
+		u32 f;
+		char file[128];
+		/* Layout read here: insn, u16 line, u32 file-name pointer. */
+		lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+		lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+		lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+		/* Guest data is untrusted: force termination. */
+		file[sizeof(file)-1] = 0;
+		kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+		kill_guest(lg, "BUG() at %#x", eip);
+#endif	/* CONFIG_DEBUG_BUGVERBOSE */
+	}
+#endif	/* CONFIG_BUG */
+}
+
+/* Fill `trap` from a guest IDT descriptor.  A descriptor without the
+ * present bit clears the handler address; gate types other than 0xE and
+ * 0xF kill the guest.  Type 0xE asks for interrupts to be disabled on
+ * delivery. */
+static void copy_trap(struct lguest *lg,
+		      struct host_trap *trap,
+		      const struct desc_struct *desc)
+{
+	u8 type;
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		trap->addr = 0;
+		return;
+	}
+
+	type = (desc->b >> 8) & 0xF;
+	if (!(type == 0xE || type == 0xF))
+		kill_guest(lg, "bad IDT type %i", type);
+
+	/* Handler address: low half from word a, high half from word b. */
+	trap->addr = (desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000);
+	trap->disable_interrupts = (type == 0xE);
+}
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+/* Trampoline template.  The two runs of 0x55 bytes are placeholders: the
+ * 32-bit store target at TRAMP_MOVL_TARGET_OFF and the 32-bit relative jump
+ * displacement at TRAMP_JMP_TARGET_OFF are patched by setup_trampoline(). */
+static u8 tramp[] 
+= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+    0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+    /* movl 0, %ss:lguest_data.gs_gpf_eip */
+    0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+/* Write trampoline number `i` into the trap page: copy the template to
+ * offset sizeof(tramp)*i, patch in the address of lguest_data.gs_gpf_eip
+ * and the relative jump to `dstaddr`, and return the address the trampoline
+ * is reached at, computed as -4MB plus that offset.
+ * NOTE(review): -4MB is presumably where the trap page is mapped in the
+ * guest — confirm against map_trap_page(). */
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+	u32 addr, off;
+
+	off = sizeof(tramp)*i;
+	memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+	/* 0 is to be placed in lguest_data.gs_gpf_eip. */
+	addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+	memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+	/* Address is relative to where end of jmp will be. */
+	addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+	memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+	return (-4*1024*1024) + off;
+}
+
+/* We bounce through the trap page, for two reasons: firstly, we need
+   the interrupt destination always mapped, to avoid double faults,
+   secondly we want to reload %gs to make it innocuous on entering kernel.
+
+   Installs guest descriptor `desc` as IDT entry `i`, pointing the entry at
+   a trampoline built by setup_trampoline().  Only gate type 0xF is
+   accepted; any other type kills the guest.
+ */
+static void setup_idt(struct lguest *lg,
+		      unsigned int i,
+		      const struct desc_struct *desc)
+{
+	u8 type = ((desc->b >> 8) & 0xF);
+	u32 taddr;
+
+	/* Not present? */
+	if (!(desc->b & 0x8000)) {
+		/* FIXME: When we need this, we'll know... */
+		/* NOTE(review): the incoming descriptor's present bit is
+		   tested in .b above, but this tests bit 15 of the installed
+		   entry's .a word - confirm which word was intended. */
+		if (lg->state->idt_table[i].a & 0x8000)
+			kill_guest(lg, "removing interrupts not supported");
+		return;
+	}
+
+	/* We could reflect and disable interrupts, but guest can do itself. */
+	if (type != 0xF)
+		kill_guest(lg, "bad direct IDT %i type %i", i, type);
+
+	taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+
+	/* Word a: selector in the top half, low 16 bits of the trampoline
+	   address below.  Word b: the guest's flag bits (masked with 0xEF00)
+	   plus the high 16 bits of the trampoline address. */
+	lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
+					| (taddr & 0x0000FFFF));
+	lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
+}
+
+/* Handle a guest request to set IDT entry `i` to the descriptor made of the
+ * words `low` and `high`.  Some vectors are ignored, vectors 13, 14 and 7
+ * are only recorded for later reflection, other traps and the system call
+ * vector are installed directly via setup_idt(), and the next LGUEST_IRQS
+ * vectors from FIRST_EXTERNAL_VECTOR are recorded as virtual interrupt
+ * handlers.  Anything above that range is silently ignored. */
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
+{
+	struct desc_struct d = { low, high };
+
+	switch (i) {
+	/* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+	case 2:
+	case 8:
+	case 15:
+	case LGUEST_TRAP_ENTRY:
+	/* FIXME: We should handle debug and int3 */
+	case 1:
+	case 3:
+		return;
+	/* We intercept page fault, general protection fault and fpu missing */
+	case 13:
+		copy_trap(lg, &lg->gpf_trap, &d);
+		return;
+	case 14:
+		copy_trap(lg, &lg->page_trap, &d);
+		return;
+	case 7:
+		copy_trap(lg, &lg->fpu_trap, &d);
+		return;
+	}
+
+	/* Other traps go straight to guest. */
+	if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
+		setup_idt(lg, i, &d);
+	/* A virtual interrupt */
+	else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
+		copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/io.c
@@ -0,0 +1,413 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[64];
+
+/* FIXME: allow multi-page lengths. */
+/* Validate the sections of `dma` up to the first zero length.  Each section
+ * must start at an address lguest_address_ok() accepts, be at most one page
+ * long and not cross a page boundary.  Returns 1 if all are fine; otherwise
+ * kills the guest and returns 0. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		/* A zero length terminates the list. */
+		if (!dma->len[i])
+			break;
+
+		/* We could do over a page, but is it worth it? */
+		if (!lguest_address_ok(lg, dma->addr[i])
+		    || dma->len[i] > PAGE_SIZE
+		    || (dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) {
+			kill_guest(lg, "bad DMA entry: %u@%#x",
+				   dma->len[i], dma->addr[i]);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* Pick a dma_hash bucket for a futex key: jhash2 over the key's word and
+ * ptr fields (counted in 32-bit words), seeded with its offset, reduced
+ * modulo the number of buckets. */
+static unsigned int hash(const union futex_key *key)
+{
+	return jhash2((u32*)&key->both.word,
+		      (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+		      key->both.offset)
+		% ARRAY_SIZE(dma_hash);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+/* Unbind one DMA slot: mark it free (interrupt = 0), remove it from its
+ * hash chain and drop the futex key references.  BUGs if lguest_lock is
+ * not held. */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+	BUG_ON(!mutex_is_locked(&lguest_lock));
+	dmainfo->interrupt = 0;
+	list_del(&dmainfo->list);
+	drop_futex_key_refs(&dmainfo->key);
+}
+
+/* Non-zero when two futex keys match in word, ptr and offset. */
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+	if (a->both.word != b->both.word)
+		return 0;
+	if (a->both.ptr != b->both.ptr)
+		return 0;
+	return a->both.offset == b->both.offset;
+}
+
+/* Unbind the first DMA slot of this guest that is bound to `key` with the
+ * DMA array address `dmas`.  Returns 1 if a slot was unbound, 0 if none
+ * matched.
+ *
+ * Slots with interrupt == 0 are free and must be skipped: unlink_dma()
+ * leaves the old key and dmas values in a slot it frees, so without this
+ * check a repeated unbind would match the stale slot and run list_del()
+ * and drop_futex_key_refs() on it a second time. */
+static u32 unbind_dma(struct lguest *lg,
+		      const union futex_key *key,
+		      unsigned long dmas)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (!lg->dma[i].interrupt)
+			continue;
+		if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+			unlink_dma(&lg->dma[i]);
+			ret = 1;
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Bind (interrupt != 0) or unbind (interrupt == 0) the guest's DMA array at
+ * `dmas`, holding `numdmas` entries, to the futex key derived from `addr`.
+ *
+ * Binding takes the first free slot in lg->dma[] and links it into
+ * dma_hash; the key references taken here are kept by the slot.  In every
+ * other case the references are dropped again before returning.
+ *
+ * Returns 1 on success; 0 if the interrupt number is out of range, the
+ * address is bad (which also kills the guest), nothing matched an unbind,
+ * or no slot was free. */
+u32 bind_dma(struct lguest *lg,
+	     unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+	unsigned int i;
+	u32 ret = 0;
+	union futex_key key;
+
+	if (interrupt >= LGUEST_IRQS)
+		return 0;
+
+	mutex_lock(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad dma address %#lx", addr);
+		goto unlock;
+	}
+	get_futex_key_refs(&key);
+
+	if (interrupt == 0)
+		ret = unbind_dma(lg, &key, dmas);
+	else {
+		for (i = 0; i < LGUEST_MAX_DMA; i++) {
+			/* interrupt == 0 marks a free slot. */
+			if (lg->dma[i].interrupt == 0) {
+				lg->dma[i].dmas = dmas;
+				lg->dma[i].num_dmas = numdmas;
+				lg->dma[i].next_dma = 0;
+				lg->dma[i].key = key;
+				lg->dma[i].guestid = lg->guestid;
+				lg->dma[i].interrupt = interrupt;
+				list_add(&lg->dma[i].list,
+					 &dma_hash[hash(&key)]);
+				ret = 1;
+				goto unlock;
+			}
+		}
+	}
+	drop_futex_key_refs(&key);
+unlock:
+ 	up_read(&current->mm->mmap_sem);
+	mutex_unlock(&lguest_lock);
+	return ret;
+}
+
+/* lhread from another guest: copy `bytes` bytes at `addr` in lg's task into
+ * `buf`.  Returns 1 on success; on a wrapping range, a rejected end address
+ * or a short copy, zeroes `buf`, kills that guest and returns 0. */
+static int lhread_other(struct lguest *lg,
+			void *buf, u32 addr, unsigned bytes)
+{
+	if (addr + bytes >= addr
+	    && lguest_address_ok(lg, addr+bytes)
+	    && access_process_vm(lg->tsk, addr, buf, bytes, 0) == bytes)
+		return 1;
+
+	memset(buf, 0, bytes);
+	kill_guest(lg, "bad address in registered DMA struct");
+	return 0;
+}
+
+/* lhwrite to another guest: copy `bytes` bytes from `buf` to `addr` in lg's
+ * task.  Returns 1 on success; on a wrapping range, a rejected end address
+ * or a short copy, kills that guest and returns 0. */
+static int lhwrite_other(struct lguest *lg, u32 addr,
+			 const void *buf, unsigned bytes)
+{
+	if (addr + bytes >= addr
+	    && lguest_address_ok(lg, addr+bytes)
+	    && (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+		== bytes))
+		return 1;
+
+	kill_guest(lg, "bad address writing to registered DMA");
+	return 0;
+}
+
+/* Copy from the source DMA sections (addresses in the current task) into
+ * the destination sections, whose pages have already been pinned in
+ * `pages` (one page per destination section).
+ *
+ * Copying proceeds until either list is exhausted; srcoff/dstoff track how
+ * far into the current source/destination section we are, so one section
+ * may be consumed in several pieces.  Returns the number of bytes copied,
+ * or 0 if reading the source faulted. */
+static u32 copy_data(const struct lguest_dma *src,
+		     const struct lguest_dma *dst,
+		     struct page *pages[])
+{
+	unsigned int totlen, si, di, srcoff, dstoff;
+	void *maddr = NULL;
+
+	totlen = 0;
+	si = di = 0;
+	srcoff = dstoff = 0;
+	while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+	       && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+		u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+		/* Map the current destination page on first use. */
+		if (!maddr)
+			maddr = kmap(pages[di]);
+
+		/* FIXME: This is not completely portable, since
+		   archs do different things for copy_to_user_page. */
+		/* Resume the source section where the last piece ended. */
+		if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+				   (void __user *)(src->addr[si] + srcoff),
+				   len) != 0) {
+			totlen = 0;
+			break;
+		}
+
+		totlen += len;
+		srcoff += len;
+		dstoff += len;
+		if (srcoff == src->len[si]) {
+			si++;
+			srcoff = 0;
+		}
+		if (dstoff == dst->len[di]) {
+			kunmap(pages[di]);
+			maddr = NULL;
+			di++;
+			dstoff = 0;
+		}
+	}
+
+	/* A partly filled destination page is still mapped. */
+	if (maddr)
+		kunmap(pages[di]);
+
+	return totlen;
+}
+
+/* Src is us, ie. current. */
+/* Validate both DMA lists, pin one page per destination section with
+ * get_user_pages(), copy the data across and release the pages again.
+ * Returns the number of bytes copied, or 0 if a list is bad, a page could
+ * not be pinned or the copy failed. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+		  struct lguest *dstlg, const struct lguest_dma *dst)
+{
+	int i;
+	u32 ret;
+	struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+	if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+		return 0;
+
+	/* First get the destination pages */
+	for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+		if (dst->len[i] == 0)
+			break;
+		if (get_user_pages(dstlg->tsk, dstlg->mm,
+				   dst->addr[i], 1, 1, 1, pages+i, NULL)
+		    != 1) {
+			ret = 0;
+			goto drop_pages;
+		}
+	}
+
+	/* Now copy until we run out of src or dst. */
+	ret = copy_data(src, dst, pages);
+
+drop_pages:
+	/* i is the number of pages successfully pinned above. */
+	while (--i >= 0)
+		put_page(pages[i]);
+	return ret;
+}
+
+/* We cache one process to wakeup: helps for batching & wakes outside locks.
+ * Replacing the cached task wakes the old one and drops its reference; the
+ * new one (if not NULL) gets a reference taken.  Passing the task that is
+ * already cached does nothing. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+	struct task_struct *old = lg->wake;
+
+	if (old == p)
+		return;
+
+	if (old) {
+		wake_up_process(old);
+		put_task_struct(old);
+	}
+
+	lg->wake = p;
+	if (p)
+		get_task_struct(p);
+}
+
+/* Send the DMA described at `udma` in the source guest to the guest that
+ * owns the bound DMA array `dst`.
+ *
+ * The destination array is searched, starting at dst->next_dma, for an
+ * entry whose used_len is 0.  If one is found the data is copied, the
+ * copied length is stored in used_len on both sides and next_dma is
+ * advanced.  Unless a failure occurred, the destination's interrupt is
+ * then marked pending and its task is cached for wakeup.
+ *
+ * Returns 1 if every destination entry was already in use, 0 otherwise
+ * (including on failure). */
+static int dma_transfer(struct lguest *srclg,
+			unsigned long udma,
+			struct lguest_dma_info *dst)
+{
+	struct lguest_dma dst_dma, src_dma;
+	struct lguest *dstlg;
+	u32 i, dma = 0;
+
+	dstlg = &lguests[dst->guestid];
+	/* Get our dma list. */
+	lhread(srclg, &src_dma, udma, sizeof(src_dma));
+
+	/* We can't deadlock against them dmaing to us, because this
+	 * is all under the lguest_lock. */
+	down_read(&dstlg->mm->mmap_sem);
+
+	for (i = 0; i < dst->num_dmas; i++) {
+		dma = (dst->next_dma + i) % dst->num_dmas;
+		if (!lhread_other(dstlg, &dst_dma,
+				  dst->dmas + dma * sizeof(struct lguest_dma),
+				  sizeof(dst_dma))) {
+			goto fail;
+		}
+		if (!dst_dma.used_len)
+			break;
+	}
+	/* i != num_dmas means the loop stopped on a free entry. */
+	if (i != dst->num_dmas) {
+		unsigned long used_lenp;
+		unsigned int ret;
+
+		ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+		/* Put used length in src. */
+		lhwrite_u32(srclg,
+			    udma+offsetof(struct lguest_dma, used_len), ret);
+		if (ret == 0 && src_dma.len[0] != 0)
+			goto fail;
+
+		/* Make sure destination sees contents before length. */
+		mb();
+		used_lenp = dst->dmas
+			+ dma * sizeof(struct lguest_dma)
+			+ offsetof(struct lguest_dma, used_len);
+		lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+		dst->next_dma++;
+	}
+ 	up_read(&dstlg->mm->mmap_sem);
+
+	/* Do this last so dst doesn't simply sleep on lock. */
+	set_bit(dst->interrupt, dstlg->irqs_pending);
+	set_wakeup_process(srclg, dstlg->tsk);
+	return i == dst->num_dmas;
+
+fail:
+	up_read(&dstlg->mm->mmap_sem);
+	return 0;
+}
+
+/* Guest wants to send the DMA at `udma` to whoever is bound to `addr`.
+ *
+ * For a shared mapping, the first binding by another guest on the same key
+ * receives it; if that recipient had no free buffer, the locks are dropped,
+ * we yield() once and try a second time.  For a private mapping the
+ * request is recorded in lg->pending_dma / lg->pending_addr for our own
+ * userspace.
+ *
+ * Returns 1 if a DMA is now pending for userspace, 0 otherwise (a bad
+ * address also kills the guest). */
+int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma)
+{
+	union futex_key key;
+	int pending = 0, empty = 0;
+
+again:
+	mutex_lock(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad sending DMA address");
+		goto unlock;
+	}
+	/* Shared mapping?  Look for other guests... */
+	if (key.shared.offset & 1) {
+		struct lguest_dma_info *i, *n;
+		list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) {
+			if (i->guestid == lg->guestid)
+				continue;
+			if (!key_eq(&key, &i->key))
+				continue;
+
+			empty += dma_transfer(lg, udma, i);
+			break;
+		}
+		/* empty == 1 only on the first pass; the increment below
+		   stops a second retry. */
+		if (empty == 1) {
+			/* Give any recipients one chance to restock. */
+			up_read(&current->mm->mmap_sem);
+			mutex_unlock(&lguest_lock);
+			yield();
+			empty++;
+			goto again;
+		}
+		pending = 0;
+	} else {
+		/* Private mapping: tell our userspace. */
+		lg->dma_is_pending = 1;
+		lg->pending_dma = udma;
+		lg->pending_addr = addr;
+		pending = 1;
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	mutex_unlock(&lguest_lock);
+	return pending;
+}
+
+/* Unbind every DMA slot this guest still has bound (interrupt != 0).
+ * The caller must hold lguest_lock (BUGs otherwise); lg->mm->mmap_sem is
+ * taken for reading around the unlinking. */
+void release_all_dma(struct lguest *lg)
+{
+	unsigned int i;
+
+	BUG_ON(!mutex_is_locked(&lguest_lock));
+
+	down_read(&lg->mm->mmap_sem);
+	for (i = 0; i < LGUEST_MAX_DMA; i++) {
+		if (lg->dma[i].interrupt)
+			unlink_dma(&lg->dma[i]);
+	}
+	up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+/* Looks up this guest's binding for the futex key of `addr` and scans its
+ * DMA array for the first entry whose used_len is 0.  Returns that entry's
+ * address and stores the binding's interrupt number in *interrupt.
+ * Returns 0 when the address is bad (which kills the guest) or no binding
+ * matches.
+ * NOTE(review): if every entry is in use, the address of the last entry
+ * examined is returned rather than 0 - confirm callers expect that. */
+unsigned long get_dma_buffer(struct lguest *lg,
+			     unsigned long addr, unsigned long *interrupt)
+{
+	unsigned long ret = 0;
+	union futex_key key;
+	struct lguest_dma_info *i;
+
+	mutex_lock(&lguest_lock);
+	down_read(&current->mm->mmap_sem);
+	if (get_futex_key((u32 __user *)addr, &key) != 0) {
+		kill_guest(lg, "bad registered DMA buffer");
+		goto unlock;
+	}
+	list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+		if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+			unsigned int j;
+			for (j = 0; j < i->num_dmas; j++) {
+				struct lguest_dma dma;
+
+				ret = i->dmas + j * sizeof(struct lguest_dma);
+				lhread(lg, &dma, ret, sizeof(dma));
+				if (dma.used_len == 0)
+					break;
+			}
+			*interrupt = i->interrupt;
+			break;
+		}
+	}
+unlock:
+	up_read(&current->mm->mmap_sem);
+	mutex_unlock(&lguest_lock);
+	return ret;
+}
+
+/* Initialise every bucket of dma_hash as an empty list. */
+void lguest_io_init(void)
+{
+	struct list_head *bucket;
+
+	for (bucket = dma_hash; bucket < dma_hash + ARRAY_SIZE(dma_hash);
+	     bucket++)
+		INIT_LIST_HEAD(bucket);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_user.c
@@ -0,0 +1,242 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
+/* Build the initial state for guest slot "num" of __lguest_states().
+ *
+ * Fills in the GDT and IDT descriptors, points every IDT entry at the
+ * matching default handler through the LGUEST_CS selector, sets up the
+ * TSS so that esp0 is the address just past this lguest_state, and writes
+ * the initial register frame with eip = start and cr3 = __pa(pgdir).
+ *
+ * Returns the state, or NULL if fixup_gdt_table() rejects the GDT. */
+static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+					      unsigned long start)
+{
+	struct lguest_state *guest = &__lguest_states()[num];
+	unsigned int i;
+	const long *def = __lguest_default_idt_entries();
+	struct lguest_regs *regs;
+
+	guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+	guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+	guest->gdt.size = GDT_ENTRIES*8-1;
+	guest->gdt.address = (unsigned long)&guest->gdt_table;
+
+	/* Other guest's IDTs are initialized from default.  A descriptor
+	 * table limit is the table size in bytes minus one, exactly as for
+	 * the GDT above. */
+	guest->idt.size = 8 * IDT_ENTRIES - 1;
+	guest->idt.address = (long)guest->idt_table;
+	for (i = 0; i < IDT_ENTRIES; i++) {
+		u32 flags = 0x8e00;
+
+		/* They can't "int" into any of them except hypercall. */
+		if (i == LGUEST_TRAP_ENTRY)
+			flags |= (GUEST_DPL << 13);
+
+		/* Handler address is split across the two descriptor words. */
+		guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
+		guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
+	}
+
+	memset(&guest->tss, 0, sizeof(guest->tss));
+	guest->tss.ss0 = LGUEST_DS;
+	guest->tss.esp0 = (unsigned long)(guest+1);
+	guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
+
+	/* Write out stack in format lguest expects, so we can switch to it. */
+	regs = &guest->regs;
+	regs->cr3 = __pa(pgdir);
+	regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
+	regs->edi = LGUEST_MAGIC_EDI;
+	regs->ebp = LGUEST_MAGIC_EBP;
+	regs->esi = LGUEST_MAGIC_ESI;
+	regs->gs = regs->fs = 0;
+	regs->ds = regs->es = __KERNEL_DS|GUEST_DPL;
+	regs->trapnum = regs->errcode = 0;
+	regs->eip = start;
+	regs->cs = __KERNEL_CS|GUEST_DPL;
+	regs->eflags = 0x202; 	/* Interrupts enabled. */
+	regs->ss = __KERNEL_DS|GUEST_DPL;
+
+	if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
+			     &guest->regs, &guest->tss))
+		return NULL;
+
+	return guest;
+}
+
+/* + addr
+ *
+ * Reads the address that follows the request code, asks get_dma_buffer()
+ * for a buffer registered under it, and stores the buffer's interrupt
+ * number in its used_len field.  Returns the buffer's guest address,
+ * -EFAULT if the argument cannot be read, or -ENOENT if there is none. */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+	unsigned long key_addr, buf, intr;
+
+	if (get_user(key_addr, input))
+		return -EFAULT;
+
+	buf = get_dma_buffer(lg, key_addr, &intr);
+	if (buf == 0)
+		return -ENOENT;
+
+	/* We put irq number in udma->used_len. */
+	lhwrite_u32(lg, buf + offsetof(struct lguest_dma, used_len), intr);
+	return buf;
+}
+
+/* + irq
+ *
+ * Marks the given interrupt as pending for the guest.  Returns 0,
+ * -EFAULT if the argument cannot be read, or -EINVAL if the number is
+ * not below LGUEST_IRQS. */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+	u32 irq;
+
+	if (get_user(irq, input))
+		return -EFAULT;
+
+	if (irq < LGUEST_IRQS) {
+		set_bit(irq, lg->irqs_pending);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* read() on /dev/lguest runs the guest.
+ *
+ * With no guest attached it fails with -EINVAL.  Once the guest is dead
+ * it instead copies out the death message (truncated to "size" bytes), or
+ * returns -ENOMEM when lg->dead is the (void *)-1 marker. */
+static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
+{
+	struct lguest *lg = file->private_data;
+	size_t len;
+
+	if (!lg)
+		return -EINVAL;
+
+	if (!lg->dead) {
+		/* Clear any pending-DMA flag before running the guest. */
+		if (lg->dma_is_pending)
+			lg->dma_is_pending = 0;
+		return run_guest(lg, user);
+	}
+
+	if (lg->dead == (void *)-1)
+		return -ENOMEM;
+
+	/* Include the terminating NUL if it fits. */
+	len = min(size, strlen(lg->dead)+1);
+	if (copy_to_user(user, lg->dead, len) != 0)
+		return -EFAULT;
+	return len;
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset.
+ *
+ * Claims a free guest slot and attaches it to this file.  The four u32
+ * arguments are, in order: the pfn limit, the guest page directory (handed
+ * to init_guest_pagetable), the initial eip (handed to setup_guest_state)
+ * and the guest's page offset.
+ *
+ * Returns sizeof(args) on success.  Errors: -EBUSY if the file already has
+ * a guest, -EFAULT if the arguments cannot be copied in, -EINVAL if the
+ * page directory argument is <= PAGE_SIZE, -ENOSPC if no slot is free,
+ * -ENOMEM if the trap page cannot be allocated, whatever
+ * init_guest_pagetable() returns, or -ENOEXEC if setup_guest_state() fails.
+ * On failure after the slot was claimed, everything is undone in reverse
+ * order and the slot is zeroed again. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+	struct lguest *lg;
+	int err, i;
+	u32 args[4];
+
+	if (file->private_data)
+		return -EBUSY;
+
+	if (copy_from_user(args, input, sizeof(args)) != 0)
+		return -EFAULT;
+
+	if (args[1] <= PAGE_SIZE)
+		return -EINVAL;
+
+	mutex_lock(&lguest_lock);
+	i = find_free_guest();
+	if (i < 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+	lg = &lguests[i];
+	lg->guestid = i;
+	lg->pfn_limit = args[0];
+	lg->page_offset = args[3];
+
+	lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
+	if (!lg->trap_page) {
+		err = -ENOMEM;
+		goto release_guest;
+	}
+
+	err = init_guest_pagetable(lg, args[1]);
+	if (err)
+		goto free_trap_page;
+
+	lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]);
+	if (!lg->state) {
+		err = -ENOEXEC;
+		goto release_pgtable;
+	}
+	mutex_unlock(&lguest_lock);
+
+	lg->tsk = current;
+	lg->mm = get_task_mm(current);	/* dropped again by mmput() in close() */
+	file->private_data = lg;
+	return sizeof(args);
+
+release_pgtable:
+	free_guest_pagetable(lg);
+free_trap_page:
+	free_page((long)lg->trap_page);
+release_guest:
+	memset(lg, 0, sizeof(*lg));	/* zero the slot again */
+unlock:
+	mutex_unlock(&lguest_lock);
+	return err;
+}
+
+/* write() on /dev/lguest carries a u32 request code followed by that
+ * request's arguments.
+ *
+ * LHREQ_INITIALIZE is the only request accepted before a guest is
+ * attached; every other request then fails with -EINVAL.  Requests to a
+ * dead guest fail with -ENOENT, unknown request codes with -EINVAL, and an
+ * unreadable request code with -EFAULT.  Otherwise the return value is
+ * that of the individual request handler. */
+static ssize_t write(struct file *file, const char __user *input,
+		     size_t size, loff_t *off)
+{
+	struct lguest *lg = file->private_data;
+	const u32 __user *args = (const u32 __user *)input;
+	u32 req;
+
+	/* get_user() sizes the access from the pointer's type, so read
+	 * through a u32 pointer: through the char pointer only the first
+	 * byte of the request code would be fetched. */
+	if (get_user(req, args) != 0)
+		return -EFAULT;
+	args++;
+
+	if (req != LHREQ_INITIALIZE && !lg)
+		return -EINVAL;
+	if (lg && lg->dead)
+		return -ENOENT;
+
+	switch (req) {
+	case LHREQ_INITIALIZE:
+		return initialize(file, args);
+	case LHREQ_GETDMA:
+		return user_get_dma(lg, args);
+	case LHREQ_IRQ:
+		return user_send_irq(lg, args);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Tear down the guest attached to this file, if there is one.
+ *
+ * Under lguest_lock: unlinks the guest's DMA registrations, frees the
+ * trap page and the shadow page tables, drops the mm reference taken in
+ * initialize(), frees the death message and zeroes both the guest's state
+ * and its slot.  Always returns 0. */
+static int close(struct inode *inode, struct file *file)
+{
+	struct lguest *lg = file->private_data;
+
+	if (!lg)
+		return 0;
+
+	mutex_lock(&lguest_lock);
+	release_all_dma(lg);
+	free_page((long)lg->trap_page);
+	free_guest_pagetable(lg);
+	mmput(lg->mm);
+	/* read() treats (void *)-1 as the "no memory for the message"
+	 * marker; that value is not an allocation and must never reach
+	 * kfree().  (kfree(NULL) is fine for a guest that never died.) */
+	if (lg->dead != (void *)-1)
+		kfree(lg->dead);
+	memset(lg->state, 0, sizeof(*lg->state));
+	memset(lg, 0, sizeof(*lg));
+	mutex_unlock(&lguest_lock);
+	return 0;
+}
+
+/* File operations behind /dev/lguest. */
+static struct file_operations lguest_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = read,
+	.write	 = write,
+	.release = close,
+};
+
+/* Misc device named "lguest" with a dynamically assigned minor. */
+static struct miscdevice lguest_dev = {
+	.name	= "lguest",
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &lguest_fops,
+};
+
+/* Register the lguest misc device; returns misc_register()'s result. */
+int __init lguest_device_init(void)
+{
+	int err = misc_register(&lguest_dev);
+
+	return err;
+}
+
+/* Undo lguest_device_init(): take the misc device away again. */
+void __exit lguest_device_remove(void)
+{
+	misc_deregister(&lguest_dev);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/page_tables.c
@@ -0,0 +1,374 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBM Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL };
+#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu)
+
+/* Index of the top-level (page directory) slot that covers vaddr. */
+static unsigned vaddr_to_pgd(unsigned long vaddr)
+{
+	const unsigned int pgd_shift = PAGE_SHIFT + PTES_PER_PAGE_SHIFT;
+
+	return vaddr >> pgd_shift;
+}
+
+/* These access the real versions. */
+/* Pointer to the shadow top-level entry for vaddr in page directory i.
+ * An address inside the hypervisor's slot kills the guest and yields
+ * entry 0 instead. */
+static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+	unsigned int slot = vaddr_to_pgd(vaddr);
+
+	if (slot < HYPERVISOR_PGD_ENTRY)
+		return &lg->pgdirs[i].pgdir[slot];
+
+	kill_guest(lg, "attempt to access hypervisor pages");
+	return &lg->pgdirs[i].pgdir[0];
+}
+
+/* Pointer to the shadow PTE for vaddr inside the PTE page that the
+ * (present) top-level entry "top" refers to. */
+static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr)
+{
+	u32 *ptes;
+
+	BUG_ON(!(top & _PAGE_PRESENT));
+	ptes = __va(top&PAGE_MASK);
+	return ptes + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE);
+}
+
+/* These access the guest versions. */
+/* Guest address of the guest's own top-level entry for vaddr, relative
+ * to the cr3 of the current page directory. */
+static u32 gtoplev(struct lguest *lg, unsigned long vaddr)
+{
+	return lg->pgdirs[lg->pgdidx].cr3 + vaddr_to_pgd(vaddr) * sizeof(u32);
+}
+
+/* Guest address of the guest's own PTE for vaddr, given the (present)
+ * guest top-level entry gtop. */
+static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr)
+{
+	unsigned int slot = (vaddr >> PAGE_SHIFT) % PTES_PER_PAGE;
+
+	BUG_ON(!(gtop & _PAGE_PRESENT));
+	return (gtop&PAGE_MASK) + slot * sizeof(u32);
+}
+
+/* Drop the page reference held by a present shadow PTE. */
+static void release_pte(u32 pte)
+{
+	if (!(pte & _PAGE_PRESENT))
+		return;
+	put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+/* Do a virtual -> physical mapping on a user page.
+ *
+ * Pins the page of the current process at page number virtpfn (for
+ * writing if "write" is set) and returns its pfn, or -1UL on failure. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct page *page;
+	unsigned long pfn;
+	int got;
+
+	down_read(&mm->mmap_sem);
+	got = get_user_pages(current, mm, virtpfn << PAGE_SHIFT,
+			     1, write, 1, &page, &vma);
+	pfn = (got == 1) ? page_to_pfn(page) : -1UL;
+	up_read(&mm->mmap_sem);
+	return pfn;
+}
+
+/* Kill the guest if a page table entry has PWT or PSE set or points at
+ * or beyond pfn_limit.  Returns the entry with the GLOBAL bit removed. */
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+	int bad_flags = (entry & (_PAGE_PWT|_PAGE_PSE)) != 0;
+	int bad_pfn = (entry >> PAGE_SHIFT) >= lg->pfn_limit;
+
+	if (bad_flags || bad_pfn)
+		kill_guest(lg, "bad page table entry");
+	return entry & ~_PAGE_GLOBAL;
+}
+
+/* Turn a guest PTE into a shadow PTE: same flag bits, but the frame is
+ * the pinned page that get_pfn() found.  Kills the guest and returns 0 if
+ * the page cannot be obtained. */
+static u32 get_pte(struct lguest *lg, u32 entry, int write)
+{
+	u32 pfn = get_pfn(entry >> PAGE_SHIFT, write);
+
+	if (pfn != -1UL)
+		return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1)));
+
+	kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT);
+	return 0;
+}
+
+/* FIXME: We hold reference to pages, which prevents them from being
+   swapped.  It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ *
+ * Looks vaddr up in the guest's own page tables and, if the guest maps it
+ * with sufficient permission, installs the matching entry in the shadow
+ * page table (allocating a shadow PTE page first if there is none).
+ *
+ * Returns 1 if the page was mapped in.  Returns 0 if the guest has no
+ * present top-level or PTE entry for vaddr, if this is a write
+ * (_PAGE_DIRTY in flags) to a page the guest maps read-only, or if no PTE
+ * page could be allocated (in which case the guest is also killed). */
+static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
+{
+	u32 gtop, gpte;
+	u32 *top, *pte, *ptepage;
+	u32 val;
+
+	gtop = gtoplev(lg, vaddr);
+	val = lhread_u32(lg, gtop);
+	if (!(val & _PAGE_PRESENT))
+		return 0;
+
+	top = toplev(lg, lg->pgdidx, vaddr);
+	if (!(*top & _PAGE_PRESENT)) {
+		/* Get a PTE page for them. */
+		ptepage = (void *)get_zeroed_page(GFP_KERNEL);
+		/* FIXME: Steal from self in this case? */
+		if (!ptepage) {
+			kill_guest(lg, "out of memory allocating pte page");
+			return 0;
+		}
+		val = check_pgtable_entry(lg, val);
+		*top = (__pa(ptepage) | (val & (PAGE_SIZE-1)));
+	} else
+		ptepage = __va(*top & PAGE_MASK);
+
+	gpte = gpteof(lg, val, vaddr);
+	val = lhread_u32(lg, gpte);
+
+	/* No page, or write to readonly page? */
+	if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW)))
+		return 0;
+
+	pte = pteof(lg, *top, vaddr);
+	val = check_pgtable_entry(lg, val) | flags;
+
+	/* We're done with the old pte. */
+	release_pte(*pte);
+
+	/* We don't make it writable if this isn't a write: later
+	 * write will fault so we can set dirty bit in guest. */
+	if (val & _PAGE_DIRTY)
+		*pte = get_pte(lg, val, 1);
+	else
+		*pte = get_pte(lg, val & ~_PAGE_RW, 0);
+
+	/* Now we update dirty/accessed on guest: val carries the flags
+	 * that were or'ed in above. */
+	lhwrite_u32(lg, gpte, val);
+	return 1;
+}
+
+/* Fault in vaddr for a read or, if "write" is set, for a write.
+ * Returns page_in()'s result. */
+int demand_page(struct lguest *lg, u32 vaddr, int write)
+{
+	unsigned flags = _PAGE_ACCESSED;
+
+	if (write)
+		flags |= _PAGE_DIRTY;
+	return page_in(lg, vaddr, flags);
+}
+
+/* Fault in, for writing, lg->stack_pages pages going downwards from the
+ * guest's tss.esp1.  A page that cannot be mapped kills the guest. */
+void pin_stack_pages(struct lguest *lg)
+{
+	u32 stack = lg->state->tss.esp1;
+	unsigned int i = 0;
+
+	while (i < lg->stack_pages) {
+		if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+			kill_guest(lg, "bad stack page %i@%#x", i, stack);
+		i++;
+	}
+}
+
+/* Index of the shadow page directory recorded for guest cr3 "pgtable",
+ * or ARRAY_SIZE(lg->pgdirs) if there is none. */
+static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
+{
+	unsigned int i = 0;
+
+	while (i < ARRAY_SIZE(lg->pgdirs) && lg->pgdirs[i].cr3 != pgtable)
+		i++;
+	return i;
+}
+
+/* If the top-level entry *pgd is present: release every PTE in its PTE
+ * page, free that page and clear the entry. */
+static void release_pgd(struct lguest *lg, u32 *pgd)
+{
+	u32 *ptepage;
+	unsigned int i;
+
+	if (!(*pgd & _PAGE_PRESENT))
+		return;
+
+	ptepage = __va(*pgd & ~(PAGE_SIZE-1));
+	for (i = 0; i < PTES_PER_PAGE; i++)
+		release_pte(ptepage[i]);
+	free_page((long)ptepage);
+	*pgd = 0;
+}
+
+/* Release every top-level entry below the guest's page_offset in shadow
+ * page directory idx. */
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+	unsigned int limit = vaddr_to_pgd(lg->page_offset);
+	unsigned int i = 0;
+
+	while (i < limit) {
+		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+		i++;
+	}
+}
+
+/* Drop the mappings below page_offset of the current page directory. */
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+	flush_user_mappings(lg, lg->pgdidx);
+}
+
+/* Pick a shadow page directory slot for guest cr3 and return its index.
+ *
+ * The slot is chosen at random; if it has no page yet, one is allocated,
+ * and if that fails the current slot is reused instead.  The chosen slot's
+ * mappings below page_offset are released. */
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+{
+	unsigned int next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs);
+
+	if (!lg->pgdirs[next].pgdir) {
+		u32 *page = (u32 *)get_zeroed_page(GFP_KERNEL);
+
+		lg->pgdirs[next].pgdir = page;
+		if (!page)
+			next = lg->pgdidx;
+	}
+	lg->pgdirs[next].cr3 = cr3;
+	/* Release all the non-kernel mappings. */
+	flush_user_mappings(lg, next);
+
+	return next;
+}
+
+/* The guest switched to page table "pgtable": use the shadow page
+ * directory already recorded for it, or set one up, point the guest's
+ * saved cr3 at it and fault the stack pages back in. */
+void guest_new_pagetable(struct lguest *lg, u32 pgtable)
+{
+	int idx = find_pgdir(lg, pgtable);
+
+	if (idx == ARRAY_SIZE(lg->pgdirs))
+		idx = new_pgdir(lg, pgtable);
+
+	lg->pgdidx = idx;
+	lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+	pin_stack_pages(lg);
+}
+
+/* Release every top-level entry below the hypervisor's slot in every
+ * shadow page directory that has been allocated. */
+static void release_all_pagetables(struct lguest *lg)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+		if (!lg->pgdirs[i].pgdir)
+			continue;
+		for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++)
+			release_pgd(lg, lg->pgdirs[i].pgdir + j);
+	}
+}
+
+/* Throw away all shadow mappings, then fault the stack pages back in. */
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+	release_all_pagetables(lg);
+
+	pin_stack_pages(lg);
+}
+
+/* Update the shadow PTE for vaddr in page directory idx, provided a
+ * shadow PTE page exists for it.  The old PTE is released; the new value
+ * is mapped only if it has DIRTY or ACCESSED set, otherwise the shadow
+ * PTE is cleared. */
+static void do_set_pte(struct lguest *lg, int idx,
+		       unsigned long vaddr, u32 val)
+{
+	u32 *top = toplev(lg, idx, vaddr);
+	u32 *pte;
+
+	if (!(*top & _PAGE_PRESENT))
+		return;
+
+	pte = pteof(lg, *top, vaddr);
+	release_pte(*pte);
+	if (!(val & (_PAGE_DIRTY | _PAGE_ACCESSED))) {
+		*pte = 0;
+		return;
+	}
+	val = check_pgtable_entry(lg, val);
+	*pte = get_pte(lg, val, val & _PAGE_DIRTY);
+}
+
+/* The guest changed the PTE for vaddr in the page table "cr3". */
+void guest_set_pte(struct lguest *lg,
+		   unsigned long cr3, unsigned long vaddr, u32 val)
+{
+	unsigned int i;
+
+	if (vaddr < lg->page_offset) {
+		/* User mapping: only the matching shadow, if we have one. */
+		i = find_pgdir(lg, cr3);
+		if (i != ARRAY_SIZE(lg->pgdirs))
+			do_set_pte(lg, i, vaddr, val);
+		return;
+	}
+
+	/* Kernel mappings must be changed on all top levels. */
+	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+		if (lg->pgdirs[i].pgdir)
+			do_set_pte(lg, i, vaddr, val);
+	}
+}
+
+/* The guest changed top-level entry idx of page table "cr3": release the
+ * matching shadow entry.  The hypervisor's slot and above are ignored. */
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+	unsigned int pgdir;
+
+	if (idx < HYPERVISOR_PGD_ENTRY) {
+		pgdir = find_pgdir(lg, cr3);
+		if (pgdir < ARRAY_SIZE(lg->pgdirs))
+			release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+	}
+}
+
+/* Set up shadow page directory 0 for guest page table "pgtable".
+ * Returns 0, -EINVAL if page_offset reaches into the hypervisor's slot,
+ * or -ENOMEM if the directory page cannot be allocated. */
+int init_guest_pagetable(struct lguest *lg, u32 pgtable)
+{
+	u32 *pgdir;
+
+	/* We assume this in flush_user_mappings, so check now */
+	if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY)
+		return -EINVAL;
+
+	pgdir = (u32*)get_zeroed_page(GFP_KERNEL);
+	lg->pgdidx = 0;
+	lg->pgdirs[0].cr3 = pgtable;
+	lg->pgdirs[0].pgdir = pgdir;
+	return pgdir ? 0 : -ENOMEM;
+}
+
+/* Release all shadow mappings, then free every shadow page directory. */
+void free_guest_pagetable(struct lguest *lg)
+{
+	unsigned int i = 0;
+
+	release_all_pagetables(lg);
+	while (i < ARRAY_SIZE(lg->pgdirs)) {
+		free_page((long)lg->pgdirs[i].pgdir);
+		i++;
+	}
+}
+
+/* Caller must be preempt-safe.
+ *
+ * Puts this guest's trap page into entry 0 of the current CPU's
+ * hypervisor PTE page and hooks that PTE page into the hypervisor slot of
+ * the guest's current page directory. */
+void map_trap_page(struct lguest *lg)
+{
+	u32 *hv_ptes = hypervisor_pte_page(smp_processor_id());
+
+	hv_ptes[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+	/* Since the hypervisor is smaller than 4MB, we simply mug the top
+	 * pte page. */
+	lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+		(__pa(hv_ptes)| _PAGE_KERNEL);
+}
+
+/* Free the hypervisor PTE page of every possible CPU. */
+static void free_hypervisor_pte_pages(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		free_page((long)hypervisor_pte_page(cpu));
+	}
+}
+
+/* Allocate one zeroed hypervisor PTE page per possible CPU.  On failure
+ * everything is freed again and -ENOMEM is returned; otherwise 0. */
+static __init int alloc_hypervisor_pte_pages(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u32 *page = (u32 *)get_zeroed_page(GFP_KERNEL);
+
+		hypervisor_pte_page(cpu) = page;
+		if (page)
+			continue;
+
+		free_hypervisor_pte_pages();
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Map the HYPERVISOR_PAGES hypervisor pages into entries 1.. of this
+ * CPU's hypervisor PTE page. */
+static __init void populate_hypervisor_pte_page(int cpu)
+{
+	/* First entry set dynamically in map_trap_page */
+	u32 *pte = hypervisor_pte_page(cpu) + 1;
+	int i = 0;
+
+	while (i < HYPERVISOR_PAGES) {
+		pte[i] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT)
+			  | _PAGE_KERNEL_EXEC);
+		i++;
+	}
+}
+
+/* Allocate and fill in the per-CPU hypervisor PTE pages.  Returns 0, or
+ * the error from alloc_hypervisor_pte_pages(). */
+__init int init_pagetables(struct page hype_pages[])
+{
+	unsigned int cpu;
+	int err = alloc_hypervisor_pte_pages();
+
+	if (err)
+		return err;
+
+	for_each_possible_cpu(cpu) {
+		populate_hypervisor_pte_page(cpu);
+	}
+	return 0;
+}
+
+/* Module-exit counterpart of init_pagetables(). */
+__exit void free_pagetables(void)
+{
+	free_hypervisor_pte_pages();
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/segments.c
@@ -0,0 +1,171 @@
+#include "lg.h"
+
+/* Dealing with GDT entries is such a horror, I convert to sanity and back.
+ *
+ * base and limit hold the descriptor's base and limit fields gathered
+ * into single words.  raw_attributes is bits 8-23 of the descriptor's
+ * second word (see decode_gdt_entry), overlaid by the named bitfields;
+ * the overlay presumably relies on the compiler allocating bitfields from
+ * the least significant bit upwards. */
+struct decoded_gdt_entry
+{
+	u32 base, limit;
+	union {
+		struct {
+			unsigned type:4;
+			unsigned dtype:1;
+			unsigned dpl:2;
+			unsigned present:1;
+			unsigned unused:4;	/* masked out again by encode_gdt_entry */
+			unsigned avl:1;
+			unsigned mbz:1;		/* check_desc() insists this is 0 */
+			unsigned def:1;
+			unsigned page_granularity:1;
+		};
+		u16 raw_attributes;
+	};
+};
+
+/* Split a raw descriptor into base, limit and attribute bits. */
+static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
+{
+	struct decoded_gdt_entry de;
+	unsigned long lo = en->a, hi = en->b;
+
+	/* Base: bits 16-31 of the low word, bits 0-7 and 24-31 of the high. */
+	de.base = (lo >> 16) | ((hi & 0xff) << 16) | (hi & 0xFF000000);
+	/* Limit: bits 0-15 of the low word, bits 16-19 of the high word. */
+	de.limit = (lo & 0xFFFF) | (hi & 0xF0000);
+	de.raw_attributes = (hi >> 8);
+	return de;
+}
+
+/* Inverse of decode_gdt_entry(): pack base, limit and attributes back
+ * into a raw descriptor.  The "unused" attribute bits are dropped (mask
+ * 0xF0FF) because that is where limit bits 16-19 live. */
+static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+{
+	const u32 base = de->base, limit = de->limit;
+	const u32 attrs = ((u32)de->raw_attributes) & 0xF0FF;
+	struct desc_struct en;
+
+	en.a = (limit & 0xFFFF) | (base << 16);
+	en.b = ((base >> 16) & 0xFF)
+		| (attrs << 8)
+		| (limit & 0xF0000)
+		| (base & 0xFF000000);
+	return en;
+}
+
+/* A descriptor is acceptable when mbz is 0, dtype is 1 and bit 2 of the
+ * type field is clear.  Returns 1 if acceptable, 0 otherwise. */
+static int check_desc(const struct decoded_gdt_entry *dec)
+{
+	if (dec->mbz != 0)
+		return 0;
+	if (dec->dtype != 1)
+		return 0;
+	return (dec->type & 4) == 0;
+}
+
+/* Zero a saved segment register unless its selector is at most 255 and
+ * the GDT entry it indexes has bit 15 of its second word set. */
+static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+{
+	u32 sel = *segreg;
+
+	if (sel <= 255 && (gdt[sel >> 3].b & 0x8000))
+		return;
+	*segreg = 0;
+}
+
+/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
+static void check_live_segments(const struct desc_struct *gdt,
+				struct lguest_regs *regs)
+{
+	u32 *live[] = { &regs->es, &regs->ds, &regs->fs, &regs->gs };
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(live); i++)
+		check_segment(gdt, live[i]);
+}
+
+/* Sanitise a GDT so that it is safe to load while the guest runs.
+ *
+ * Every present entry (other than the ones we overwrite ourselves) must
+ * pass check_desc() and start below HYPE_ADDR, and is trimmed so that it
+ * ends no later than HYPE_ADDR; entries with DPL 0 are demoted to
+ * GUEST_DPL.  The saved segment registers in regs are zeroed if they do
+ * not refer to a present entry, and finally the hypervisor code/data
+ * segments and the TSS descriptor for "tss" are written in.
+ *
+ * Returns 1 on success, 0 if any entry is unacceptable. */
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+		    struct lguest_regs *regs, struct x86_tss *tss)
+{
+	unsigned int i;
+	struct decoded_gdt_entry dec;
+
+	for (i = 0; i < num; i++) {
+		unsigned long base, length;
+
+		/* We override these ones, so we don't care what they give. */
+		if (i == GDT_ENTRY_TSS
+		    || i == GDT_ENTRY_LGUEST_CS
+		    || i == GDT_ENTRY_LGUEST_DS
+		    || i == GDT_ENTRY_DOUBLEFAULT_TSS)
+			continue;
+
+		dec = decode_gdt_entry(&gdt[i]);
+		if (!dec.present)
+			continue;
+
+		if (!check_desc(&dec))
+			return 0;
+
+		/* NOTE(review): architecturally the granularity bit scales
+		 * only the limit, not the base - confirm that scaling base
+		 * here is intended. */
+		base = dec.base;
+		length = dec.limit + 1;
+		if (dec.page_granularity) {
+			base *= PAGE_SIZE;
+			length *= PAGE_SIZE;
+		}
+
+		/* Unacceptable base? */
+		if (base >= HYPE_ADDR)
+			return 0;
+
+		/* Wrap around or segment overlaps hypervisor mem? */
+		if (!length
+		    || base + length < base
+		    || base + length > HYPE_ADDR) {
+			/* Trim to edge of hypervisor. */
+			length = HYPE_ADDR - base;
+			if (dec.page_granularity)
+				dec.limit = (length / PAGE_SIZE) - 1;
+			else
+				dec.limit = length - 1;
+		}
+		if (dec.dpl == 0)
+			dec.dpl = GUEST_DPL;
+		gdt[i] = encode_gdt_entry(&dec);
+	}
+	check_live_segments(gdt, regs);
+
+	/* Now put in hypervisor data and code segments. */
+	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+	/* Finally, TSS entry.  Start from all-zero attributes: dec still
+	 * holds whatever the last decoded entry left behind (or nothing at
+	 * all if the loop never decoded one), and its dpl/avl bits must not
+	 * leak into the TSS descriptor. */
+	dec.raw_attributes = 0;
+	dec.base = (unsigned long)tss;
+	dec.limit = sizeof(*tss)-1;
+	dec.type = 0x9;
+	dec.dtype = 0;
+	dec.def = 0;
+	dec.present = 1;
+	dec.mbz = 0;
+	dec.page_granularity = 0;
+	gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+
+	return 1;
+}
+
+/* Replace the guest's GDT with the "num" entries found at guest address
+ * "table", then sanitise it with fixup_gdt_table().  A table that is too
+ * large, or that fixup rejects, kills the guest. */
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+	/* kill_guest() returns to its caller (other callers carry on after
+	 * it), so bail out here: copying more than GDT_ENTRIES entries
+	 * would run past the end of gdt_table (presumably GDT_ENTRIES
+	 * entries long). */
+	if (num > GDT_ENTRIES) {
+		kill_guest(lg, "too many gdt entries %i", num);
+		return;
+	}
+
+	lhread(lg, lg->state->gdt_table, table,
+	       num * sizeof(lg->state->gdt_table[0]));
+	if (!fixup_gdt_table(lg->state->gdt_table, num,
+			     &lg->state->regs, &lg->state->tss))
+		kill_guest(lg, "bad gdt table");
+}
+
+/* We don't care about limit here, since we only let them use these in
+ * usermode (where lack of USER bit in pagetable protects hypervisor mem).
+ * However, we want to ensure it doesn't fault when loaded, since *we* are
+ * the ones who will load it in switch_to_guest.
+ *
+ * Copies the guest's TLS descriptors into the TLS slots of its GDT,
+ * remembers each present entry's original limit in lg->tls_limits and
+ * kills the guest on a descriptor that fails check_desc() or whose base
+ * lies above HYPE_ADDR - PAGE_SIZE.
+ */
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+	struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+	unsigned int n;
+
+	lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+	for (n = 0; n < ARRAY_SIZE(lg->tls_limits); n++) {
+		struct decoded_gdt_entry dec = decode_gdt_entry(&tls[n]);
+
+		if (dec.present) {
+			/* We truncate to one byte/page (depending on G bit)
+			   to neuter it, so ensure it's more than 1 page
+			   below trap page. */
+			tls[n].a &= 0xFFFF0000;
+			lg->tls_limits[n] = dec.limit;
+			if (!check_desc(&dec)
+			    || dec.base > HYPE_ADDR - PAGE_SIZE)
+				kill_guest(lg, "bad TLS descriptor %i", n);
+		}
+	}
+	check_live_segments(lg->state->gdt_table, &lg->state->regs);
+}



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 6 of 7]  lguest: Guest code
  2007-02-09 15:14         ` [PATCH 5 of 7] lguest: the host code (lg.ko) Rusty Russell
@ 2007-02-09 15:17           ` Rusty Russell
  2007-02-09 15:21             ` [PATCH 7 of 7] lguest: Makefile Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:17 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

[ Changes since 6c:
1) No more extern declarations!
2) Use native_cpuid directly rather than open-coding it.
3) Add declaration of boot_pda to asm/pda.h ]

This is the guest code which replaces the parts of paravirt_ops with
hypercalls.  It's fairly trivial.  This patch also includes a trivial
bus driver for lguest devices, and one new extern declaration for
boot_pda, which was previously frobbed only from head.S.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest.c
@@ -0,0 +1,582 @@
+/*
+ * Lguest specific paravirt-ops implementation
+ *
+ * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/kernel.h>
+#include <linux/start_kernel.h>
+#include <linux/string.h>
+#include <linux/console.h>
+#include <linux/screen_info.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <asm/paravirt.h>
+#include <asm/lguest.h>
+#include <asm/lguest_user.h>
+#include <asm/param.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/e820.h>
+#include <asm/pda.h>
+#include <asm/asm-offsets.h>
+#include <asm/mce.h>
+
+struct lguest_data lguest_data;
+struct lguest_device_desc *lguest_devices;
+static __initdata const struct lguest_boot_info *boot = __va(0);
+
+/* Queue a hypercall in the lguest_data.hcalls ring instead of trapping.
+ * A slot whose status is 0xFF is free; if the next slot is not free, a
+ * normal hcall() is made instead. */
+void async_hcall(unsigned long call,
+		 unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	/* Note: This code assumes we're uniprocessor. */
+	static unsigned int next_call;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (lguest_data.hcall_status[next_call] == 0xFF) {
+		lguest_data.hcalls[next_call].eax = call;
+		lguest_data.hcalls[next_call].edx = arg1;
+		lguest_data.hcalls[next_call].ebx = arg2;
+		lguest_data.hcalls[next_call].ecx = arg3;
+		/* Arguments must be visible before the slot is marked. */
+		wmb();
+		lguest_data.hcall_status[next_call] = 0;
+		next_call++;
+		if (next_call == LHCALL_RING_SIZE)
+			next_call = 0;
+	} else {
+		/* Table full, so do normal hcall which will flush table. */
+		hcall(call, arg1, arg2, arg3);
+	}
+	local_irq_restore(flags);
+}
+
+#ifdef PARAVIRT_LAZY_NONE 	/* Not in 2.6.20. */
+static int lazy_mode;
+
+/* Record the new lazy mode; leaving lazy mode flushes queued hypercalls. */
+static void fastcall lguest_lazy_mode(int mode)
+{
+	lazy_mode = mode;
+	if (mode != PARAVIRT_LAZY_NONE)
+		return;
+	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+}
+
+/* Hypercall that is queued while in a lazy mode, immediate otherwise. */
+static void lazy_hcall(unsigned long call,
+		       unsigned long arg1,
+		       unsigned long arg2,
+		       unsigned long arg3)
+{
+	if (lazy_mode != PARAVIRT_LAZY_NONE) {
+		async_hcall(call, arg1, arg2, arg3);
+		return;
+	}
+	hcall(call, arg1, arg2, arg3);
+}
+#else
+#define lazy_hcall hcall
+#endif
+
+/* Return the guest's virtual interrupt-flag word from lguest_data. */
+static unsigned long fastcall save_fl(void)
+{
+	unsigned long flags = lguest_data.irq_enabled;
+
+	return flags;
+}
+
+/* Store a previously saved interrupt-flag word back into lguest_data. */
+static void fastcall restore_fl(unsigned long flags)
+{
+	lguest_data.irq_enabled = flags;
+	/* FIXME: Check if interrupt pending... */
+}
+
+/* Virtual "cli": clear the interrupt-flag word in lguest_data. */
+static void fastcall irq_disable(void)
+{
+	lguest_data.irq_enabled = 0;
+}
+
+/* Virtual "sti": set the interrupt-flag word in lguest_data. */
+static void fastcall irq_enable(void)
+{
+	/* Linux i386 code expects bit 9 set. */
+	lguest_data.irq_enabled = 1 << 9;
+	/* FIXME: Check if interrupt pending... */
+}
+
+/* Hand the whole GDT to the host.  The descriptor must describe exactly
+ * GDT_ENTRIES entries. */
+static void fastcall lguest_load_gdt(const struct Xgt_desc_struct *desc)
+{
+	unsigned int entries = (desc->size+1)/8;
+
+	BUG_ON(entries != GDT_ENTRIES);
+	hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
+}
+
+/* Hand the IDT to the host, one hypercall per entry. */
+static void fastcall lguest_load_idt(const struct Xgt_desc_struct *desc)
+{
+	struct desc_struct *idt = (void *)desc->address;
+	unsigned int count = (desc->size+1)/8;
+	unsigned int i = 0;
+
+	while (i < count) {
+		hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
+		i++;
+	}
+}
+
+/* Panic notifier: pass the physical address of the notifier's data
+ * pointer to the host via LHCALL_CRASH. */
+static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
+{
+	unsigned long msg = __pa(p);
+
+	hcall(LHCALL_CRASH, msg, 0, 0);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block paniced = {
+	.notifier_call = lguest_panic
+};
+
+/* Clocksource read hook: currently just whatever paravirt_ops.read_tsc
+ * returns. */
+static cycle_t lguest_clock_read(void)
+{
+	cycle_t now;
+
+	/* FIXME: This is just the native one.  Account stolen time! */
+	now = paravirt_ops.read_tsc();
+	return now;
+}
+
+/* FIXME: Update iff tsc rate changes. */
+static struct clocksource lguest_clock = {
+	.name			= "lguest",
+	.rating			= 400,
+	.read			= lguest_clock_read,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.is_continuous		= 1,
+};
+
+/* Memory setup hook: registers the panic notifier and the lguest
+ * clocksource, then describes memory as one RAM region from 0 up to
+ * boot->max_pfn.  Returns the name "LGUEST". */
+static char *lguest_memory_setup(void)
+{
+	/* We do these here because lockcheck barfs if before start_kernel */
+	atomic_notifier_chain_register(&panic_notifier_list, &paniced);
+
+	lguest_clock.mult = lguest_data.clock_mult;
+	clocksource_register(&lguest_clock);
+
+	/* Start from an empty e820 map. */
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
+
+	return "LGUEST";
+}
+
+/* cpuid replacement: run the native cpuid, then for leaf 1 edit the
+ * feature words so the kernel sees PGE and none of the features we do
+ * not provide. */
+static fastcall void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	unsigned long *excap, *features;
+	/* Must be sampled first: native_cpuid() overwrites *eax. */
+	int is_feature = (*eax == 1);
+
+	native_cpuid(eax, ebx, ecx, edx);
+	if (!is_feature)
+		return;
+
+	excap = (unsigned long *)ecx;
+	features = (unsigned long *)edx;
+
+	/* Hypervisor needs to know when we flush kernel pages. */
+	set_bit(X86_FEATURE_PGE, features);
+	/* We don't have any features! */
+	clear_bit(X86_FEATURE_VME, features);
+	clear_bit(X86_FEATURE_DE, features);
+	clear_bit(X86_FEATURE_PSE, features);
+	clear_bit(X86_FEATURE_PAE, features);
+	clear_bit(X86_FEATURE_SEP, features);
+	clear_bit(X86_FEATURE_APIC, features);
+	clear_bit(X86_FEATURE_MTRR, features);
+	/* No MWAIT, either */
+	clear_bit(3, excap);
+}
+
+/* Last value written through lguest_write_cr3(); 0 until the first write. */
+static unsigned long current_cr3;
+
+/* Tell the host about the new page table, then remember it locally. */
+static void fastcall lguest_write_cr3(unsigned long cr3)
+{
+	hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
+
+	current_cr3 = cr3;
+}
+
+/* TLB flush: LHCALL_FLUSH_TLB with first argument 0. */
+static void fastcall lguest_flush_tlb(void)
+{
+	const unsigned long kernel_too = 0;
+
+	lazy_hcall(LHCALL_FLUSH_TLB, kernel_too, 0, 0);
+}
+
+/* Kernel TLB flush: LHCALL_FLUSH_TLB with first argument 1. */
+static void fastcall lguest_flush_tlb_kernel(void)
+{
+	const unsigned long kernel_too = 1;
+
+	lazy_hcall(LHCALL_FLUSH_TLB, kernel_too, 0, 0);
+}
+
+/* Flush one address by asking the host to set its PTE to zero in the
+ * current page table. */
+static void fastcall lguest_flush_tlb_single(u32 addr)
+{
+	const unsigned long empty_pte = 0;
+
+	/* Simply set it to zero, and it will fault back in. */
+	lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, empty_pte);
+}
+
+/* FIXME: Eliminate all callers of this.
+ *
+ * Writes the PTE and, as the address it maps is not passed along, makes
+ * the LHCALL_SET_UNKNOWN_PTE hypercall. */
+static fastcall void lguest_set_pte(pte_t *ptep, pte_t pteval)
+{
+	*ptep = pteval;
+
+	/* Don't bother with hypercall before initial setup. */
+	if (!current_cr3)
+		return;
+	hcall(LHCALL_SET_UNKNOWN_PTE, 0, 0, 0);
+}
+
+/* Write the PTE and tell the host which page table and address changed. */
+static fastcall void lguest_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+	unsigned long pgdir = __pa(mm->pgd);
+
+	*ptep = pteval;
+	lazy_hcall(LHCALL_SET_PTE, pgdir, addr, pteval.pte_low);
+}
+
+/* We only support two-level pagetables at the moment.
+ *
+ * Writes the entry and tells the host which page (physical address of the
+ * page holding the entry) and which 4-byte slot within it changed. */
+static fastcall void lguest_set_pud(pmd_t *pmdp, pmd_t pmdval)
+{
+	unsigned long phys = __pa(pmdp);
+
+	*pmdp = pmdval;
+	lazy_hcall(LHCALL_SET_PUD, phys&PAGE_MASK,
+		   (phys&(PAGE_SIZE-1))/4, 0);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* APIC accessors are stubs: writes are discarded, reads return 0. */
+static fastcall void lguest_apic_write(unsigned long reg, unsigned long v)
+{
+	/* Nothing to do. */
+}
+
+static fastcall void lguest_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+	/* Nothing to do. */
+}
+
+static fastcall unsigned long lguest_apic_read(unsigned long reg)
+{
+	return 0;
+}
+#endif
+
+/* We move eflags word to lguest_data.irq_enabled to restore interrupt
+   state.  For page faults, gpfs and virtual interrupts, the
+   hypervisor has saved eflags manually, otherwise it was delivered
+   directly and so eflags reflects the real machine IF state,
+   ie. interrupts on.  Since the kernel always dies if it takes such a
+   trap with interrupts disabled anyway, turning interrupts back on
+   unconditionally here is OK.
+
+   The word is fetched from 12(%esp) once %eax has been pushed.  The
+   store into lguest_data, the pop of %eax and the iret itself sit
+   between the lguest_noirq_start and lguest_noirq_end labels, which are
+   exported below alongside lguest_iret. */
+asm("lguest_iret:"
+    " pushl	%eax;"
+    " movl	12(%esp), %eax;"
+    "lguest_noirq_start:;"
+    " movl	%eax,%ss:lguest_data+"__stringify(LGUEST_DATA_irq_enabled)";"
+    " popl	%eax;"
+    " iret;"
+    "lguest_noirq_end:");
+extern void fastcall lguest_iret(void);
+extern char lguest_noirq_start[], lguest_noirq_end[];
+
+/* Tell the host the kernel stack to use: segment __KERNEL_DS|1, the
+ * thread's esp0 and the stack size in pages. */
+static void fastcall lguest_load_esp0(struct tss_struct *tss,
+				     struct thread_struct *thread)
+{
+	const unsigned long stack_pages = THREAD_SIZE/PAGE_SIZE;
+
+	lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
+		   stack_pages);
+}
+
+/* Loading the task register is a no-op in the guest. */
+static fastcall void lguest_load_tr_desc(void)
+{
+	/* Nothing to do. */
+}
+
+/* LDTs are not supported yet: anything but an empty LDT is a bug. */
+static fastcall void lguest_set_ldt(const void *addr, unsigned entries)
+{
+	/* FIXME: Implement. */
+	BUG_ON(entries != 0);
+}
+
+/* Hand the physical address of the thread's TLS descriptor array (and
+ * the cpu number) to the host. */
+static fastcall void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	unsigned long tls = __pa(&t->tls_array);
+
+	lazy_hcall(LHCALL_LOAD_TLS, tls, cpu, 0);
+}
+
+/* Debug register writes are silently dropped for now. */
+static fastcall void lguest_set_debugreg(int regno, unsigned long value)
+{
+	/* FIXME: Implement */
+}
+
+/* Guest-side copy of cr0, as last written through the hooks below. */
+static unsigned int lguest_cr0;
+
+/* Clear the TS bit (bit 3): tell the host, then update our copy. */
+static fastcall void lguest_clts(void)
+{
+	lazy_hcall(LHCALL_TS, 0, 0, 0);
+
+	lguest_cr0 &= ~8U;
+}
+
+/* Return the guest-side copy of cr0. */
+static fastcall unsigned long lguest_read_cr0(void)
+{
+	unsigned long cr0 = lguest_cr0;
+
+	return cr0;
+}
+
+/* Pass the TS bit (bit 3) of the new value to the host, then remember
+ * the whole value locally. */
+static fastcall void lguest_write_cr0(unsigned long val)
+{
+	unsigned long ts = val & 8;
+
+	hcall(LHCALL_TS, ts, 0, 0);
+	lguest_cr0 = val;
+}
+
+/* cr2 is read from the cr2 field of lguest_data. */
+static fastcall unsigned long lguest_read_cr2(void)
+{
+	unsigned long cr2 = lguest_data.cr2;
+
+	return cr2;
+}
+
+/* Return the value last written through lguest_write_cr3(). */
+static fastcall unsigned long lguest_read_cr3(void)
+{
+	unsigned long cr3 = current_cr3;
+
+	return cr3;
+}
+
+/* Used to enable/disable PGE, but we don't care: cr4 always reads as 0. */
+static fastcall unsigned long lguest_read_cr4(void)
+{
+	const unsigned long no_bits = 0;
+
+	return no_bits;
+}
+
+/* cr4 writes are ignored (see lguest_read_cr4). */
+static fastcall void lguest_write_cr4(unsigned long val)
+{
+	/* Nothing to do. */
+}
+
+/* Timer interrupt handler: advance time by the value the host returns
+ * for LHCALL_TIMER_READ, then do per-process time accounting. */
+static void fastcall lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+{
+	unsigned long ticks = hcall(LHCALL_TIMER_READ, 0, 0, 0);
+
+	do_timer(ticks);
+	update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+/* Mask an interrupt: set its bit in lguest_data.interrupts. */
+static void disable_lguest_irq(unsigned int irq)
+{
+	set_bit(irq, lguest_data.interrupts);
+}
+
+/* Unmask an interrupt: clear its bit in lguest_data.interrupts. */
+static void enable_lguest_irq(unsigned int irq)
+{
+	/* FIXME: If it's pending? */
+	clear_bit(irq, lguest_data.interrupts);
+}
+
+static struct irq_chip lguest_irq_controller = {
+	.name		= "lguest",
+	.mask		= disable_lguest_irq,
+	.mask_ack	= disable_lguest_irq,
+	.unmask		= enable_lguest_irq,
+};
+
+/* Route irq 0 to lguest_time_irq and ask the host to start the timer,
+ * passing HZ. */
+static void lguest_time_init(void)
+{
+	const unsigned int timer_irq = 0;
+
+	set_irq_handler(timer_irq, lguest_time_irq);
+	hcall(LHCALL_TIMER_START,HZ,0,0);
+}
+
+/* Install an interrupt gate and the lguest irq chip (level-triggered
+ * flow) for every lguest interrupt that fits in NR_IRQS, skipping the
+ * one whose vector is the system call vector. */
+static void __init lguest_init_IRQ(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_IRQS && i < NR_IRQS; i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+
+		if (vector == SYSCALL_VECTOR)
+			continue;
+
+		set_intr_gate(vector, interrupt[i]);
+		set_irq_chip_and_handler(i, &lguest_irq_controller,
+					 handle_level_irq);
+	}
+	irq_ctx_init(smp_processor_id());
+}
+
+/* Store the two 32-bit halves of 8-byte descriptor number "entry" in the
+ * table at dt. */
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+	u32 *slot = (u32 *)dt + entry*2;
+
+	slot[0] = entry_low;
+	slot[1] = entry_high;
+}
+
+/* LDT entries cannot be written yet: any attempt is a bug. */
+static fastcall void lguest_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+	/* FIXME: Allow this. */
+	BUG();
+}
+
+static fastcall void lguest_write_gdt_entry(void *dt, int entrynum,
+					   u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
+}
+
+static fastcall void lguest_write_idt_entry(void *dt, int entrynum,
+					   u32 low, u32 high)
+{
+	native_write_dt_entry(dt, entrynum, low, high);
+	hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
+}
+
+#define LGUEST_IRQ "lguest_data+"__stringify(LGUEST_DATA_irq_enabled)
+#define DEF_LGUEST(name, code)				\
+	extern const char start_##name[], end_##name[];		\
+	asm("start_" #name ": " code "; end_" #name ":")
+DEF_LGUEST(cli, "movl $0," LGUEST_IRQ);
+DEF_LGUEST(sti, "movl $512," LGUEST_IRQ);
+DEF_LGUEST(popf, "movl %eax," LGUEST_IRQ);
+DEF_LGUEST(pushf, "movl " LGUEST_IRQ ",%eax");
+DEF_LGUEST(pushf_cli, "movl " LGUEST_IRQ ",%eax; movl $0," LGUEST_IRQ);
+DEF_LGUEST(iret, ".byte 0xE9,0,0,0,0"); /* jmp ... */
+
+static const struct lguest_insns
+{
+	const char *start, *end;
+} lguest_insns[] = {
+	[PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+	[PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+	[PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+	[PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+	[PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+	[PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+};
+static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
+{
+	unsigned int insn_len;
+
+	/* Don't touch it if we don't have a replacement */
+	if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
+		return len;
+
+	insn_len = lguest_insns[type].end - lguest_insns[type].start;
+
+	/* Similarly if we can't fit replacement. */
+	if (len < insn_len)
+		return len;
+
+	memcpy(insns, lguest_insns[type].start, insn_len);
+	if (type == PARAVIRT_INTERRUPT_RETURN) {
+		/* Jumps are relative. */
+		u32 off = (u32)lguest_iret - ((u32)insns + insn_len);
+		memcpy(insns+1, &off, sizeof(off));
+	}
+	return insn_len;
+}
+
+static void fastcall lguest_safe_halt(void)
+{
+	hcall(LHCALL_HALT, 0, 0, 0);
+}
+
+static unsigned long lguest_get_wallclock(void)
+{
+	return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
+}
+
+static void lguest_power_off(void)
+{
+	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
+}
+
+static __attribute_used__ __init void lguest_init(void)
+{
+	paravirt_ops.name = "lguest";
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = 1;
+
+	paravirt_ops.save_fl = save_fl;
+	paravirt_ops.restore_fl = restore_fl;
+	paravirt_ops.irq_disable = irq_disable;
+	paravirt_ops.irq_enable = irq_enable;
+	paravirt_ops.load_gdt = lguest_load_gdt;
+	paravirt_ops.memory_setup = lguest_memory_setup;
+	paravirt_ops.cpuid = lguest_cpuid;
+	paravirt_ops.write_cr3 = lguest_write_cr3;
+	paravirt_ops.flush_tlb_user = lguest_flush_tlb;
+	paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
+	paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+	paravirt_ops.set_pte = lguest_set_pte;
+	paravirt_ops.set_pte_at = lguest_set_pte_at;
+	paravirt_ops.set_pmd = lguest_set_pud;
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_write = lguest_apic_write;
+	paravirt_ops.apic_write_atomic = lguest_apic_write_atomic;
+	paravirt_ops.apic_read = lguest_apic_read;
+#endif
+	paravirt_ops.load_idt = lguest_load_idt;
+	paravirt_ops.iret = lguest_iret;
+	paravirt_ops.load_esp0 = lguest_load_esp0;
+	paravirt_ops.load_tr_desc = lguest_load_tr_desc;
+	paravirt_ops.set_ldt = lguest_set_ldt;
+	paravirt_ops.load_tls = lguest_load_tls;
+	paravirt_ops.set_debugreg = lguest_set_debugreg;
+	paravirt_ops.clts = lguest_clts;
+	paravirt_ops.read_cr0 = lguest_read_cr0;
+	paravirt_ops.write_cr0 = lguest_write_cr0;
+	paravirt_ops.init_IRQ = lguest_init_IRQ;
+	paravirt_ops.read_cr2 = lguest_read_cr2;
+	paravirt_ops.read_cr3 = lguest_read_cr3;
+	paravirt_ops.read_cr4 = lguest_read_cr4;
+	paravirt_ops.write_cr4 = lguest_write_cr4;
+	paravirt_ops.write_ldt_entry = lguest_write_ldt_entry;
+	paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
+	paravirt_ops.write_idt_entry = lguest_write_idt_entry;
+	paravirt_ops.patch = lguest_patch;
+	paravirt_ops.safe_halt = lguest_safe_halt;
+	paravirt_ops.get_wallclock = lguest_get_wallclock;
+	paravirt_ops.time_init = lguest_time_init;
+#ifdef PARAVIRT_LAZY_NONE
+	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
+#endif
+
+	memset(lguest_data.hcall_status,0xFF,sizeof(lguest_data.hcall_status));
+	lguest_data.noirq_start = (u32)lguest_noirq_start;
+	lguest_data.noirq_end = (u32)lguest_noirq_end;
+	hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
+	strncpy(saved_command_line, boot->cmdline, COMMAND_LINE_SIZE);
+	saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+	/* We use top of mem for initial pagetables. */
+	init_pg_tables_end = __pa(pg0);
+
+	/* set up PDA descriptor */
+	pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
+			(u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
+			(unsigned)&boot_pda, sizeof(boot_pda)-1,
+			0x80 | DESCTYPE_S | 0x02, 0);
+	load_gdt(&early_gdt_descr);
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+
+	reserve_top_address(lguest_data.reserve_mem);
+
+	cpu_detect(&new_cpu_data);
+	/* Need this before paging_init. */
+	set_bit(X86_FEATURE_PGE, new_cpu_data.x86_capability);
+	/* Math is always hard! */
+	new_cpu_data.hard_math = 1;
+
+	/* FIXME: Better way? */
+	/* Suppress vgacon startup code */
+	SCREEN_INFO.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+	add_preferred_console("hvc", 0, NULL);
+
+#ifdef CONFIG_X86_MCE
+	mce_disabled = 1;
+#endif
+
+#ifdef CONFIG_ACPI
+	acpi_disabled = 1;
+	acpi_ht = 0;
+#endif
+	if (boot->initrd_size) {
+		/* We stash this at top of memory. */
+		INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
+		INITRD_SIZE = boot->initrd_size;
+		LOADER_TYPE = 0xFF;
+	}
+
+	pm_power_off = lguest_power_off;
+	start_kernel();
+}
+
+asm("lguest_maybe_init:\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EBP)", %ebp\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_EDI)", %edi\n"
+    "	jne 1f\n"
+    "	cmpl $"__stringify(LGUEST_MAGIC_ESI)", %esi\n"
+    "	je lguest_init\n"
+    "1: ret");
+extern void asmlinkage lguest_maybe_init(void);
+paravirt_probe(lguest_maybe_init);
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_bus.c
@@ -0,0 +1,180 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <asm/lguest_device.h>
+#include <asm/lguest.h>
+#include <asm/io.h>
+
+static ssize_t type_show(struct device *_dev,
+                         struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hu", lguest_devices[dev->index].type);
+}
+static ssize_t features_show(struct device *_dev,
+                             struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].features);
+}
+static ssize_t pfn_show(struct device *_dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%u", lguest_devices[dev->index].pfn);
+}
+static ssize_t status_show(struct device *_dev,
+                           struct device_attribute *attr, char *buf)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	return sprintf(buf, "%hx", lguest_devices[dev->index].status);
+}
+static ssize_t status_store(struct device *_dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1)
+		return -EINVAL;
+	return count;
+}
+static struct device_attribute lguest_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_RO(features),
+	__ATTR_RO(pfn),
+	__ATTR(status, 0644, status_show, status_store),
+	__ATTR_NULL
+};
+
+static int lguest_dev_match(struct device *_dev, struct device_driver *_drv)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv);
+
+	return (drv->device_type == lguest_devices[dev->index].type);
+}
+
+struct lguest_bus {
+	struct bus_type bus;
+	struct device dev;
+};
+
+static struct lguest_bus lguest_bus = {
+	.bus = {
+		.name  = "lguest",
+		.match = lguest_dev_match,
+		.dev_attrs = lguest_dev_attrs,
+	},
+	.dev = {
+		.parent = NULL,
+		.bus_id = "lguest",
+	}
+};
+
+static int lguest_dev_probe(struct device *_dev)
+{
+	int ret;
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						struct lguest_driver, drv);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER;
+	ret = drv->probe(dev);
+	if (ret == 0)
+		lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK;
+	return ret;
+}
+
+static int lguest_dev_remove(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+	struct lguest_driver *drv = container_of(dev->dev.driver,
+						struct lguest_driver, drv);
+
+	if (dev->dev.driver && drv->remove)
+		drv->remove(dev);
+	put_device(&dev->dev);
+	return 0;
+}
+
+int register_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return 0;
+	
+	drv->drv.bus = &lguest_bus.bus;
+	drv->drv.name = drv->name;
+	drv->drv.owner = drv->owner;
+	drv->drv.probe = lguest_dev_probe;
+	drv->drv.remove = lguest_dev_remove;
+
+	return driver_register(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(register_lguest_driver);
+
+void unregister_lguest_driver(struct lguest_driver *drv)
+{
+	if (!lguest_devices)
+		return;
+
+	driver_unregister(&drv->drv);
+}
+EXPORT_SYMBOL_GPL(unregister_lguest_driver);
+
+static void release_lguest_device(struct device *_dev)
+{
+	struct lguest_device *dev = container_of(_dev,struct lguest_device,dev);
+
+	lguest_devices[dev->index].status |= LGUEST_DEVICE_S_REMOVED_ACK;
+	kfree(dev);
+}
+
+static void add_lguest_device(unsigned int index)
+{
+	struct lguest_device *new;
+
+	lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE;
+	new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL);
+	if (!new) {
+		printk(KERN_EMERG "Cannot allocate lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		return;
+	}
+
+	new->index = index;
+	new->private = NULL;
+	memset(&new->dev, 0, sizeof(new->dev));
+	new->dev.parent = &lguest_bus.dev;
+	new->dev.bus = &lguest_bus.bus;
+	new->dev.release = release_lguest_device;
+	sprintf(new->dev.bus_id, "%u", index);
+	if (device_register(&new->dev) != 0) {
+		printk(KERN_EMERG "Cannot register lguest device %u\n", index);
+		lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED;
+		put_device(&new->dev);	/* release_lguest_device() frees it */
+	}
+}
+
+static void scan_devices(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < LGUEST_MAX_DEVICES; i++)
+		if (lguest_devices[i].type)
+			add_lguest_device(i);
+}
+
+static int __init lguest_bus_init(void)
+{
+	if (strcmp(paravirt_ops.name, "lguest") != 0)
+		return 0;
+
+	/* Devices are in page above top of "normal" mem; mapping can fail. */
+	lguest_devices = ioremap(max_pfn << PAGE_SHIFT, PAGE_SIZE);
+
+	if (!lguest_devices || bus_register(&lguest_bus.bus) != 0
+	    || device_register(&lguest_bus.dev) != 0)
+		panic("lguest bus registration failed");
+
+	scan_devices();
+	return 0;
+}
+postcore_initcall(lguest_bus_init);
===================================================================
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -97,4 +97,5 @@ extern struct i386_pda _proxy_pda;
 #define sub_pda(field,val) pda_to_op("sub",field,val)
 #define or_pda(field,val) pda_to_op("or",field,val)
 
+extern struct i386_pda boot_pda;
 #endif	/* _I386_PDA_H */



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 7 of 7]  lguest: Makefile
  2007-02-09 15:17           ` [PATCH 6 of 7] lguest: Guest code Rusty Russell
@ 2007-02-09 15:21             ` Rusty Russell
  0 siblings, 0 replies; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 15:21 UTC (permalink / raw)
  To: lkml - Kernel Mailing List; +Cc: Andi Kleen, Andrew Morton, virtualization

[ This has no changes since 6d/10.  None!  Andi tried to make me change
it, but I defended it.  This one patch is unscathed.  Bwaha! ]

Finally, we put in the Makefile, so it will build.

There's a small complexity in creating the switcher code
(hypervisor.S) ready to be copied into the top of memory, but
it's not too bad.  Really.  Stop looking at me like that, Andi.
It's fine as it is.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -108,6 +108,7 @@ drivers-$(CONFIG_PCI)			+= arch/i386/pci
 # must be linked after kernel/
 drivers-$(CONFIG_OPROFILE)		+= arch/i386/oprofile/
 drivers-$(CONFIG_PM)			+= arch/i386/power/
+drivers-$(CONFIG_LGUEST_GUEST)		+= arch/i386/lguest/
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/Makefile
@@ -0,0 +1,22 @@
+# Guest requires the paravirt_ops replacement and the bus driver.
+obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_bus.o
+
+# Host requires the other files, which can be a module.
+obj-$(CONFIG_LGUEST)	+= lg.o
+lg-objs := core.o hypercalls.o page_tables.o interrupts_and_traps.o \
+	segments.o io.o lguest_user.o
+
+# We use top 4MB for guest traps page, then hypervisor.
+HYPE_ADDR := (0xFFC00000+4096)
+# The data is only 1k (256 interrupt handler pointers)
+HYPE_DATA_SIZE := 1024
+CFLAGS += -DHYPE_ADDR="$(HYPE_ADDR)" -DHYPE_DATA_SIZE="$(HYPE_DATA_SIZE)"
+
+$(obj)/core.o: $(obj)/hypervisor-blob.c
+# This links the hypervisor in the right place and turns it into a C array.
+$(obj)/hypervisor-raw: $(obj)/hypervisor.o
+	@$(LD) -static -Tdata=`printf %#x $$(($(HYPE_ADDR)))` -Ttext=`printf %#x $$(($(HYPE_ADDR)+$(HYPE_DATA_SIZE)))` -o $@ $< && $(OBJCOPY) -O binary $@
+$(obj)/hypervisor-blob.c: $(obj)/hypervisor-raw
+	@od -tx1 -An -v $< | sed -e 's/^ /0x/' -e 's/$$/,/' -e 's/ /,0x/g' > $@
+
+clean-files := hypervisor-blob.c hypervisor-raw



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4 of 7]  lguest: Config and headers
  2007-02-09 15:09       ` [PATCH 4 of 7] lguest: Config and headers Rusty Russell
  2007-02-09 15:14         ` [PATCH 5 of 7] lguest: the host code (lg.ko) Rusty Russell
@ 2007-02-09 18:15         ` James Morris
  2007-02-09 23:41           ` Rusty Russell
  1 sibling, 1 reply; 12+ messages in thread
From: James Morris @ 2007-02-09 18:15 UTC (permalink / raw)
  To: Rusty Russell
  Cc: lkml - Kernel Mailing List, Andi Kleen, Andrew Morton, virtualization

On Sat, 10 Feb 2007, Rusty Russell wrote:

> +/* 64k ought to be enough for anybody! */
> +#define HYPERVISOR_MAP_ORDER 16
> +#define HYPERVISOR_PAGES ((1 << HYPERVISOR_MAP_ORDER)/PAGE_SIZE)

I think it'd be better to go back to defining HYPERVISOR_SIZE then derive 
the map order from that via get_order(), as it should be 4 instead of 16; 
and this code is now both implying PAGE_SIZE while also using it for 
calculations. 


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4 of 7]  lguest: Config and headers
  2007-02-09 18:15         ` [PATCH 4 of 7] lguest: Config and headers James Morris
@ 2007-02-09 23:41           ` Rusty Russell
  2007-02-10  3:45             ` James Morris
  0 siblings, 1 reply; 12+ messages in thread
From: Rusty Russell @ 2007-02-09 23:41 UTC (permalink / raw)
  To: James Morris
  Cc: lkml - Kernel Mailing List, Andi Kleen, Andrew Morton, virtualization

On Fri, 2007-02-09 at 13:15 -0500, James Morris wrote:
> On Sat, 10 Feb 2007, Rusty Russell wrote:
> 
> > +/* 64k ought to be enough for anybody! */
> > +#define HYPERVISOR_MAP_ORDER 16
> > +#define HYPERVISOR_PAGES ((1 << HYPERVISOR_MAP_ORDER)/PAGE_SIZE)
> 
> I think it'd be better to go back to defining HYPERVISOR_SIZE then derive 
> the map order from that via get_order(), as it should be 4 instead of 16; 
> and this code is now both implying PAGE_SIZE while also using it for 
> calculations. 

Well it was the use of get_order() which triggered Andi's alarm bells,
so I went back to deriving it.  This code is correct, however.

get_order() is one of those classic functions only a kernel coder could
love.  Look how lovingly it has been optimized:

#define get_order(n)							\
(									\
	__builtin_constant_p(n) ?					\
	((n < (1UL << PAGE_SHIFT)) ? 0 : ilog2(n) - PAGE_SHIFT) :	\
	__get_order(n, PAGE_SHIFT)					\
 )

All that time spent, yet no consideration that it should be called
"get_page_order()" or some name which hints that the divide by page size
is happening.  It's even documented in the comment above, so someone
thought it needed explaining.  Too bad they chose to explain it instead
of actually clarifying it. 8(

Cheers,
Rusty.



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4 of 7]  lguest: Config and headers
  2007-02-09 23:41           ` Rusty Russell
@ 2007-02-10  3:45             ` James Morris
  2007-02-10  9:33               ` Rusty Russell
  0 siblings, 1 reply; 12+ messages in thread
From: James Morris @ 2007-02-10  3:45 UTC (permalink / raw)
  To: Rusty Russell
  Cc: lkml - Kernel Mailing List, Andi Kleen, Andrew Morton, virtualization

On Sat, 10 Feb 2007, Rusty Russell wrote:

> Well it was the use of get_order() which triggered Andi's alarm bells,
> so I went back to deriving it.  This code is correct, however.

+       hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_MAP_ORDER);
+       if (!hype_pages)
+               return -ENOMEM;

This will try and allocate 2^16 pages.  I guess we need a 
HYPERVISOR_PAGE_ORDER ?


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 4 of 7]  lguest: Config and headers
  2007-02-10  3:45             ` James Morris
@ 2007-02-10  9:33               ` Rusty Russell
  0 siblings, 0 replies; 12+ messages in thread
From: Rusty Russell @ 2007-02-10  9:33 UTC (permalink / raw)
  To: James Morris
  Cc: lkml - Kernel Mailing List, Andi Kleen, Andrew Morton, virtualization

On Fri, 2007-02-09 at 22:45 -0500, James Morris wrote:
> On Sat, 10 Feb 2007, Rusty Russell wrote:
> 
> > Well it was the use of get_order() which triggered Andi's alarm bells,
> > so I went back to deriving it.  This code is correct, however.
> 
> +       hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_MAP_ORDER);
> +       if (!hype_pages)
> +               return -ENOMEM;
> 
> This will try and allocate 2^16 pages.  I guess we need a 
> HYPERVISOR_PAGE_ORDER ?

Fuck a brick, you're right.  Worked here tho 8)  There's a moral here
somewhere about futzing with perfectly working code after midnight, I'm
sure.

Name: Don't allocate 2^16 pages in lg.ko

Code cleanup which avoided use of get_order() resulted in a thinko: we
allocated 65536 pages instead of 65536 bytes.  Thanks to James Morris
for pointing this out (twice!).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 3bcbcc7c7659 arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Sat Feb 10 10:44:25 2007 +1100
+++ b/arch/i386/lguest/core.c	Sat Feb 10 20:26:03 2007 +1100
@@ -24,8 +24,8 @@ static char __initdata hypervisor_blob[]
 #include "hypervisor-blob.c"
 };
 
-#define MAX_LGUEST_GUESTS						\
-	(((1 << HYPERVISOR_MAP_ORDER) - sizeof(hypervisor_blob))	\
+#define MAX_LGUEST_GUESTS						  \
+	(((PAGE_SIZE << HYPERVISOR_PAGE_ORDER) - sizeof(hypervisor_blob)) \
 	 / sizeof(struct lguest_state))
 
 static struct vm_struct *hypervisor_vma;
@@ -62,12 +62,12 @@ static __init int map_hypervisor(void)
 	int err;
 	struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
 
-	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_MAP_ORDER);
+	hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, HYPERVISOR_PAGE_ORDER);
 	if (!hype_pages)
 		return -ENOMEM;
 
-	hypervisor_vma = __get_vm_area(1 << HYPERVISOR_MAP_ORDER, VM_ALLOC,
-				       HYPE_ADDR, VMALLOC_END);
+	hypervisor_vma = __get_vm_area(PAGE_SIZE << HYPERVISOR_PAGE_ORDER,
+				       VM_ALLOC, HYPE_ADDR, VMALLOC_END);
 	if (!hypervisor_vma) {
 		err = -ENOMEM;
 		printk("lguest: could not map hypervisor pages high\n");
@@ -100,14 +100,14 @@ free_vma:
 free_vma:
 	vunmap(hypervisor_vma->addr);
 free_pages:
-	__free_pages(hype_pages, HYPERVISOR_MAP_ORDER);
+	__free_pages(hype_pages, HYPERVISOR_PAGE_ORDER);
 	return err;
 }
 
 static __exit void unmap_hypervisor(void)
 {
 	vunmap(hypervisor_vma->addr);
-	__free_pages(hype_pages, HYPERVISOR_MAP_ORDER);
+	__free_pages(hype_pages, HYPERVISOR_PAGE_ORDER);
 }
 
 /* IN/OUT insns: enough to get us past boot-time probing. */
diff -r 3bcbcc7c7659 arch/i386/lguest/lg.h
--- a/arch/i386/lguest/lg.h	Sat Feb 10 10:44:25 2007 +1100
+++ b/arch/i386/lguest/lg.h	Sat Feb 10 20:27:54 2007 +1100
@@ -3,8 +3,8 @@
 
 #include <asm/desc.h>
 /* 64k ought to be enough for anybody! */
-#define HYPERVISOR_MAP_ORDER 16
-#define HYPERVISOR_PAGES ((1 << HYPERVISOR_MAP_ORDER)/PAGE_SIZE)
+#define HYPERVISOR_PAGE_ORDER (16 - PAGE_SHIFT)
+#define HYPERVISOR_PAGES (1 << HYPERVISOR_PAGE_ORDER)
 
 #define GDT_ENTRY_LGUEST_CS	10
 #define GDT_ENTRY_LGUEST_DS	11



^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2007-02-10  9:34 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-09 14:59 [PATCH 0 of 7] lguest host code Rusty Russell
2007-02-09 15:03 ` [PATCH 1 of 7] lguest: Move mce_disabled to asm/mce.h so lguest can use it Rusty Russell
2007-02-09 15:03   ` [PATCH 2 of 7] lguest: Rename cpu_gdt_descr and remove extern declaration from smpboot.c Rusty Russell
2007-02-09 15:04     ` [PATCH 3 of 7] lguest: Remove extern declaration from mm/discontig.c, put in header Rusty Russell
2007-02-09 15:09       ` [PATCH 4 of 7] lguest: Config and headers Rusty Russell
2007-02-09 15:14         ` [PATCH 5 of 7] lguest: the host code (lg.ko) Rusty Russell
2007-02-09 15:17           ` [PATCH 6 of 7] lguest: Guest code Rusty Russell
2007-02-09 15:21             ` [PATCH 7 of 7] lguest: Makefile Rusty Russell
2007-02-09 18:15         ` [PATCH 4 of 7] lguest: Config and headers James Morris
2007-02-09 23:41           ` Rusty Russell
2007-02-10  3:45             ` James Morris
2007-02-10  9:33               ` Rusty Russell

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.