linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 10:05   ` Adrian Bunk
                     ` (3 more replies)
  2006-05-09  7:00 ` [RFC PATCH 02/35] Makefile support to build Xen subarch Chris Wright
                   ` (34 subsequent siblings)
  35 siblings, 4 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: config-xen --]
[-- Type: text/plain, Size: 4555 bytes --]

The XEN config option is selected from the i386 subarch menu by
choosing the X86_XEN "Xen-compatible" subarch.

The XEN_SHADOW_MODE option defines the memory virtualization mode for
the kernel -- with it enabled, the kernel expects the hypervisor to
perform translation between pseudo-physical and machine addresses on
its behalf.

The disabled config options are:
- DOUBLEFAULT: double faults are trapped by Xen and not virtualized
- HZ: defaults to 100 in Xen VMs
- Power management: not supported in unprivileged VMs
- SMP: not supported in this set of patches
- X86_{UP,LOCAL,IO}_APIC: not supported in unprivileged VMs

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/Kconfig       |   18 ++++++++++++++----
 arch/i386/Kconfig.debug |    1 +
 drivers/xen/Kconfig     |   21 +++++++++++++++++++++
 kernel/Kconfig.hz       |    4 ++--
 kernel/Kconfig.preempt  |    1 +
 5 files changed, 39 insertions(+), 6 deletions(-)

--- linus-2.6.orig/arch/i386/Kconfig
+++ linus-2.6/arch/i386/Kconfig
@@ -55,6 +55,7 @@ menu "Processor type and features"
 
 config SMP
 	bool "Symmetric multi-processing support"
+	depends on !X86_XEN
 	---help---
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, like most personal computers, say N. If
@@ -91,6 +92,12 @@ config X86_PC
 	help
 	  Choose this option if your computer is a standard PC or compatible.
 
+config X86_XEN
+	bool "Xen-compatible"
+	help
+	  Choose this option if you plan to run this kernel on top of the
+	  Xen Hypervisor.
+
 config X86_ELAN
 	bool "AMD Elan"
 	help
@@ -193,6 +200,7 @@ source "arch/i386/Kconfig.cpu"
 
 config HPET_TIMER
 	bool "HPET Timer Support"
+	depends on !X86_XEN
 	help
 	  This enables the use of the HPET for the kernel's internal timer.
 	  HPET is the next generation timer replacing legacy 8254s.
@@ -244,7 +252,7 @@ source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors"
-	depends on !SMP && !(X86_VISWS || X86_VOYAGER)
+	depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN)
 	help
 	  A local APIC (Advanced Programmable Interrupt Controller) is an
 	  integrated interrupt controller in the CPU. If you have a single-CPU
@@ -269,12 +277,12 @@ config X86_UP_IOAPIC
 
 config X86_LOCAL_APIC
 	bool
-	depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
+	depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || X86_XEN))
 	default y
 
 config X86_IO_APIC
 	bool
-	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
+	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN))
 	default y
 
 config X86_VISWS_APIC
@@ -767,7 +775,7 @@ endmenu
 
 
 menu "Power management options (ACPI, APM)"
-	depends on !X86_VOYAGER
+	depends on !(X86_VOYAGER || X86_XEN)
 
 source kernel/power/Kconfig
 
@@ -1089,6 +1097,8 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "drivers/xen/Kconfig"
+
 source "lib/Kconfig"
 
 #
--- linus-2.6.orig/arch/i386/Kconfig.debug
+++ linus-2.6/arch/i386/Kconfig.debug
@@ -84,6 +84,7 @@ config X86_MPPARSE
 config DOUBLEFAULT
 	default y
 	bool "Enable doublefault exception handler" if EMBEDDED
+	depends on !X86_XEN
 	help
           This option allows trapping of rare doublefault exceptions that
           would otherwise cause a system to silently reboot. Disabling this
--- linus-2.6.orig/kernel/Kconfig.hz
+++ linus-2.6/kernel/Kconfig.hz
@@ -3,7 +3,7 @@
 #
 
 choice
-	prompt "Timer frequency"
+	prompt "Timer frequency" if !XEN
 	default HZ_250
 	help
 	 Allows the configuration of the timer frequency. It is customary
@@ -40,7 +40,7 @@ endchoice
 
 config HZ
 	int
-	default 100 if HZ_100
+	default 100 if HZ_100 || XEN
 	default 250 if HZ_250
 	default 1000 if HZ_1000
 
--- linus-2.6.orig/kernel/Kconfig.preempt
+++ linus-2.6/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
+	depends on !XEN
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
--- /dev/null
+++ linus-2.6/drivers/xen/Kconfig
@@ -0,0 +1,21 @@
+#
+# This Kconfig describes Xen options
+#
+
+mainmenu "Xen Configuration"
+
+config XEN
+	bool
+	default y if X86_XEN
+	help
+	  This is the Linux Xen port.
+
+if XEN
+
+config XEN_SHADOW_MODE
+	bool
+	default y
+	help
+	  Fakes out a shadow mode kernel
+
+endif

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 02/35] Makefile support to build Xen subarch
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
                   ` (33 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-mach-xen --]
[-- Type: text/plain, Size: 1304 bytes --]

Use arch/i386/mach-xen when building Xen subarch. The separate
subarchitecture allows us to hide details of interfacing with the
hypervisor from i386 common code.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/Makefile          |    5 +++++
 arch/i386/mach-xen/Makefile |    7 +++++++
 2 files changed, 12 insertions(+)

--- linus-2.6.orig/arch/i386/Makefile
+++ linus-2.6/arch/i386/Makefile
@@ -71,6 +71,10 @@ mcore-$(CONFIG_X86_BIGSMP)	:= mach-defau
 mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
 mcore-$(CONFIG_X86_SUMMIT)  := mach-default
 
+# Xen subarch support
+mflags-$(CONFIG_X86_XEN)	:= -Iinclude/asm-i386/mach-xen
+mcore-$(CONFIG_X86_XEN)		:= mach-xen
+
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
 mcore-$(CONFIG_X86_GENERICARCH) := mach-default
@@ -99,6 +103,7 @@ drivers-$(CONFIG_PM)			+= arch/i386/powe
 
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
+CPPFLAGS += $(mflags-y)
 
 boot := arch/i386/boot
 
--- /dev/null
+++ linus-2.6/arch/i386/mach-xen/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y				:= setup.o
+
+setup-y				:= ../mach-default/setup.o

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 02/35] Makefile support to build Xen subarch Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
                     ` (2 more replies)
  2006-05-09  7:00 ` [RFC PATCH 04/35] Hypervisor " Chris Wright
                   ` (32 subsequent siblings)
  35 siblings, 3 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: xen-interface-headers --]
[-- Type: text/plain, Size: 85451 bytes --]

Add Xen interface header files. These are taken fairly directly from
the Xen tree and hence the style is not entirely in accordance with
Linux guidelines. There is a tension between fitting with Linux coding
rules and ease of maintenance.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/xen/interface/arch-x86_32.h   |  197 +++++++++++++++
 include/xen/interface/event_channel.h |  205 +++++++++++++++
 include/xen/interface/features.h      |   53 ++++
 include/xen/interface/grant_table.h   |  311 +++++++++++++++++++++++
 include/xen/interface/io/blkif.h      |   85 ++++++
 include/xen/interface/io/console.h    |   33 ++
 include/xen/interface/io/netif.h      |   84 ++++++
 include/xen/interface/io/ring.h       |  262 ++++++++++++++++++++
 include/xen/interface/io/xenbus.h     |   42 +++
 include/xen/interface/io/xs_wire.h    |   97 +++++++
 include/xen/interface/memory.h        |  155 +++++++++++
 include/xen/interface/physdev.h       |   71 +++++
 include/xen/interface/sched.h         |   87 ++++++
 include/xen/interface/vcpu.h          |  119 +++++++++
 include/xen/interface/version.h       |   70 +++++
 include/xen/interface/xen.h           |  441 ++++++++++++++++++++++++++++++++++
 16 files changed, 2312 insertions(+)

--- /dev/null
+++ linus-2.6/include/xen/interface/arch-x86_32.h
@@ -0,0 +1,197 @@
+/******************************************************************************
+ * arch-x86_32.h
+ * 
+ * Guest OS interface to x86 32-bit Xen.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
+#define __XEN_PUBLIC_ARCH_X86_32_H__
+
+#ifdef __XEN__
+#define __DEFINE_GUEST_HANDLE(name, type) \
+    typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define __DEFINE_GUEST_HANDLE(name, type) \
+    typedef type * __guest_handle_ ## name
+#endif
+
+#define DEFINE_GUEST_HANDLE_STRUCT(name) \
+	__DEFINE_GUEST_HANDLE(name, struct name)
+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
+#define GUEST_HANDLE(name)        __guest_handle_ ## name
+
+#ifndef __ASSEMBLY__
+/* Guest handles for primitive C types. */
+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
+__DEFINE_GUEST_HANDLE(uint,  unsigned int);
+__DEFINE_GUEST_HANDLE(ulong, unsigned long);
+DEFINE_GUEST_HANDLE(char);
+DEFINE_GUEST_HANDLE(int);
+DEFINE_GUEST_HANDLE(long);
+DEFINE_GUEST_HANDLE(void);
+#endif
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE  14
+#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS    FLAT_RING3_CS
+#define FLAT_USER_DS    FLAT_RING3_DS
+#define FLAT_USER_SS    FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The 
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#ifdef CONFIG_X86_PAE
+#define __HYPERVISOR_VIRT_START 0xF5800000
+#else
+#define __HYPERVISOR_VIRT_START 0xFC000000
+#endif
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table()
+ */
+#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
+#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
+struct trap_info {
+    uint8_t       vector;  /* exception vector                              */
+    uint8_t       flags;   /* bits 0-1: DPL; bit 2: clear event enable?     */
+    uint16_t      cs;      /* code selector                                 */
+    unsigned long address; /* code offset                                   */
+};
+DEFINE_GUEST_HANDLE_STRUCT(trap_info);
+
+struct cpu_user_regs {
+    uint32_t ebx;
+    uint32_t ecx;
+    uint32_t edx;
+    uint32_t esi;
+    uint32_t edi;
+    uint32_t ebp;
+    uint32_t eax;
+    uint16_t error_code;    /* private */
+    uint16_t entry_vector;  /* private */
+    uint32_t eip;
+    uint16_t cs;
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad0;
+    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
+    uint32_t esp;
+    uint16_t ss, _pad1;
+    uint16_t es, _pad2;
+    uint16_t ds, _pad3;
+    uint16_t fs, _pad4;
+    uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled 
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+struct vcpu_guest_context {
+    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_HVM_GUEST  (1<<1)
+#define VGCF_IN_KERNEL  (1<<2)
+    unsigned long flags;                    /* VGCF_* flags                 */
+    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
+    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
+    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
+    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
+    unsigned long debugreg[8];              /* DR0-DR7 (debug registers)    */
+    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
+    unsigned long failsafe_callback_eip;
+    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+struct arch_shared_info {
+    unsigned long max_pfn;                  /* max pfn that appears in table */
+    /* Frame containing list of mfns containing list of mfns containing p2m. */
+    unsigned long pfn_to_mfn_frame_list_list;
+    unsigned long nmi_reason;
+};
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/event_channel.h
@@ -0,0 +1,205 @@
+/******************************************************************************
+ * event_channel.h
+ * 
+ * Event channels between domains.
+ * 
+ * Copyright (c) 2003-2004, K A Fraser.
+ */
+
+#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
+#define __XEN_PUBLIC_EVENT_CHANNEL_H__
+
+typedef uint32_t evtchn_port_t;
+DEFINE_GUEST_HANDLE(evtchn_port_t);
+
+/*
+ * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
+ * accepting interdomain bindings from domain <remote_dom>. A fresh port
+ * is allocated in <dom> and returned as <port>.
+ * NOTES:
+ *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
+ *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
+ */
+#define EVTCHNOP_alloc_unbound    6
+struct evtchn_alloc_unbound {
+    /* IN parameters */
+    domid_t dom, remote_dom;
+    /* OUT parameters */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
+ * the calling domain and <remote_dom>. <remote_dom,remote_port> must identify
+ * a port that is unbound and marked as accepting bindings from the calling
+ * domain. A fresh port is allocated in the calling domain and returned as
+ * <local_port>.
+ * NOTES:
+ *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
+ */
+#define EVTCHNOP_bind_interdomain 0
+struct evtchn_bind_interdomain {
+    /* IN parameters. */
+    domid_t remote_dom;
+    evtchn_port_t remote_port;
+    /* OUT parameters. */
+    evtchn_port_t local_port;
+};
+
+/*
+ * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <virq> on specified
+ * vcpu.
+ * NOTES:
+ *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
+ *  2. The allocated event channel is bound to the specified vcpu. The binding
+ *     may not be changed.
+ */
+#define EVTCHNOP_bind_virq        1
+struct evtchn_bind_virq {
+    /* IN parameters. */
+    uint32_t virq;
+    uint32_t vcpu;
+    /* OUT parameters. */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <pirq>.
+ * NOTES:
+ *  1. A physical IRQ may be bound to at most one event channel per domain.
+ *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
+ */
+#define EVTCHNOP_bind_pirq        2
+struct evtchn_bind_pirq {
+    /* IN parameters. */
+    uint32_t pirq;
+#define BIND_PIRQ__WILL_SHARE 1
+    uint32_t flags; /* BIND_PIRQ__* */
+    /* OUT parameters. */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
+ * NOTES:
+ *  1. The allocated event channel is bound to the specified vcpu. The binding
+ *     may not be changed.
+ */
+#define EVTCHNOP_bind_ipi         7
+struct evtchn_bind_ipi {
+    uint32_t vcpu;
+    /* OUT parameters. */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_close: Close a local event channel <port>. If the channel is
+ * interdomain then the remote end is placed in the unbound state
+ * (EVTCHNSTAT_unbound), awaiting a new connection.
+ */
+#define EVTCHNOP_close            3
+struct evtchn_close {
+    /* IN parameters. */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_send: Send an event to the remote end of the channel whose local
+ * endpoint is <port>.
+ */
+#define EVTCHNOP_send             4
+struct evtchn_send {
+    /* IN parameters. */
+    evtchn_port_t port;
+};
+
+/*
+ * EVTCHNOP_status: Get the current status of the communication channel which
+ * has an endpoint at <dom, port>.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may obtain the status of an event
+ *     channel for which <dom> is not DOMID_SELF.
+ */
+#define EVTCHNOP_status           5
+struct evtchn_status {
+    /* IN parameters */
+    domid_t  dom;
+    evtchn_port_t port;
+    /* OUT parameters */
+#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
+#define EVTCHNSTAT_unbound      1  /* Channel is waiting interdom connection.*/
+#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
+#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
+#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
+#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
+    uint32_t status;
+    uint32_t vcpu;                 /* VCPU to which this channel is bound.   */
+    union {
+        struct {
+            domid_t dom;
+        } unbound; /* EVTCHNSTAT_unbound */
+        struct {
+            domid_t dom;
+            evtchn_port_t port;
+        } interdomain; /* EVTCHNSTAT_interdomain */
+        uint32_t pirq;      /* EVTCHNSTAT_pirq        */
+        uint32_t virq;      /* EVTCHNSTAT_virq        */
+    } u;
+};
+
+/*
+ * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
+ * event is pending.
+ * NOTES:
+ *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
+ *     the binding. This binding cannot be changed.
+ *  2. All other channels notify vcpu0 by default. This default is set when
+ *     the channel is allocated (a port that is freed and subsequently reused
+ *     has its binding reset to vcpu0).
+ */
+#define EVTCHNOP_bind_vcpu        8
+struct evtchn_bind_vcpu {
+    /* IN parameters. */
+    evtchn_port_t port;
+    uint32_t vcpu;
+};
+
+/*
+ * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
+ * a notification to the appropriate VCPU if an event is pending.
+ */
+#define EVTCHNOP_unmask           9
+struct evtchn_unmask {
+    /* IN parameters. */
+    evtchn_port_t port;
+};
+
+struct evtchn_op {
+    uint32_t cmd; /* EVTCHNOP_* */
+    union {
+        struct evtchn_alloc_unbound    alloc_unbound;
+        struct evtchn_bind_interdomain bind_interdomain;
+        struct evtchn_bind_virq        bind_virq;
+        struct evtchn_bind_pirq        bind_pirq;
+        struct evtchn_bind_ipi         bind_ipi;
+        struct evtchn_close            close;
+        struct evtchn_send             send;
+        struct evtchn_status           status;
+        struct evtchn_bind_vcpu        bind_vcpu;
+        struct evtchn_unmask           unmask;
+    } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
+
+#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/features.h
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * features.h
+ * 
+ * Feature flags, reported by XENVER_get_features.
+ * 
+ * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_FEATURES_H__
+#define __XEN_PUBLIC_FEATURES_H__
+
+/*
+ * If set, the guest does not need to write-protect its pagetables, and can
+ * update them via direct writes.
+ */
+#define XENFEAT_writable_page_tables       0
+
+/*
+ * If set, the guest does not need to write-protect its segment descriptor
+ * tables, and can update them via direct writes.
+ */
+#define XENFEAT_writable_descriptor_tables 1
+
+/*
+ * If set, translation between the guest's 'pseudo-physical' address space
+ * and the host's machine address space are handled by the hypervisor. In this
+ * mode the guest does not need to perform phys-to/from-machine translations
+ * when performing page table operations.
+ */
+#define XENFEAT_auto_translated_physmap    2
+
+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
+#define XENFEAT_supervisor_mode_kernel     3
+
+/*
+ * If set, the guest does not need to allocate x86 PAE page directories
+ * below 4GB. This flag is usually implied by auto_translated_physmap.
+ */
+#define XENFEAT_pae_pgdir_above_4gb        4
+
+#define XENFEAT_NR_SUBMAPS 1
+
+#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/grant_table.h
@@ -0,0 +1,311 @@
+/******************************************************************************
+ * grant_table.h
+ * 
+ * Interface for granting foreign access to page frames, and receiving
+ * page-ownership transfers.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
+#define __XEN_PUBLIC_GRANT_TABLE_H__
+
+
+/***********************************
+ * GRANT TABLE REPRESENTATION
+ */
+
+/* Some rough guidelines on accessing and updating grant-table entries
+ * in a concurrency-safe manner. For more information, Linux contains a
+ * reference implementation for guest OSes (arch/i386/mach-xen/grant_table.c).
+ * 
+ * NB. WMB is a no-op on current-generation x86 processors. However, a
+ *     compiler barrier will still be required.
+ * 
+ * Introducing a valid entry into the grant table:
+ *  1. Write ent->domid.
+ *  2. Write ent->frame:
+ *      GTF_permit_access:   Frame to which access is permitted.
+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ *                           frame, or zero if none.
+ *  3. Write memory barrier (WMB).
+ *  4. Write ent->flags, inc. valid type.
+ * 
+ * Invalidating an unused GTF_permit_access entry:
+ *  1. flags = ent->flags.
+ *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
+ *
+ * Invalidating an in-use GTF_permit_access entry:
+ *  This cannot be done directly. Request assistance from the domain controller
+ *  which can set a timeout on the use of a grant entry and take necessary
+ *  action. (NB. This is not yet implemented!).
+ * 
+ * Invalidating an unused GTF_accept_transfer entry:
+ *  1. flags = ent->flags.
+ *  2. Observe that !(flags & GTF_transfer_committed). [*]
+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
+ *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
+ *      The guest must /not/ modify the grant entry until the address of the
+ *      transferred frame is written. It is safe for the guest to spin waiting
+ *      for this to occur (detect by observing GTF_transfer_completed in
+ *      ent->flags).
+ *
+ * Invalidating a committed GTF_accept_transfer entry:
+ *  1. Wait for (ent->flags & GTF_transfer_completed).
+ *
+ * Changing a GTF_permit_access from writable to read-only:
+ *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
+ * 
+ * Changing a GTF_permit_access from read-only to writable:
+ *  Use SMP-safe bit-setting instruction.
+ */
+
+/*
+ * A grant table comprises a packed array of grant entries in one or more
+ * page frames shared between Xen and a guest.
+ * [XEN]: This field is written by Xen and read by the sharing guest.
+ * [GST]: This field is written by the guest and read by Xen.
+ */
+struct grant_entry {
+    /* GTF_xxx: various type and flag information.  [XEN,GST] */
+    uint16_t flags;
+    /* The domain being granted foreign privileges. [GST] */
+    domid_t  domid;
+    /*
+     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
+     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
+     */
+    uint32_t frame;
+};
+
+/*
+ * Type of grant entry.
+ *  GTF_invalid: This grant entry grants no privileges.
+ *  GTF_permit_access: Allow @domid to map/access @frame.
+ *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
+ *                       to this guest. Xen writes the page number to @frame.
+ */
+#define GTF_invalid         (0U<<0)
+#define GTF_permit_access   (1U<<0)
+#define GTF_accept_transfer (2U<<0)
+#define GTF_type_mask       (3U<<0)
+
+/*
+ * Subflags for GTF_permit_access.
+ *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
+ *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
+ *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ */
+#define _GTF_readonly       (2)
+#define GTF_readonly        (1U<<_GTF_readonly)
+#define _GTF_reading        (3)
+#define GTF_reading         (1U<<_GTF_reading)
+#define _GTF_writing        (4)
+#define GTF_writing         (1U<<_GTF_writing)
+
+/*
+ * Subflags for GTF_accept_transfer:
+ *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
+ *      to transferring ownership of a page frame. When a guest sees this flag
+ *      it must /not/ modify the grant entry until GTF_transfer_completed is
+ *      set by Xen.
+ *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
+ *      after reading GTF_transfer_committed. Xen will always write the frame
+ *      address, followed by ORing this flag, in a timely manner.
+ */
+#define _GTF_transfer_committed (2)
+#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
+#define _GTF_transfer_completed (3)
+#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
+
+
+/***********************************
+ * GRANT TABLE QUERIES AND USES
+ */
+
+/*
+ * Reference to a grant entry in a specified domain's grant table.
+ */
+typedef uint32_t grant_ref_t;
+
+/*
+ * Handle to track a mapping created via a grant reference.
+ */
+typedef uint32_t grant_handle_t;
+
+/*
+ * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
+ * by devices and/or host CPUs. If successful, <handle> is a tracking number
+ * that must be presented later to destroy the mapping(s). On error, <handle>
+ * is a negative status code.
+ * NOTES:
+ *  1. If GNTPIN_map_for_dev is specified then <dev_bus_addr> is the address
+ *     via which I/O devices may access the granted frame.
+ *  2. If GNTPIN_map_for_host is specified then a mapping will be added at
+ *     either a host virtual address in the current address space, or at
+ *     a PTE at the specified machine address.  The type of mapping to
+ *     perform is selected through the GNTMAP_contains_pte flag, and the 
+ *     address is specified in <host_addr>.
+ *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
+ *     host mapping is destroyed by other means then it is *NOT* guaranteed
+ *     to be accounted to the correct grant reference!
+ */
+#define GNTTABOP_map_grant_ref        0
+struct gnttab_map_grant_ref {
+    /* IN parameters. */
+    uint64_t host_addr;
+    uint32_t flags;               /* GNTMAP_* */
+    grant_ref_t ref;
+    domid_t  dom;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+    grant_handle_t handle;
+    uint64_t dev_bus_addr;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
+
+/*
+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
+ * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that
+ * field is ignored. If non-zero, they must refer to a device/host mapping
+ * that is tracked by <handle>.
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_grant_ref      1
+struct gnttab_unmap_grant_ref {
+    /* IN parameters. */
+    uint64_t host_addr;
+    uint64_t dev_bus_addr;
+    grant_handle_t handle;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
+
+/*
+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
+ * <nr_frames> pages. The frame addresses are written to the <frame_list>.
+ * Only <nr_frames> addresses are written, even if the table is larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ *  3. Xen may not support more than a single grant-table page per domain.
+ */
+#define GNTTABOP_setup_table          2
+struct gnttab_setup_table {
+    /* IN parameters. */
+    domid_t  dom;
+    uint32_t nr_frames;
+    /* OUT parameters. */
+    int16_t  status;              /* GNTST_* */
+    GUEST_HANDLE(ulong) frame_list;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
+
+/*
+ * GNTTABOP_dump_table: Dump the contents of the grant table to the
+ * xen console. Debugging use only.
+ */
+#define GNTTABOP_dump_table           3
+struct gnttab_dump_table {
+    /* IN parameters. */
+    domid_t dom;
+    /* OUT parameters. */
+    int16_t status;               /* GNTST_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
+
+/*
+ * GNTTABOP_transfer: Transfer the frame <mfn> to a foreign domain. The
+ * foreign domain has previously registered its interest in the transfer via
+ * <domid, ref>.
+ * 
+ * Note that, even if the transfer fails, the specified page no longer belongs
+ * to the calling domain *unless* the error is GNTST_bad_page.
+ */
+#define GNTTABOP_transfer                4
+struct gnttab_transfer {
+    /* IN parameters. */
+    unsigned long mfn;
+    domid_t       domid;
+    grant_ref_t   ref;
+    /* OUT parameters. */
+    int16_t       status;
+};
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
+
+/*
+ * Bitfield values for gnttab_map_grant_ref.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+#define _GNTMAP_device_map      (0)
+#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
+ /* Map the grant entry for access by host CPUs. */
+#define _GNTMAP_host_map        (1)
+#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
+ /* Accesses to the granted frame will be restricted to read-only access. */
+#define _GNTMAP_readonly        (2)
+#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
+ /*
+  * GNTMAP_host_map subflag:
+  *  0 => The host mapping is usable only by the guest OS.
+  *  1 => The host mapping is usable by guest OS + current application.
+  */
+#define _GNTMAP_application_map (3)
+#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
+
+ /*
+  * GNTMAP_contains_pte subflag:
+  *  0 => This map request contains a host virtual address.
+  *  1 => This map request contains the machine address of the PTE to update.
+  */
+#define _GNTMAP_contains_pte    (4)
+#define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
+
+/*
+ * Values for error status returns. All errors are -ve.
+ */
+#define GNTST_okay             (0)  /* Normal return.                        */
+#define GNTST_general_error    (-1) /* General undefined error.              */
+#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
+#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
+#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
+#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
+#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
+#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
+#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
+#define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
+
+#define GNTTABOP_error_msgs {                   \
+    "okay",                                     \
+    "undefined error",                          \
+    "unrecognised domain id",                   \
+    "invalid grant reference",                  \
+    "invalid mapping handle",                   \
+    "invalid virtual address",                  \
+    "invalid device address",                   \
+    "no spare translation slot in the I/O MMU", \
+    "permission denied",                        \
+    "bad page"                                  \
+}
+
+#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/blkif.h
@@ -0,0 +1,85 @@
+/******************************************************************************
+ * blkif.h
+ * 
+ * Unified block-device I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_BLKIF_H__
+#define __XEN_PUBLIC_IO_BLKIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ * Front->back notifications: When enqueuing a new request, sending a
+ * notification can be made conditional on req_event (i.e., the generic
+ * hold-off mechanism provided by the ring macros). Backends must set
+ * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()).
+ * 
+ * Back->front notifications: When enqueuing a new response, sending a
+ * notification can be made conditional on rsp_event (i.e., the generic
+ * hold-off mechanism provided by the ring macros). Frontends must set
+ * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
+ */
+
+#ifndef blkif_vdev_t
+#define blkif_vdev_t   uint16_t
+#endif
+#define blkif_sector_t uint64_t
+
+#define BLKIF_OP_READ      0
+#define BLKIF_OP_WRITE     1
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(struct blkif_sring) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
+ */
+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+
+struct blkif_request {
+    uint8_t        operation;    /* BLKIF_OP_???                         */
+    uint8_t        nr_segments;  /* number of segments                   */
+    blkif_vdev_t   handle;       /* only for read/write requests         */
+    uint64_t       id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+    struct blkif_request_segment {
+        grant_ref_t gref;        /* reference to I/O buffer frame        */
+        /* @first_sect: first sector in frame to transfer (inclusive).   */
+        /* @last_sect: last sector in frame to transfer (inclusive).     */
+        uint8_t     first_sect, last_sect;
+    } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+struct blkif_response {
+    uint64_t        id;              /* copied from request */
+    uint8_t         operation;       /* copied from request */
+    int16_t         status;          /* BLKIF_RSP_???       */
+};
+
+#define BLKIF_RSP_ERROR  -1 /* non-specific 'error' */
+#define BLKIF_RSP_OKAY    0 /* non-specific 'okay'  */
+
+/*
+ * Generate blkif ring structures and types.
+ */
+
+DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
+
+#define VDISK_CDROM        0x1
+#define VDISK_REMOVABLE    0x2
+#define VDISK_READONLY     0x4
+
+#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/console.h
@@ -0,0 +1,33 @@
+/******************************************************************************
+ * console.h
+ * 
+ * Console I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2005, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
+#define __XEN_PUBLIC_IO_CONSOLE_H__
+
+typedef uint32_t XENCONS_RING_IDX;
+
+#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1))
+
+struct xencons_interface {
+    char in[1024];
+    char out[2048];
+    XENCONS_RING_IDX in_cons, in_prod;
+    XENCONS_RING_IDX out_cons, out_prod;
+};
+
+#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/netif.h
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * netif.h
+ * 
+ * Unified network-device I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_NETIF_H__
+#define __XEN_PUBLIC_IO_NETIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ * Note that there is *never* any need to notify the backend when
+ * enqueuing receive requests (struct netif_rx_request). Notifications
+ * after enqueuing any other type of message should be conditional on
+ * the appropriate req_event or rsp_event field in the shared ring.
+ */
+
+/* Protocol checksum field is blank in the packet (hardware offload)? */
+#define _NETTXF_csum_blank     (0)
+#define  NETTXF_csum_blank     (1U<<_NETTXF_csum_blank)
+
+/* Packet data has been validated against protocol checksum. */
+#define _NETTXF_data_validated (1)
+#define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
+
+struct netif_tx_request {
+    grant_ref_t gref;      /* Reference to buffer page */
+    uint16_t offset;       /* Offset within buffer page */
+    uint16_t flags;        /* NETTXF_* */
+    uint16_t id;           /* Echoed in response message. */
+    uint16_t size;         /* Packet size in bytes.       */
+};
+
+struct netif_tx_response {
+    uint16_t id;
+    int16_t  status;       /* NETIF_RSP_* */
+};
+
+struct netif_rx_request {
+    uint16_t    id;        /* Echoed in response message.        */
+    grant_ref_t gref;      /* Reference to incoming granted frame */
+};
+
+/* Packet data has been validated against protocol checksum. */
+#define _NETRXF_data_validated (0)
+#define  NETRXF_data_validated (1U<<_NETRXF_data_validated)
+
+/* Protocol checksum field is blank in the packet (hardware offload)? */
+#define _NETRXF_csum_blank     (1)
+#define  NETRXF_csum_blank     (1U<<_NETRXF_csum_blank)
+
+struct netif_rx_response {
+    uint16_t id;
+    uint16_t offset;       /* Offset in page of start of received packet  */
+    uint16_t flags;        /* NETRXF_* */
+    int16_t  status;       /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
+};
+
+/*
+ * Generate netif ring structures and types.
+ */
+
+DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
+DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
+
+#define NETIF_RSP_DROPPED         -2
+#define NETIF_RSP_ERROR           -1
+#define NETIF_RSP_OKAY             0
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/ring.h
@@ -0,0 +1,262 @@
+/******************************************************************************
+ * ring.h
+ * 
+ * Shared producer-consumer ring macros.
+ *
+ * Tim Deegan and Andrew Warfield November 2004.
+ */
+
+#ifndef __XEN_PUBLIC_IO_RING_H__
+#define __XEN_PUBLIC_IO_RING_H__
+
+typedef unsigned int RING_IDX;
+
+/* Round a 32-bit unsigned constant down to the nearest power of two. */
+#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
+#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
+#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
+
+/*
+ * Calculate size of a shared ring, given the total available space for the
+ * ring and indexes (_sz), and a pointer to the shared ring structure (_s).
+ * A ring contains as many entries as will fit, rounded down to the nearest 
+ * power of two (so we can mask with (size-1) to loop around).
+ */
+#define __RING_SIZE(_s, _sz) \
+    (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+
+/*
+ * Macros to make the correct C datatypes for a new kind of ring.
+ * 
+ * To make a new ring datatype, you need to have two message structures,
+ * let's say struct request, and struct response already defined.
+ *
+ * In a header where you want the ring datatype declared, you then do:
+ *
+ *     DEFINE_RING_TYPES(mytag, struct request, struct response);
+ *
+ * These expand out to give you a set of types, as you can see below.
+ * The most important of these are:
+ * 
+ *     struct mytag_sring      - The shared ring.
+ *     struct mytag_front_ring - The 'front' half of the ring.
+ *     struct mytag_back_ring  - The 'back' half of the ring.
+ *
+ * To initialize a ring in your code you need to know the location and size
+ * of the shared memory area (PAGE_SIZE, for instance). To initialise
+ * the front half:
+ *
+ *     struct mytag_front_ring front_ring;
+ *     SHARED_RING_INIT((struct mytag_sring *)shared_page);
+ *     FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
+ *                     PAGE_SIZE);
+ *
+ * Initializing the back follows similarly (note that only the front
+ * initializes the shared ring):
+ *
+ *     struct mytag_back_ring back_ring;
+ *     BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
+ *                    PAGE_SIZE);
+ */
+
+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
+                                                                        \
+/* Shared ring entry */                                                 \
+union __name##_sring_entry {                                            \
+    __req_t req;                                                        \
+    __rsp_t rsp;                                                        \
+};                                                                      \
+                                                                        \
+/* Shared ring page */                                                  \
+struct __name##_sring {                                                 \
+    RING_IDX req_prod, req_event;                                       \
+    RING_IDX rsp_prod, rsp_event;                                       \
+    uint8_t  pad[48];                                                   \
+    union __name##_sring_entry ring[1]; /* variable-length */           \
+};                                                                      \
+                                                                        \
+/* "Front" end's private variables */                                   \
+struct __name##_front_ring {                                            \
+    RING_IDX req_prod_pvt;                                              \
+    RING_IDX rsp_cons;                                                  \
+    unsigned int nr_ents;                                               \
+    struct __name##_sring *sring;                                       \
+};                                                                      \
+                                                                        \
+/* "Back" end's private variables */                                    \
+struct __name##_back_ring {                                             \
+    RING_IDX rsp_prod_pvt;                                              \
+    RING_IDX req_cons;                                                  \
+    unsigned int nr_ents;                                               \
+    struct __name##_sring *sring;                                       \
+};
+
+/*
+ * Macros for manipulating rings.
+ * 
+ * FRONT_RING_whatever works on the "front end" of a ring: here 
+ * requests are pushed on to the ring and responses taken off it.
+ * 
+ * BACK_RING_whatever works on the "back end" of a ring: here 
+ * requests are taken off the ring and responses put on.
+ * 
+ * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. 
+ * This is OK in 1-for-1 request-response situations where the 
+ * requestor (front end) never has more than RING_SIZE()-1
+ * outstanding requests.
+ */
+
+/* Initialising empty rings */
+#define SHARED_RING_INIT(_s) do {                                       \
+    (_s)->req_prod  = (_s)->rsp_prod  = 0;                              \
+    (_s)->req_event = (_s)->rsp_event = 1;                              \
+    memset((_s)->pad, 0, sizeof((_s)->pad));                            \
+} while(0)
+
+#define FRONT_RING_INIT(_r, _s, __size) do {                            \
+    (_r)->req_prod_pvt = 0;                                             \
+    (_r)->rsp_cons = 0;                                                 \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+    (_r)->sring = (_s);                                                 \
+} while (0)
+
+#define BACK_RING_INIT(_r, _s, __size) do {                             \
+    (_r)->rsp_prod_pvt = 0;                                             \
+    (_r)->req_cons = 0;                                                 \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+    (_r)->sring = (_s);                                                 \
+} while (0)
+
+/* Initialize to existing shared indexes -- for recovery */
+#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
+    (_r)->sring = (_s);                                                 \
+    (_r)->req_prod_pvt = (_s)->req_prod;                                \
+    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+} while (0)
+
+#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
+    (_r)->sring = (_s);                                                 \
+    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
+    (_r)->req_cons = (_s)->req_prod;                                    \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+} while (0)
+
+/* How big is this ring? */
+#define RING_SIZE(_r)                                                   \
+    ((_r)->nr_ents)
+
+/* Test if the front ring is full, i.e. has no empty slot available.
+ * (This is only meaningful from the front. )
+ */
+#define RING_FULL(_r)                                                   \
+    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
+
+/* Test if there are outstanding messages to be processed on a ring. */
+#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
+    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
+
+#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
+    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
+     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
+
+/* Direct access to individual ring elements, by index. */
+#define RING_GET_REQUEST(_r, _idx)                                      \
+    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
+
+#define RING_GET_RESPONSE(_r, _idx)                                     \
+    (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
+
+/* Loop termination condition: Would the specified index overflow the ring? */
+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
+    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
+
+#define RING_PUSH_REQUESTS(_r) do {                                     \
+    wmb(); /* back sees requests /before/ updated producer index */     \
+    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
+} while (0)
+
+#define RING_PUSH_RESPONSES(_r) do {                                    \
+    wmb(); /* front sees responses /before/ updated producer index */   \
+    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
+} while (0)
+
+/*
+ * Notification hold-off (req_event and rsp_event):
+ * 
+ * When queueing requests or responses on a shared ring, it may not always be
+ * necessary to notify the remote end. For example, if requests are in flight
+ * in a backend, the front may be able to queue further requests without
+ * notifying the back (if the back checks for new requests when it queues
+ * responses).
+ * 
+ * When enqueuing requests or responses:
+ * 
+ *  Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument
+ *  is a boolean return value. True indicates that the receiver requires an
+ *  asynchronous notification.
+ * 
+ * After dequeuing requests or responses (before sleeping the connection):
+ * 
+ *  Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES().
+ *  The second argument is a boolean return value. True indicates that there
+ *  are pending messages on the ring (i.e., the connection should not be put
+ *  to sleep).
+ * 
+ *  These macros will set the req_event/rsp_event field to trigger a
+ *  notification on the very next message that is enqueued. If you want to
+ *  create batches of work (i.e., only receive a notification after several
+ *  messages have been enqueued) then you will need to create a customised
+ *  version of the FINAL_CHECK macro in your own code, which sets the event
+ *  field appropriately.
+ */
+
+#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {           \
+    RING_IDX __old = (_r)->sring->req_prod;                             \
+    RING_IDX __new = (_r)->req_prod_pvt;                                \
+    wmb(); /* back sees requests /before/ updated producer index */     \
+    (_r)->sring->req_prod = __new;                                      \
+    mb(); /* back sees new requests /before/ we check req_event */      \
+    (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <           \
+                 (RING_IDX)(__new - __old));                            \
+} while (0)
+
+#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {          \
+    RING_IDX __old = (_r)->sring->rsp_prod;                             \
+    RING_IDX __new = (_r)->rsp_prod_pvt;                                \
+    wmb(); /* front sees responses /before/ updated producer index */   \
+    (_r)->sring->rsp_prod = __new;                                      \
+    mb(); /* front sees new responses /before/ we check rsp_event */    \
+    (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <           \
+                 (RING_IDX)(__new - __old));                            \
+} while (0)
+
+#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do {             \
+    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
+    if (_work_to_do) break;                                             \
+    (_r)->sring->req_event = (_r)->req_cons + 1;                        \
+    mb();                                                               \
+    (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                   \
+} while (0)
+
+#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do {            \
+    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
+    if (_work_to_do) break;                                             \
+    (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                        \
+    mb();                                                               \
+    (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                  \
+} while (0)
+
+#endif /* __XEN_PUBLIC_IO_RING_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/xenbus.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+ * xenbus.h
+ *
+ * Xenbus protocol details.
+ *
+ * Copyright (C) 2005 XenSource Ltd.
+ */
+
+#ifndef _XEN_PUBLIC_IO_XENBUS_H
+#define _XEN_PUBLIC_IO_XENBUS_H
+
+/* The state of either end of the Xenbus, i.e. the current communication
+   status of initialisation across the bus.  States here imply nothing about
+   the state of the connection between the driver and the kernel's device
+   layers.  */
+typedef enum
+{
+  XenbusStateUnknown      = 0,
+  XenbusStateInitialising = 1,
+  XenbusStateInitWait     = 2,  /* Finished early initialisation, but waiting
+                                   for information from the peer or hotplug
+				   scripts. */
+  XenbusStateInitialised  = 3,  /* Initialised and waiting for a connection
+				   from the peer. */
+  XenbusStateConnected    = 4,
+  XenbusStateClosing      = 5,  /* The device is being closed due to an error
+				   or an unplug event. */
+  XenbusStateClosed       = 6
+
+} XenbusState;
+
+#endif /* _XEN_PUBLIC_IO_XENBUS_H */
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/io/xs_wire.h
@@ -0,0 +1,97 @@
+/*
+ * Details of the "wire" protocol between Xen Store Daemon and client
+ * library or guest kernel.
+ * Copyright (C) 2005 Rusty Russell IBM Corporation
+ */
+
+#ifndef _XS_WIRE_H
+#define _XS_WIRE_H
+
+enum xsd_sockmsg_type
+{
+    XS_DEBUG,
+    XS_DIRECTORY,
+    XS_READ,
+    XS_GET_PERMS,
+    XS_WATCH,
+    XS_UNWATCH,
+    XS_TRANSACTION_START,
+    XS_TRANSACTION_END,
+    XS_INTRODUCE,
+    XS_RELEASE,
+    XS_GET_DOMAIN_PATH,
+    XS_WRITE,
+    XS_MKDIR,
+    XS_RM,
+    XS_SET_PERMS,
+    XS_WATCH_EVENT,
+    XS_ERROR,
+    XS_IS_DOMAIN_INTRODUCED
+};
+
+#define XS_WRITE_NONE "NONE"
+#define XS_WRITE_CREATE "CREATE"
+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
+
+/* We hand errors as strings, for portability. */
+struct xsd_errors
+{
+    int errnum;
+    const char *errstring;
+};
+#define XSD_ERROR(x) { x, #x }
+static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
+    XSD_ERROR(EINVAL),
+    XSD_ERROR(EACCES),
+    XSD_ERROR(EEXIST),
+    XSD_ERROR(EISDIR),
+    XSD_ERROR(ENOENT),
+    XSD_ERROR(ENOMEM),
+    XSD_ERROR(ENOSPC),
+    XSD_ERROR(EIO),
+    XSD_ERROR(ENOTEMPTY),
+    XSD_ERROR(ENOSYS),
+    XSD_ERROR(EROFS),
+    XSD_ERROR(EBUSY),
+    XSD_ERROR(EAGAIN),
+    XSD_ERROR(EISCONN)
+};
+
+struct xsd_sockmsg
+{
+    uint32_t type;  /* XS_??? */
+    uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
+    uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
+    uint32_t len;   /* Length of data following this. */
+
+    /* Generally followed by nul-terminated string(s). */
+};
+
+enum xs_watch_type
+{
+    XS_WATCH_PATH = 0,
+    XS_WATCH_TOKEN
+};
+
+/* Inter-domain shared memory communications. */
+#define XENSTORE_RING_SIZE 1024
+typedef uint32_t XENSTORE_RING_IDX;
+#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1))
+struct xenstore_domain_interface {
+    char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */
+    char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */
+    XENSTORE_RING_IDX req_cons, req_prod;
+    XENSTORE_RING_IDX rsp_cons, rsp_prod;
+};
+
+#endif /* _XS_WIRE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/memory.h
@@ -0,0 +1,155 @@
+/******************************************************************************
+ * memory.h
+ * 
+ * Memory reservation and information.
+ * 
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_MEMORY_H__
+#define __XEN_PUBLIC_MEMORY_H__
+
+/*
+ * Increase or decrease the specified domain's memory reservation. Returns a
+ * -ve errcode on failure, or the # extents successfully allocated or freed.
+ * arg == addr of struct xen_memory_reservation.
+ */
+#define XENMEM_increase_reservation 0
+#define XENMEM_decrease_reservation 1
+#define XENMEM_populate_physmap     6
+struct xen_memory_reservation {
+
+    /*
+     * XENMEM_increase_reservation:
+     *   OUT: MFN (*not* GMFN) bases of extents that were allocated
+     * XENMEM_decrease_reservation:
+     *   IN:  GMFN bases of extents to free
+     * XENMEM_populate_physmap:
+     *   IN:  GPFN bases of extents to populate with memory
+     *   OUT: GMFN bases of extents that were allocated
+     *   (NB. This command also updates the mach_to_phys translation table)
+     */
+    GUEST_HANDLE(ulong) extent_start;
+
+    /* Number of extents, and size/alignment of each (2^extent_order pages). */
+    unsigned long  nr_extents;
+    unsigned int   extent_order;
+
+    /*
+     * Maximum # bits addressable by the user of the allocated region (e.g., 
+     * I/O devices often have a 32-bit limitation even in 64-bit systems). If 
+     * zero then the user has no addressing restriction.
+     * This field is not used by XENMEM_decrease_reservation.
+     */
+    unsigned int   address_bits;
+
+    /*
+     * Domain whose reservation is being changed.
+     * Unprivileged domains can specify only DOMID_SELF.
+     */
+    domid_t        domid;
+
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
+
+/*
+ * Returns the maximum machine frame number of mapped RAM in this system.
+ * This command always succeeds (it never returns an error code).
+ * arg == NULL.
+ */
+#define XENMEM_maximum_ram_page     2
+
+/*
+ * Returns the current or maximum memory reservation, in pages, of the
+ * specified domain (may be DOMID_SELF). Returns -ve errcode on failure.
+ * arg == addr of domid_t.
+ */
+#define XENMEM_current_reservation  3
+#define XENMEM_maximum_reservation  4
+
+/*
+ * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
+ * mapping table. Architectures which do not have a m2p table do not implement
+ * this command.
+ * arg == addr of struct xen_machphys_mfn_list.
+ */
+#define XENMEM_machphys_mfn_list    5
+struct xen_machphys_mfn_list {
+    /*
+     * Size of the 'extent_start' array. Fewer entries will be filled if the
+     * machphys table is smaller than max_extents * 2MB.
+     */
+    unsigned int max_extents;
+
+    /*
+     * Pointer to buffer to fill with list of extent starts. If there are
+     * any large discontiguities in the machine address space, 2MB gaps in
+     * the machphys table will be represented by an MFN base of zero.
+     */
+    GUEST_HANDLE(ulong) extent_start;
+
+    /*
+     * Number of extents written to the above array. This will be smaller than
+     * 'max_extents' if the machphys table is smaller than max_extents * 2MB.
+     */
+    unsigned int nr_extents;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
+
+/*
+ * Sets the GPFN at which a particular page appears in the specified guest's
+ * pseudophysical address space.
+ * arg == addr of struct xen_add_to_physmap.
+ */
+#define XENMEM_add_to_physmap      7
+struct xen_add_to_physmap {
+    /* Which domain to change the mapping for. */
+    domid_t domid;
+
+    /* Source mapping space. */
+#define XENMAPSPACE_shared_info 0 /* shared info page */
+#define XENMAPSPACE_grant_table 1 /* grant table page */
+    unsigned int space;
+
+    /* Index into source mapping space. */
+    unsigned long idx;
+
+    /* GPFN where the source mapping page should appear. */
+    unsigned long gpfn;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
+
+/*
+ * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
+ * code on failure. This call only works for auto-translated guests.
+ */
+#define XENMEM_translate_gpfn_list  8
+struct xen_translate_gpfn_list {
+    /* Which domain to translate for? */
+    domid_t domid;
+
+    /* Length of list. */
+    unsigned long nr_gpfns;
+
+    /* List of GPFNs to translate. */
+    GUEST_HANDLE(ulong) gpfn_list;
+
+    /*
+     * Output list to contain MFN translations. May be the same as the input
+     * list (in which case each input GPFN is overwritten with the output MFN).
+     */
+    GUEST_HANDLE(ulong) mfn_list;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+
+#endif /* __XEN_PUBLIC_MEMORY_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/physdev.h
@@ -0,0 +1,71 @@
+
+#ifndef __XEN_PUBLIC_PHYSDEV_H__
+#define __XEN_PUBLIC_PHYSDEV_H__
+
+/* Commands to HYPERVISOR_physdev_op() */
+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY     4
+#define PHYSDEVOP_IRQ_STATUS_QUERY      5
+#define PHYSDEVOP_SET_IOPL              6
+#define PHYSDEVOP_SET_IOBITMAP          7
+#define PHYSDEVOP_APIC_READ             8
+#define PHYSDEVOP_APIC_WRITE            9
+#define PHYSDEVOP_ASSIGN_VECTOR         10
+
+struct physdevop_irq_status_query {
+    /* IN */
+    uint32_t irq;
+    /* OUT */
+/* Need to call PHYSDEVOP_IRQ_UNMASK_NOTIFY when the IRQ has been serviced? */
+#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY (1<<0)
+    uint32_t flags;
+};
+
+struct physdevop_set_iopl {
+    /* IN */
+    uint32_t iopl;
+};
+
+struct physdevop_set_iobitmap {
+    /* IN */
+    uint8_t *bitmap;
+    uint32_t nr_ports;
+};
+
+struct physdevop_apic {
+    /* IN */
+    unsigned long apic_physbase;
+    uint32_t reg;
+    /* IN or OUT */
+    uint32_t value;
+};
+
+struct physdevop_irq {
+    /* IN */
+    uint32_t irq;
+    /* OUT */
+    uint32_t vector;
+};
+
+struct physdev_op {
+    uint32_t cmd;
+    union {
+        struct physdevop_irq_status_query      irq_status_query;
+        struct physdevop_set_iopl              set_iopl;
+        struct physdevop_set_iobitmap          set_iobitmap;
+        struct physdevop_apic                  apic_op;
+        struct physdevop_irq                   irq_op;
+    } u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(physdev_op);
+
+#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/sched.h
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * sched.h
+ * 
+ * Scheduler state interactions
+ * 
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_SCHED_H__
+#define __XEN_PUBLIC_SCHED_H__
+
+#include "event_channel.h"
+
+/*
+ * The prototype for this hypercall is:
+ *  long sched_op_new(int cmd, void *arg)
+ * @cmd == SCHEDOP_??? (scheduler operation).
+ * @arg == Operation-specific extra argument(s), as described below.
+ * 
+ * **NOTE**:
+ * Versions of Xen prior to 3.0.2 provide only the following legacy version
+ * of this hypercall, supporting only the commands yield, block and shutdown:
+ *  long sched_op(int cmd, unsigned long arg)
+ * @cmd == SCHEDOP_??? (scheduler operation).
+ * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
+ *      == SHUTDOWN_* code (SCHEDOP_shutdown)
+ */
+
+/*
+ * Voluntarily yield the CPU.
+ * @arg == NULL.
+ */
+#define SCHEDOP_yield       0
+
+/*
+ * Block execution of this VCPU until an event is received for processing.
+ * If called with event upcalls masked, this operation will atomically
+ * reenable event delivery and check for pending events before blocking the
+ * VCPU. This avoids a "wakeup waiting" race.
+ * @arg == NULL.
+ */
+#define SCHEDOP_block       1
+
+/*
+ * Halt execution of this domain (all VCPUs) and notify the system controller.
+ * @arg == pointer to sched_shutdown structure.
+ */
+#define SCHEDOP_shutdown    2
+struct sched_shutdown {
+    unsigned int reason; /* SHUTDOWN_* */
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
+
+/*
+ * Poll a set of event-channel ports. Return when one or more are pending. An
+ * optional timeout may be specified.
+ * @arg == pointer to sched_poll structure.
+ */
+#define SCHEDOP_poll        3
+struct sched_poll {
+    GUEST_HANDLE(evtchn_port_t) ports;
+    unsigned int nr_ports;
+    uint64_t timeout;
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+
+/*
+ * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
+ * software to determine the appropriate action. For the most part, Xen does
+ * not care about the shutdown code.
+ */
+#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
+#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
+#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
+#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
+
+#endif /* __XEN_PUBLIC_SCHED_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/vcpu.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * vcpu.h
+ * 
+ * VCPU initialisation, query, and hotplug.
+ * 
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_VCPU_H__
+#define __XEN_PUBLIC_VCPU_H__
+
+/*
+ * Prototype for this hypercall is:
+ *  int vcpu_op(int cmd, int vcpuid, void *extra_args)
+ * @cmd        == VCPUOP_??? (VCPU operation).
+ * @vcpuid     == VCPU to operate on.
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Initialise a VCPU. Each VCPU can be initialised only once. A 
+ * newly-initialised VCPU will not run until it is brought up by VCPUOP_up.
+ * 
+ * @extra_args == pointer to vcpu_guest_context structure containing initial
+ *                state for the VCPU.
+ */
+#define VCPUOP_initialise           0
+
+/*
+ * Bring up a VCPU. This makes the VCPU runnable. This operation will fail
+ * if the VCPU has not been initialised (VCPUOP_initialise).
+ */
+#define VCPUOP_up                   1
+
+/*
+ * Bring down a VCPU (i.e., make it non-runnable).
+ * There are a few caveats that callers should observe:
+ *  1. This operation may return, and VCPUOP_is_up may return false, before the
+ *     VCPU stops running (i.e., the command is asynchronous). It is a good
+ *     idea to ensure that the VCPU has entered a non-critical loop before
+ *     bringing it down. Alternatively, this operation is guaranteed
+ *     synchronous if invoked by the VCPU itself.
+ *  2. After a VCPU is initialised, there is currently no way to drop all its
+ *     references to domain memory. Even a VCPU that is down still holds
+ *     memory references via its pagetable base pointer and GDT. It is good
+ *     practice to move a VCPU onto an 'idle' or default page table, LDT and
+ *     GDT before bringing it down.
+ */
+#define VCPUOP_down                 2
+
+/* Returns 1 if the given VCPU is up. */
+#define VCPUOP_is_up                3
+
+/*
+ * Return information about the state and running time of a VCPU.
+ * @extra_args == pointer to vcpu_runstate_info structure.
+ */
+#define VCPUOP_get_runstate_info    4
+struct vcpu_runstate_info {
+    /* VCPU's current state (RUNSTATE_*). */
+    int      state;
+    /* When was current state entered (system time, ns)? */
+    uint64_t state_entry_time;
+    /*
+     * Time spent in each RUNSTATE_* (ns). The sum of these times is
+     * guaranteed not to drift from system time.
+     */
+    uint64_t time[4];
+};
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running  0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked  2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline  3
+
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ *  1. The registered address may be virtual or physical, depending on the
+ *     platform. The virtual address should be registered on x86 systems.
+ *  2. Only one shared area may be registered per VCPU. The shared area is
+ *     updated by the hypervisor each time the VCPU is scheduled. Thus
+ *     runstate.state will always be RUNSTATE_running and
+ *     runstate.state_entry_time will indicate the system time at which the
+ *     VCPU was last scheduled to run.
+ * @extra_args == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_memory_area 5
+struct vcpu_register_runstate_memory_area {
+    union {
+        struct vcpu_runstate_info *v;
+        uint64_t p;
+    } addr;
+};
+
+#endif /* __XEN_PUBLIC_VCPU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/version.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * version.h
+ * 
+ * Xen version, type, and compile information.
+ * 
+ * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_VERSION_H__
+#define __XEN_PUBLIC_VERSION_H__
+
+/* NB. All ops return zero on success, except XENVER_version. */
+
+/* arg == NULL; returns major:minor (16:16). */
+#define XENVER_version      0
+
+/* arg == xen_extraversion_t. */
+#define XENVER_extraversion 1
+struct xen_extraversion {
+    char extraversion[16];
+};
+#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
+
+/* arg == xen_compile_info_t. */
+#define XENVER_compile_info 2
+struct xen_compile_info {
+    char compiler[64];
+    char compile_by[16];
+    char compile_domain[32];
+    char compile_date[32];
+};
+
+#define XENVER_capabilities 3
+struct xen_capabilities_info {
+    char info[1024];
+};
+#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
+
+#define XENVER_changeset 4
+struct xen_changeset_info {
+    char info[64];
+};
+#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
+
+#define XENVER_platform_parameters 5
+struct xen_platform_parameters {
+    unsigned long virt_start;
+};
+
+#define XENVER_get_features 6
+struct xen_feature_info {
+    unsigned int submap_idx;    /* IN: which 32-bit submap to return */
+    uint32_t     submap;        /* OUT: 32-bit submap */
+};
+
+/* Declares the features reported by XENVER_get_features. */
+#include "features.h"
+
+#endif /* __XEN_PUBLIC_VERSION_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--- /dev/null
+++ linus-2.6/include/xen/interface/xen.h
@@ -0,0 +1,441 @@
+/******************************************************************************
+ * xen.h
+ * 
+ * Guest OS interface to Xen.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_XEN_H__
+#define __XEN_PUBLIC_XEN_H__
+
+#include "arch-x86_32.h"
+
+/*
+ * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
+ */
+
+/*
+ * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
+ *         EAX = return value
+ *         (argument registers may be clobbered on return)
+ * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
+ *         RAX = return value
+ *         (argument registers not clobbered on return; RCX, R11 are)
+ */
+#define __HYPERVISOR_set_trap_table        0
+#define __HYPERVISOR_mmu_update            1
+#define __HYPERVISOR_set_gdt               2
+#define __HYPERVISOR_stack_switch          3
+#define __HYPERVISOR_set_callbacks         4
+#define __HYPERVISOR_fpu_taskswitch        5
+#define __HYPERVISOR_sched_op              6
+#define __HYPERVISOR_dom0_op               7
+#define __HYPERVISOR_set_debugreg          8
+#define __HYPERVISOR_get_debugreg          9
+#define __HYPERVISOR_update_descriptor    10
+#define __HYPERVISOR_memory_op            12
+#define __HYPERVISOR_multicall            13
+#define __HYPERVISOR_update_va_mapping    14
+#define __HYPERVISOR_set_timer_op         15
+#define __HYPERVISOR_event_channel_op     16
+#define __HYPERVISOR_xen_version          17
+#define __HYPERVISOR_console_io           18
+#define __HYPERVISOR_physdev_op           19
+#define __HYPERVISOR_grant_table_op       20
+#define __HYPERVISOR_vm_assist            21
+#define __HYPERVISOR_update_va_mapping_otherdomain 22
+#define __HYPERVISOR_iret                 23 /* x86 only */
+#define __HYPERVISOR_vcpu_op              24
+#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
+#define __HYPERVISOR_mmuext_op            26
+#define __HYPERVISOR_acm_op               27
+#define __HYPERVISOR_nmi_op               28
+#define __HYPERVISOR_sched_op_new         29
+
+/* 
+ * VIRTUAL INTERRUPTS
+ * 
+ * Virtual interrupts that a guest OS may receive from Xen.
+ */
+#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
+#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
+#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
+#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
+#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
+#define NR_VIRQS        8
+
+/*
+ * MMU-UPDATE REQUESTS
+ * 
+ * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
+ * Where the FD has some effect, it is described below.
+ * ptr[1:0] specifies the appropriate MMU_* command.
+ * 
+ * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
+ * Updates an entry in a page table. If updating an L1 table, and the new
+ * table entry is valid/present, the mapped frame must belong to the FD, if
+ * an FD has been specified. If attempting to map an I/O page then the
+ * caller assumes the privilege of the FD.
+ * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
+ * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
+ * ptr[:2]  -- Machine address of the page-table entry to modify.
+ * val      -- Value to write.
+ * 
+ * ptr[1:0] == MMU_MACHPHYS_UPDATE:
+ * Updates an entry in the machine->pseudo-physical mapping table.
+ * ptr[:2]  -- Machine address within the frame whose mapping to modify.
+ *             The frame must belong to the FD, if one is specified.
+ * val      -- Value to write into the mapping entry.
+ */
+#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
+#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
+
+/*
+ * MMU EXTENDED OPERATIONS
+ * 
+ * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
+ * Where the FD has some effect, it is described below.
+ * 
+ * cmd: MMUEXT_(UN)PIN_*_TABLE
+ * mfn: Machine frame number to be (un)pinned as a p.t. page.
+ *      The frame must belong to the FD, if one is specified.
+ * 
+ * cmd: MMUEXT_NEW_BASEPTR
+ * mfn: Machine frame number of new page-table base to install in MMU.
+ * 
+ * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
+ * mfn: Machine frame number of new page-table base to install in MMU
+ *      when in user space.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_LOCAL
+ * No additional arguments. Flushes local TLB.
+ * 
+ * cmd: MMUEXT_INVLPG_LOCAL
+ * linear_addr: Linear address to be flushed from the local TLB.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_MULTI
+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
+ * 
+ * cmd: MMUEXT_INVLPG_MULTI
+ * linear_addr: Linear address to be flushed.
+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_ALL
+ * No additional arguments. Flushes all VCPUs' TLBs.
+ * 
+ * cmd: MMUEXT_INVLPG_ALL
+ * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
+ * 
+ * cmd: MMUEXT_FLUSH_CACHE
+ * No additional arguments. Writes back and flushes cache contents.
+ * 
+ * cmd: MMUEXT_SET_LDT
+ * linear_addr: Linear address of LDT base (NB. must be page-aligned).
+ * nr_ents: Number of entries in LDT.
+ */
+#define MMUEXT_PIN_L1_TABLE      0
+#define MMUEXT_PIN_L2_TABLE      1
+#define MMUEXT_PIN_L3_TABLE      2
+#define MMUEXT_PIN_L4_TABLE      3
+#define MMUEXT_UNPIN_TABLE       4
+#define MMUEXT_NEW_BASEPTR       5
+#define MMUEXT_TLB_FLUSH_LOCAL   6
+#define MMUEXT_INVLPG_LOCAL      7
+#define MMUEXT_TLB_FLUSH_MULTI   8
+#define MMUEXT_INVLPG_MULTI      9
+#define MMUEXT_TLB_FLUSH_ALL    10
+#define MMUEXT_INVLPG_ALL       11
+#define MMUEXT_FLUSH_CACHE      12
+#define MMUEXT_SET_LDT          13
+#define MMUEXT_NEW_USER_BASEPTR 15
+
+#ifndef __ASSEMBLY__
+struct mmuext_op {
+    unsigned int cmd;
+    union {
+        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+        unsigned long mfn;
+        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
+        unsigned long linear_addr;
+    } arg1;
+    union {
+        /* SET_LDT */
+        unsigned int nr_ents;
+        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
+        void *vcpumask;
+    } arg2;
+};
+DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
+#endif
+
+/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
+/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
+/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
+#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
+#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
+#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
+#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
+#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
+#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
+#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
+
+/*
+ * Commands to HYPERVISOR_console_io().
+ */
+#define CONSOLEIO_write         0
+#define CONSOLEIO_read          1
+
+/*
+ * Commands to HYPERVISOR_vm_assist().
+ */
+#define VMASST_CMD_enable                0
+#define VMASST_CMD_disable               1
+#define VMASST_TYPE_4gb_segments         0
+#define VMASST_TYPE_4gb_segments_notify  1
+#define VMASST_TYPE_writable_pagetables  2
+#define MAX_VMASST_TYPE 2
+
+#ifndef __ASSEMBLY__
+
+typedef uint16_t domid_t;
+
+/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
+#define DOMID_FIRST_RESERVED (0x7FF0U)
+
+/* DOMID_SELF is used in certain contexts to refer to oneself. */
+#define DOMID_SELF (0x7FF0U)
+
+/*
+ * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
+ * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
+ * is useful to ensure that no mappings to the OS's own heap are accidentally
+ * installed. (e.g., in Linux this could cause havoc as reference counts
+ * aren't adjusted on the I/O-mapping code path).
+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
+ * be specified by any calling domain.
+ */
+#define DOMID_IO   (0x7FF1U)
+
+/*
+ * DOMID_XEN is used to allow privileged domains to map restricted parts of
+ * Xen's heap space (e.g., the machine_to_phys table).
+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
+ * the caller is privileged.
+ */
+#define DOMID_XEN  (0x7FF2U)
+
+/*
+ * Send an array of these to HYPERVISOR_mmu_update().
+ * NB. The fields are natural pointer/address size for this architecture.
+ */
+struct mmu_update {
+    uint64_t ptr;       /* Machine address of PTE. */
+    uint64_t val;       /* New contents of PTE.    */
+};
+DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
+
+/*
+ * Send an array of these to HYPERVISOR_multicall().
+ * NB. The fields are natural register size for this architecture.
+ */
+struct multicall_entry {
+    unsigned long op, result;
+    unsigned long args[6];
+};
+DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
+
+/*
+ * Event channel endpoints per domain:
+ *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ */
+#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+
+struct vcpu_time_info {
+    /*
+     * Updates to the following values are preceded and followed by an
+     * increment of 'version'. The guest can therefore detect updates by
+     * looking for changes to 'version'. If the least-significant bit of
+     * the version number is set then an update is in progress and the guest
+     * must wait to read a consistent set of values.
+     * The correct way to interact with the version number is similar to
+     * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
+     */
+    uint32_t version;
+    uint32_t pad0;
+    uint64_t tsc_timestamp;   /* TSC at last update of time vals.  */
+    uint64_t system_time;     /* Time, in nanosecs, since boot.    */
+    /*
+     * Current system time:
+     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
+     * CPU frequency (Hz):
+     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+     */
+    uint32_t tsc_to_system_mul;
+    int8_t   tsc_shift;
+    int8_t   pad1[3];
+}; /* 32 bytes */
+
+struct vcpu_info {
+    /*
+     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
+     * a pending notification for a particular VCPU. It is then cleared 
+     * by the guest OS /before/ checking for pending work, thus avoiding
+     * a set-and-check race. Note that the mask is only accessed by Xen
+     * on the CPU that is currently hosting the VCPU. This means that the
+     * pending and mask flags can be updated by the guest without special
+     * synchronisation (i.e., no need for the x86 LOCK prefix).
+     * This may seem suboptimal because if the pending flag is set by
+     * a different CPU then an IPI may be scheduled even when the mask
+     * is set. However, note:
+     *  1. The task of 'interrupt holdoff' is covered by the per-event-
+     *     channel mask bits. A 'noisy' event that is continually being
+     *     triggered can be masked at source at this very precise
+     *     granularity.
+     *  2. The main purpose of the per-VCPU mask is therefore to restrict
+     *     reentrant execution: whether for concurrency control, or to
+     *     prevent unbounded stack usage. Whatever the purpose, we expect
+     *     that the mask will be asserted only for short periods at a time,
+     *     and so the likelihood of a 'spurious' IPI is suitably small.
+     * The mask is read before making an event upcall to the guest: a
+     * non-zero mask therefore guarantees that the VCPU will not receive
+     * an upcall activation. The mask is cleared when the VCPU requests
+     * to block: this avoids wakeup-waiting races.
+     */
+    uint8_t evtchn_upcall_pending;
+    uint8_t evtchn_upcall_mask;
+    unsigned long evtchn_pending_sel;
+    struct arch_vcpu_info arch;
+    struct vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+/*
+ * Xen/kernel shared data -- pointer provided in start_info.
+ * NB. We expect that this struct is smaller than a page.
+ */
+struct shared_info {
+    struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
+
+    /*
+     * A domain can create "event channels" on which it can send and receive
+     * asynchronous event notifications. There are three classes of event that
+     * are delivered by this mechanism:
+     *  1. Bi-directional inter- and intra-domain connections. Domains must
+     *     arrange out-of-band to set up a connection (usually by allocating
+     *     an unbound 'listener' port and advertising that via a storage service
+     *     such as xenstore).
+     *  2. Physical interrupts. A domain with suitable hardware-access
+     *     privileges can bind an event-channel port to a physical interrupt
+     *     source.
+     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
+     *     port to a virtual interrupt source, such as the virtual-timer
+     *     device or the emergency console.
+     * 
+     * Event channels are addressed by a "port index". Each channel is
+     * associated with two bits of information:
+     *  1. PENDING -- notifies the domain that there is a pending notification
+     *     to be processed. This bit is cleared by the guest.
+     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
+     *     will cause an asynchronous upcall to be scheduled. This bit is only
+     *     updated by the guest. It is read-only within Xen. If a channel
+     *     becomes pending while the channel is masked then the 'edge' is lost
+     *     (i.e., when the channel is unmasked, the guest must manually handle
+     *     pending notifications as no upcall will be scheduled by Xen).
+     * 
+     * To expedite scanning of pending notifications, any 0->1 pending
+     * transition on an unmasked channel causes a corresponding bit in a
+     * per-vcpu selector word to be set. Each bit in the selector covers a
+     * 'C long' in the PENDING bitfield array.
+     */
+    unsigned long evtchn_pending[sizeof(unsigned long) * 8];
+    unsigned long evtchn_mask[sizeof(unsigned long) * 8];
+
+    /*
+     * Wallclock time: updated only by control software. Guests should base
+     * their gettimeofday() syscall on this wallclock-base value.
+     */
+    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
+    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
+    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+
+    struct arch_shared_info arch;
+
+};
+
+/*
+ * Start-of-day memory layout for the initial domain (DOM0):
+ *  1. The domain is started within a contiguous virtual-memory region.
+ *  2. The contiguous region begins and ends on an aligned 4MB boundary.
+ *  3. The region start corresponds to the load address of the OS image.
+ *     If the load address is not 4MB aligned then the address is rounded down.
+ *  4. This is the order of bootstrap elements in the initial virtual region:
+ *      a. relocated kernel image
+ *      b. initial ram disk              [mod_start, mod_len]
+ *      c. list of allocated page frames [mfn_list, nr_pages]
+ *      d. start_info_t structure        [register ESI (x86)]
+ *      e. bootstrap page tables         [pt_base, CR3 (x86)]
+ *      f. bootstrap stack               [register ESP (x86)]
+ *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
+ *  6. The initial ram disk may be omitted.
+ *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *     layout for the domain. In particular, the bootstrap virtual-memory
+ *     region is a 1:1 mapping to the first section of the pseudo-physical map.
+ *  8. All bootstrap elements are mapped read-writable for the guest OS. The
+ *     only exception is the bootstrap page table, which is mapped read-only.
+ *  9. There is guaranteed to be at least 512kB padding after the final
+ *     bootstrap element. If necessary, the bootstrap virtual region is
+ *     extended by an extra 4MB to ensure this.
+ */
+
+#define MAX_GUEST_CMDLINE 1024
+struct start_info {
+    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
+    char magic[32];             /* "xen-<version>-<platform>".            */
+    unsigned long nr_pages;     /* Total pages allocated to this domain.  */
+    unsigned long shared_info;  /* MACHINE address of shared info struct. */
+    uint32_t flags;             /* SIF_xxx flags.                         */
+    unsigned long store_mfn;    /* MACHINE page number of shared page.    */
+    uint32_t store_evtchn;      /* Event channel for store communication. */
+    unsigned long console_mfn;  /* MACHINE page number of console page.   */
+    uint32_t console_evtchn;    /* Event channel for console messages.    */
+    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
+    unsigned long pt_base;      /* VIRTUAL address of page directory.     */
+    unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames.       */
+    unsigned long mfn_list;     /* VIRTUAL address of page-frame list.    */
+    unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
+    unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
+    int8_t cmd_line[MAX_GUEST_CMDLINE];
+};
+
+/* These flags are passed in the 'flags' field of start_info_t. */
+#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+
+typedef uint64_t cpumap_t;
+
+typedef uint8_t xen_domain_handle_t[16];
+
+/* Turn a plain number into a C unsigned long constant. */
+#define __mk_unsigned_long(x) x ## UL
+#define mk_unsigned_long(x) __mk_unsigned_long(x)
+
+#else /* __ASSEMBLY__ */
+
+/* In assembly code we cannot use C numeric constant suffixes. */
+#define mk_unsigned_long(x) x
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_XEN_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 04/35] Hypervisor interface header files.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (2 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 22:43   ` Ingo Oeser
  2006-05-09  7:00 ` [RFC PATCH 05/35] Add sync bitops Chris Wright
                   ` (31 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-hypervisor-interface --]
[-- Type: text/plain, Size: 11170 bytes --]

Define macros and inline functions for doing hypercalls into the
hypervisor.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/hypercall.h  |  309 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/hypervisor.h |   70 +++++++++
 2 files changed, 379 insertions(+)

--- /dev/null
+++ linus-2.6/include/asm-i386/hypercall.h
@@ -0,0 +1,309 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+
+#include <linux/config.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+#define _hypercall0(type, name)			\
+({						\
+	long __res;				\
+	asm volatile (				\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res)			\
+		:				\
+		: "memory" );			\
+	(type)__res;				\
+})
+
+#define _hypercall1(type, name, a1)				\
+({								\
+	long __res, __ign1;					\
+	asm volatile (						\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res), "=b" (__ign1)			\
+		: "1" ((long)(a1))				\
+		: "memory" );					\
+	(type)__res;						\
+})
+
+#define _hypercall2(type, name, a1, a2)				\
+({								\
+	long __res, __ign1, __ign2;				\
+	asm volatile (						\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2)	\
+		: "1" ((long)(a1)), "2" ((long)(a2))		\
+		: "memory" );					\
+	(type)__res;						\
+})
+
+#define _hypercall3(type, name, a1, a2, a3)			\
+({								\
+	long __res, __ign1, __ign2, __ign3;			\
+	asm volatile (						\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2), 	\
+		"=d" (__ign3)					\
+		: "1" ((long)(a1)), "2" ((long)(a2)),		\
+		"3" ((long)(a3))				\
+		: "memory" );					\
+	(type)__res;						\
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4)			\
+({								\
+	long __res, __ign1, __ign2, __ign3, __ign4;		\
+	asm volatile (						\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2),	\
+		"=d" (__ign3), "=S" (__ign4)			\
+		: "1" ((long)(a1)), "2" ((long)(a2)),		\
+		"3" ((long)(a3)), "4" ((long)(a4))		\
+		: "memory" );					\
+	(type)__res;						\
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5)		\
+({								\
+	long __res, __ign1, __ign2, __ign3, __ign4, __ign5;	\
+	asm volatile (						\
+		"call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)"\
+		: "=a" (__res), "=b" (__ign1), "=c" (__ign2),	\
+		"=d" (__ign3), "=S" (__ign4), "=D" (__ign5)	\
+		: "1" ((long)(a1)), "2" ((long)(a2)),		\
+		"3" ((long)(a3)), "4" ((long)(a4)),		\
+		"5" ((long)(a5))				\
+		: "memory" );					\
+	(type)__res;						\
+})
+
+static inline int
+HYPERVISOR_set_trap_table(
+	struct trap_info *table)
+{
+	return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int
+HYPERVISOR_mmu_update(
+	struct mmu_update *req, int count, int *success_count, domid_t domid)
+{
+	return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_mmuext_op(
+	struct mmuext_op *op, int count, int *success_count, domid_t domid)
+{
+	return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_set_gdt(
+	unsigned long *frame_list, int entries)
+{
+	return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int
+HYPERVISOR_stack_switch(
+	unsigned long ss, unsigned long esp)
+{
+	return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int
+HYPERVISOR_set_callbacks(
+	unsigned long event_selector, unsigned long event_address,
+	unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+	return _hypercall4(int, set_callbacks,
+			   event_selector, event_address,
+			   failsafe_selector, failsafe_address);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+	int set)
+{
+	return _hypercall1(int, fpu_taskswitch, set);
+}
+
+static inline int
+HYPERVISOR_sched_op(
+	int cmd, unsigned long arg)
+{
+	return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long
+HYPERVISOR_set_timer_op(
+	u64 timeout)
+{
+	unsigned long timeout_hi = (unsigned long)(timeout>>32);
+	unsigned long timeout_lo = (unsigned long)timeout;
+	return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
+}
+
+static inline int
+HYPERVISOR_set_debugreg(
+	int reg, unsigned long value)
+{
+	return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long
+HYPERVISOR_get_debugreg(
+	int reg)
+{
+	return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(
+	u64 ma, u64 desc)
+{
+	return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+}
+
+static inline int
+HYPERVISOR_memory_op(
+	unsigned int cmd, void *arg)
+{
+	return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_multicall(
+	void *call_list, int nr_calls)
+{
+	return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(
+	unsigned long va, pte_t new_val, unsigned long flags)
+{
+	unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+	pte_hi = new_val.pte_high;
+#endif
+	return _hypercall4(int, update_va_mapping, va,
+			   new_val.pte_low, pte_hi, flags);
+}
+
+static inline int
+HYPERVISOR_event_channel_op(
+	void *op)
+{
+	return _hypercall1(int, event_channel_op, op);
+}
+
+static inline int
+HYPERVISOR_xen_version(
+	int cmd, void *arg)
+{
+	return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_console_io(
+	int cmd, int count, char *str)
+{
+	return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int
+HYPERVISOR_physdev_op(
+	void *physdev_op)
+{
+	return _hypercall1(int, physdev_op, physdev_op);
+}
+
+static inline int
+HYPERVISOR_grant_table_op(
+	unsigned int cmd, void *uop, unsigned int count)
+{
+	return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping_otherdomain(
+	unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+	unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+	pte_hi = new_val.pte_high;
+#endif
+	return _hypercall5(int, update_va_mapping_otherdomain, va,
+			   new_val.pte_low, pte_hi, flags, domid);
+}
+
+static inline int
+HYPERVISOR_vm_assist(
+	unsigned int cmd, unsigned int type)
+{
+	return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int
+HYPERVISOR_vcpu_op(
+	int cmd, int vcpuid, void *extra_args)
+{
+	return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int
+HYPERVISOR_suspend(
+	unsigned long srec)
+{
+	return _hypercall3(int, sched_op, SCHEDOP_shutdown,
+			   SHUTDOWN_suspend, srec);
+}
+
+static inline int
+HYPERVISOR_nmi_op(
+	unsigned long op,
+	unsigned long arg)
+{
+	return _hypercall2(int, nmi_op, op, arg);
+}
+
+#endif /* __HYPERCALL_H__ */
--- /dev/null
+++ linus-2.6/include/asm-i386/hypervisor.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/features.h>
+
+#include <asm/ptrace.h>
+#include <asm/page.h>
+#if defined(__i386__)
+#  ifdef CONFIG_X86_PAE
+#   include <asm-generic/pgtable-nopud.h>
+#  else
+#   include <asm-generic/pgtable-nopmd.h>
+#  endif
+#endif
+
+/* arch/i386/kernel/setup.c */
+extern struct shared_info *HYPERVISOR_shared_info;
+extern struct start_info *xen_start_info;
+
+/* arch/i386/mach-xen/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+extern void force_evtchn_callback(void);
+
+/* Turn jiffies into Xen system time. */
+u64 jiffies_to_st(unsigned long jiffies);
+
+#include <asm/hypercall.h>
+
+#define xen_init()	(0)
+
+#endif /* __HYPERVISOR_H__ */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 05/35] Add sync bitops
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (3 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 04/35] Hypervisor " Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 22:56   ` Christoph Lameter
  2006-05-09  7:00 ` [RFC PATCH 06/35] Add vmlinuz build target Chris Wright
                   ` (30 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach,
	Christoph Lameter

[-- Attachment #1: synch-ops --]
[-- Type: text/plain, Size: 6794 bytes --]

Add "always lock'd" implementations of set_bit, clear_bit and
change_bit and the corresponding test_and_ functions.  Also add
"always lock'd" implementation of cmpxchg.  These give guaranteed
strong synchronisation and are required for non-SMP kernels running on
an SMP hypervisor.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Christoph Lameter <clameter@sgi.com>
---
 include/asm-i386/synch_bitops.h |  166 ++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/system.h       |   33 +++++++
 2 files changed, 199 insertions(+)

--- linus-2.6.orig/include/asm-i386/system.h
+++ linus-2.6/include/asm-i386/system.h
@@ -263,6 +263,9 @@ static inline unsigned long __xchg(unsig
 #define cmpxchg(ptr,o,n)\
 	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
 					(unsigned long)(n),sizeof(*(ptr))))
+#define synch_cmpxchg(ptr,o,n)\
+	((__typeof__(*(ptr)))__synch_cmpxchg((ptr),(unsigned long)(o),\
+					(unsigned long)(n),sizeof(*(ptr))))
 #endif
 
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -292,6 +295,36 @@ static inline unsigned long __cmpxchg(vo
 	return old;
 }
 
+#define __LOCK_PREFIX "lock ; "
+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
+					    unsigned long old,
+					    unsigned long new, int size)
+{
+	unsigned long prev;
+	switch (size) {
+	case 1:
+		__asm__ __volatile__(__LOCK_PREFIX "cmpxchgb %b1,%2"
+				     : "=a"(prev)
+				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 2:
+		__asm__ __volatile__(__LOCK_PREFIX "cmpxchgw %w1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	case 4:
+		__asm__ __volatile__(__LOCK_PREFIX "cmpxchgl %1,%2"
+				     : "=a"(prev)
+				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+				     : "memory");
+		return prev;
+	}
+	return old;
+}
+#undef __LOCK_PREFIX
+
 #ifndef CONFIG_X86_CMPXCHG
 /*
  * Building a kernel capable running on 80386. It may be necessary to
--- /dev/null
+++ linus-2.6/include/asm-i386/synch_bitops.h
@@ -0,0 +1,166 @@
+#ifndef _I386_SYNCH_BITOPS_H
+#define _I386_SYNCH_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+/* make sure these are always locked */
+#define __LOCK_PREFIX "lock ; "
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define ADDR (*(volatile long *) addr)
+
+/**
+ * synch_set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered.  See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non-x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void synch_set_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btsl %1,%0"
+		:"+m" (ADDR)
+		:"Ir" (nr)
+		: "memory");
+}
+
+/**
+ * synch_clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * synch_clear_bit() is atomic and may not be reordered.  However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void synch_clear_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btrl %1,%0"
+		:"+m" (ADDR)
+		:"Ir" (nr)
+		: "memory");
+}
+
+/**
+ * synch_change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * synch_change_bit() is atomic and may not be reordered. It may be
+ * reordered on other architectures than x86.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void synch_change_bit(int nr, volatile unsigned long * addr)
+{
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btcl %1,%0"
+		:"+m" (ADDR)
+		:"Ir" (nr)
+		: "memory");
+}
+
+/**
+ * synch_test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.  
+ * It may be reordered on other architectures than x86.
+ * It also implies a memory barrier.
+ */
+static inline int synch_test_and_set_bit(int nr, volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btsl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),"+m" (ADDR)
+		:"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/**
+ * synch_test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It can be reordered on architectures other than x86.
+ * It also implies a memory barrier.
+ */
+static inline int synch_test_and_clear_bit(int nr, volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btrl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),"+m" (ADDR)
+		:"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+/**
+ * synch_test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.  
+ * It also implies a memory barrier.
+ */
+static inline int synch_test_and_change_bit(int nr, volatile unsigned long* addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( __LOCK_PREFIX
+		"btcl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),"+m" (ADDR)
+		:"Ir" (nr) : "memory");
+	return oldbit;
+}
+
+static __always_inline int synch_const_test_bit(int nr, const volatile unsigned long *addr)
+{
+	return ((1UL << (nr & 31)) &
+		(((const volatile unsigned int *)addr)[nr >> 5])) != 0;
+}
+
+static inline int synch_var_test_bit(int nr, const volatile unsigned long * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__(
+		"btl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit)
+		:"m" (ADDR),"Ir" (nr));
+	return oldbit;
+}
+/* Test bit @nr at @addr: C helper for constant @nr, "btl" asm otherwise. */
+#define synch_test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ synch_const_test_bit((nr),(addr)) : \
+ synch_var_test_bit((nr),(addr)))
+
+#undef ADDR
+
+#endif /* _I386_SYNCH_BITOPS_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 06/35] Add vmlinuz build target.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (4 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 05/35] Add sync bitops Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
                   ` (29 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: boot-xen --]
[-- Type: text/plain, Size: 2033 bytes --]

The vmlinuz image is a stripped and compressed kernel image.  It is
smaller than the vmlinux image, and the Xen domain builder supports
loading compressed images directly.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/Makefile      |    5 +++--
 arch/i386/boot/Makefile |   10 +++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

--- linus-2.6.orig/arch/i386/Makefile
+++ linus-2.6/arch/i386/Makefile
@@ -108,15 +108,16 @@ CPPFLAGS += $(mflags-y)
 boot := arch/i386/boot
 
 PHONY += zImage bzImage compressed zlilo bzlilo \
-         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install vmlinuz
 
 all: bzImage
 
 # KBUILD_IMAGE specify target image being built
                     KBUILD_IMAGE := $(boot)/bzImage
 zImage zlilo zdisk: KBUILD_IMAGE := arch/i386/boot/zImage
+vmlinuz:            KBUILD_IMAGE := $(boot)/vmlinuz
 
-zImage bzImage: vmlinux
+zImage bzImage vmlinuz: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
 
 compressed: zImage
--- linus-2.6.orig/arch/i386/boot/Makefile
+++ linus-2.6/arch/i386/boot/Makefile
@@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
 #RAMDISK := -DRAMDISK=512
 
 targets		:= vmlinux.bin bootsect bootsect.o \
-		   setup setup.o zImage bzImage
+		   setup setup.o zImage bzImage vmlinuz vmlinux-stripped
 subdir- 	:= compressed
 
 hostprogs-y	:= tools/build
@@ -128,5 +128,13 @@ zlilo: $(BOOTIMAGE)
 	cp System.map $(INSTALL_PATH)/
 	if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
 
+$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE
+	$(call if_changed,gzip)
+	@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded
+$(obj)/vmlinux-stripped: vmlinux FORCE
+	$(call if_changed,objcopy)
+
 install:
 	sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (5 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 06/35] Add vmlinuz build target Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-10 23:28   ` Zachary Amsden
  2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
                   ` (28 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-load-offset --]
[-- Type: text/plain, Size: 1427 bytes --]

Change LOAD_OFFSET so that the kernel has virtual addresses in the elf header fields.

Unlike bare metal kernels, Xen kernels start with virtual address
management turned on and thus the addresses to load to should be
virtual addresses.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/vmlinux.lds.S                   |    2 +-
 include/asm-i386/mach-default/mach_vmlinux.lds.h |    6 ++++++
 include/asm-i386/mach-xen/mach_vmlinux.lds.h     |    6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

--- linus-2.6.orig/arch/i386/kernel/vmlinux.lds.S
+++ linus-2.6/arch/i386/kernel/vmlinux.lds.S
@@ -2,7 +2,7 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
  */
 
-#define LOAD_OFFSET __PAGE_OFFSET
+#include "mach_vmlinux.lds.h"
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/thread_info.h>
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_vmlinux.lds.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_MACH_VMLINUX_LDS_H
+#define __ASM_MACH_VMLINUX_LDS_H
+
+#define LOAD_OFFSET __PAGE_OFFSET
+
+#endif /* __ASM_MACH_VMLINUX_LDS_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_vmlinux.lds.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_MACH_VMLINUX_LDS_H
+#define __ASM_MACH_VMLINUX_LDS_H
+
+#define LOAD_OFFSET 0
+
+#endif /* __ASM_MACH_VMLINUX_LDS_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (6 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
                     ` (2 more replies)
  2006-05-09  7:00 ` [RFC PATCH 09/35] Change __FIXADDR_TOP to leave room for the hypervisor Chris Wright
                   ` (27 subsequent siblings)
  35 siblings, 3 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-xen-mm --]
[-- Type: text/plain, Size: 4568 bytes --]

Add extra memory management definitions used by Xen-specific
code. These allow conversion between 'pseudophysical' memory
addresses, which provide the illusion of a physically contiguous
memory map, and underlying real machine addresses. This conversion is
necessary when interpreting and updating PTEs. Also support write
protection of page mappings, which is needed to allow successful
validation of page tables.

The current definitions are incomplete and provide only a stub
implementation, allowing us to re-use existing code (drivers) that
references these memory management definitions.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/hypervisor.h             |    3 
 include/asm-i386/mach-default/mach_page.h |    4 +
 include/asm-i386/mach-xen/mach_page.h     |   99 ++++++++++++++++++++++++++++++
 include/asm-i386/page.h                   |    2 
 4 files changed, 108 insertions(+)

--- linus-2.6.orig/include/asm-i386/hypervisor.h
+++ linus-2.6/include/asm-i386/hypervisor.h
@@ -67,4 +67,7 @@ u64 jiffies_to_st(unsigned long jiffies)
 
 #define xen_init()	(0)
 
+#include <xen/interface/version.h>
+#include <xen/features.h>
+
 #endif /* __HYPERVISOR_H__ */
--- linus-2.6.orig/include/asm-i386/page.h
+++ linus-2.6/include/asm-i386/page.h
@@ -82,6 +82,8 @@ typedef struct { unsigned long pgprot; }
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
+#include <mach_page.h>
+
 /*
  * This handles the memory map.. We could make this a config
  * option, but too many people screw it up, and too few need
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_page.h
@@ -0,0 +1,4 @@
+#ifndef __ASM_MACH_PAGE_H
+#define __ASM_MACH_PAGE_H
+
+#endif /* __ASM_MACH_PAGE_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_page.h
@@ -0,0 +1,99 @@
+#ifndef __ASM_MACH_PAGE_H
+#define __ASM_MACH_PAGE_H
+
+#ifndef __ASSEMBLY__
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY	(~0UL)
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+#ifndef CONFIG_XEN_SHADOW_MODE
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return pfn;
+	return phys_to_machine_mapping[(unsigned int)(pfn)] &
+		~FOREIGN_FRAME_BIT;
+#else
+	return pfn;
+#endif
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+#ifndef CONFIG_XEN_SHADOW_MODE
+	unsigned long pfn;
+
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return mfn;
+
+	/*
+	 * The array access can fail (e.g., device space beyond end of RAM).
+	 * In such cases it doesn't matter what we return (we return garbage),
+	 * but we must handle the fault without crashing!
+	 */
+	asm (
+		"1:	movl %1,%0\n"
+		"2:\n"
+		".section __ex_table,\"a\"\n"
+		"	.align 4\n"
+		"	.long 1b,2b\n"
+		".previous"
+		: "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
+
+	return pfn;
+#else
+	return mfn;
+#endif
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)	(__pa(v))
+#define virt_to_mfn(v)		(pfn_to_mfn(__pa(v) >> PAGE_SHIFT))
+#define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+/* Definitions for machine and pseudophysical addresses. */
+#ifdef CONFIG_X86_PAE
+typedef unsigned long long paddr_t;
+typedef unsigned long long maddr_t;
+#else
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+#endif
+
+#ifndef CONFIG_X86_PAE
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
+#else
+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
+                       (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+#endif
+
+#define virt_to_ptep(__va)						\
+({									\
+	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
+	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
+	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
+	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
+})
+
+#define arbitrary_virt_to_machine(__va)					\
+({									\
+	maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
+	m | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
+})
+
+#define make_lowmem_page_readonly(va, feature) do {		\
+	pte_t *__pte;						\
+	int __rc;						\
+	/* Nothing to do when the feature is set. */		\
+	if (xen_feature(feature))				\
+		break;	/* leaves the macro, not the caller */	\
+								\
+	__pte = virt_to_ptep(va);				\
+	__rc = HYPERVISOR_update_va_mapping(			\
+		(unsigned long)(va), pte_wrprotect(*__pte), 0);	\
+	BUG_ON(__rc);						\
+} while (0)
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_MACH_PAGE_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 09/35] Change __FIXADDR_TOP to leave room for the hypervisor.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (7 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 10/35] Add a new head.S start-of-day file for booting on Xen Chris Wright
                   ` (26 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-fixmap --]
[-- Type: text/plain, Size: 1720 bytes --]

Move the definition of __FIXADDR_TOP into a subarch include file so
that it can be overridden for subarch xen -- the hypervisor needs
about 64MB at the top of the address space.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/fixmap.h                   |    8 +-------
 include/asm-i386/mach-default/mach_fixmap.h |   11 +++++++++++
 include/asm-i386/mach-xen/mach_fixmap.h     |   11 +++++++++++
 3 files changed, 23 insertions(+), 7 deletions(-)

--- linus-2.6.orig/include/asm-i386/fixmap.h
+++ linus-2.6/include/asm-i386/fixmap.h
@@ -14,13 +14,7 @@
 #define _ASM_FIXMAP_H
 
 #include <linux/config.h>
-
-/* used by vmalloc.c, vsyscall.lds.S.
- *
- * Leave one empty page between vmalloc'ed areas and
- * the start of the fixmap.
- */
-#define __FIXADDR_TOP	0xfffff000
+#include <mach_fixmap.h>
 
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_fixmap.h
@@ -0,0 +1,11 @@
+#ifndef __ASM_MACH_FIXMAP_H
+#define __ASM_MACH_FIXMAP_H
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+#define __FIXADDR_TOP	0xfffff000
+
+#endif /* __ASM_MACH_FIXMAP_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_fixmap.h
@@ -0,0 +1,11 @@
+#ifndef __ASM_MACH_FIXMAP_H
+#define __ASM_MACH_FIXMAP_H
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+#define __FIXADDR_TOP	(HYPERVISOR_VIRT_START - 2 * PAGE_SIZE)
+
+#endif /* __ASM_MACH_FIXMAP_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 10/35] Add a new head.S start-of-day file for booting on Xen.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (8 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 09/35] Change __FIXADDR_TOP to leave room for the hypervisor Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 11/35] Add support for Xen to entry.S Chris Wright
                   ` (25 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-head.S --]
[-- Type: text/plain, Size: 11471 bytes --]

When running on Xen, the kernel is started with paging enabled.  Also
don't check for cpu features which are present on all cpus supported
by Xen.

Don't define segments which are not supported when running on Xen.

Define the __xen_guest section which exports information about the
kernel to the domain builder.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/Makefile          |    7 +-
 arch/i386/kernel/head-cpu.S |   88 +++++++++++++++++++++++++++++++
 arch/i386/kernel/head.S     |   69 +-----------------------
 arch/i386/mach-xen/Makefile |    2 
 arch/i386/mach-xen/head.S   |  122 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 222 insertions(+), 66 deletions(-)

--- linus-2.6.orig/arch/i386/Makefile
+++ linus-2.6/arch/i386/Makefile
@@ -48,6 +48,9 @@ CFLAGS				+= $(shell if [ $(call cc-vers
 
 CFLAGS += $(cflags-y)
 
+# Default subarch head files
+head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
+
 # Default subarch .c files
 mcore-y  := mach-default
 
@@ -74,6 +77,8 @@ mcore-$(CONFIG_X86_SUMMIT)  := mach-defa
 # Xen subarch support
 mflags-$(CONFIG_X86_XEN)	:= -Iinclude/asm-i386/mach-xen
 mcore-$(CONFIG_X86_XEN)		:= mach-xen
+head-$(CONFIG_X86_XEN)		:= arch/i386/mach-xen/head.o \
+				   arch/i386/kernel/init_task.o
 
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
@@ -88,8 +93,6 @@ core-$(CONFIG_X86_ES7000)	:= arch/i386/m
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
-head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
-
 libs-y 					+= arch/i386/lib/
 core-y					+= arch/i386/kernel/ \
 					   arch/i386/mm/ \
--- linus-2.6.orig/arch/i386/kernel/head.S
+++ linus-2.6/arch/i386/kernel/head.S
@@ -20,6 +20,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/setup.h>
 
+#include "head-cpu.S"
+
 /*
  * References to members of the new_cpu_data structure.
  */
@@ -270,28 +272,12 @@ checkCPUtype:
 	testl $0x200000,%eax	# check if ID bit changed
 	je is486
 
-	/* get vendor info */
-	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
-	cpuid
-	movl %eax,X86_CPUID		# save CPUID level
-	movl %ebx,X86_VENDOR_ID		# lo 4 chars
-	movl %edx,X86_VENDOR_ID+4	# next 4 chars
-	movl %ecx,X86_VENDOR_ID+8	# last 4 chars
+	CPUID_GET_VENDOR_INFO X86_CPUID, X86_VENDOR_ID
 
 	orl %eax,%eax			# do we have processor info as well?
 	je is486
 
-	movl $1,%eax		# Use the CPUID instruction to get CPU type
-	cpuid
-	movb %al,%cl		# save reg for future use
-	andb $0x0f,%ah		# mask processor family
-	movb %ah,X86
-	andb $0xf0,%al		# mask model
-	shrb $4,%al
-	movb %al,X86_MODEL
-	andb $0x0f,%cl		# mask mask revision
-	movb %cl,X86_MASK
-	movl %edx,X86_CAPABILITY
+	CPUID_GET_CPU_TYPE X86, X86_MODEL, X86_MASK, X86_CAPABILITY
 
 is486:	movl $0x50022,%ecx	# set AM, WP, NE and MP
 	jmp 2f
@@ -484,50 +470,5 @@ ENTRY(boot_gdt_table)
  */
 	.align L1_CACHE_BYTES
 ENTRY(cpu_gdt_table)
-	.quad 0x0000000000000000	/* NULL descriptor */
-	.quad 0x0000000000000000	/* 0x0b reserved */
-	.quad 0x0000000000000000	/* 0x13 reserved */
-	.quad 0x0000000000000000	/* 0x1b reserved */
-	.quad 0x0000000000000000	/* 0x20 unused */
-	.quad 0x0000000000000000	/* 0x28 unused */
-	.quad 0x0000000000000000	/* 0x33 TLS entry 1 */
-	.quad 0x0000000000000000	/* 0x3b TLS entry 2 */
-	.quad 0x0000000000000000	/* 0x43 TLS entry 3 */
-	.quad 0x0000000000000000	/* 0x4b reserved */
-	.quad 0x0000000000000000	/* 0x53 reserved */
-	.quad 0x0000000000000000	/* 0x5b reserved */
-
-	.quad 0x00cf9a000000ffff	/* 0x60 kernel 4GB code at 0x00000000 */
-	.quad 0x00cf92000000ffff	/* 0x68 kernel 4GB data at 0x00000000 */
-	.quad 0x00cffa000000ffff	/* 0x73 user 4GB code at 0x00000000 */
-	.quad 0x00cff2000000ffff	/* 0x7b user 4GB data at 0x00000000 */
-
-	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
-	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
-
-	/*
-	 * Segments used for calling PnP BIOS have byte granularity.
-	 * They code segments and data segments have fixed 64k limits,
-	 * the transfer segment sizes are set at run time.
-	 */
-	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
-	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
-	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
-	.quad 0x0000920000000000	/* 0xa8 16-bit data */
-	.quad 0x0000920000000000	/* 0xb0 16-bit data */
-
-	/*
-	 * The APM segments have byte granularity and their bases
-	 * are set at run time.  All have 64k limits.
-	 */
-	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
-	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
-
-	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
-	.quad 0x0000000000000000	/* 0xd8 - unused */
-	.quad 0x0000000000000000	/* 0xe0 - unused */
-	.quad 0x0000000000000000	/* 0xe8 - unused */
-	.quad 0x0000000000000000	/* 0xf0 - unused */
-	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
+	CPU_GDT_TABLE
 
--- linus-2.6.orig/arch/i386/mach-xen/Makefile
+++ linus-2.6/arch/i386/mach-xen/Makefile
@@ -2,6 +2,8 @@
 # Makefile for the linux kernel.
 #
 
+extra-y				:= head.o
+
 obj-y				:= setup.o
 
 setup-y				:= ../mach-default/setup.o
--- /dev/null
+++ linus-2.6/arch/i386/kernel/head-cpu.S
@@ -0,0 +1,88 @@
+/* Some macros for head.S */
+
+#include <mach_processor.h>
+
+.macro CPU_GDT_TABLE
+	.quad 0x0000000000000000	/* NULL descriptor */
+	.quad 0x0000000000000000	/* 0x0b reserved */
+	.quad 0x0000000000000000	/* 0x13 reserved */
+	.quad 0x0000000000000000	/* 0x1b reserved */
+	.quad 0x0000000000000000	/* 0x20 unused */
+	.quad 0x0000000000000000	/* 0x28 unused */
+	.quad 0x0000000000000000	/* 0x33 TLS entry 1 */
+	.quad 0x0000000000000000	/* 0x3b TLS entry 2 */
+	.quad 0x0000000000000000	/* 0x43 TLS entry 3 */
+	.quad 0x0000000000000000	/* 0x4b reserved */
+	.quad 0x0000000000000000	/* 0x53 reserved */
+	.quad 0x0000000000000000	/* 0x5b reserved */
+
+	.quad 0x00cf9a000000ffff	/* 0x60 kernel 4GB code at 0x00000000 */
+	.quad 0x00cf92000000ffff	/* 0x68 kernel 4GB data at 0x00000000 */
+	.quad 0x00cffa000000ffff	/* 0x73 user 4GB code at 0x00000000 */
+	.quad 0x00cff2000000ffff	/* 0x7b user 4GB data at 0x00000000 */
+
+	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
+	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
+
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * The code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
+	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
+	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
+	.quad 0x0000920000000000	/* 0xa8 16-bit data */
+	.quad 0x0000920000000000	/* 0xb0 16-bit data */
+
+	/*
+	 * The APM segments have byte granularity and their bases
+	 * are set at run time.  All have 64k limits.
+	 */
+	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
+	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
+	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
+
+	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
+	.quad 0x0000000000000000	/* 0xd8 - unused */
+	.quad 0x0000000000000000	/* 0xe0 - unused */
+	.quad 0x0000000000000000	/* 0xe8 - unused */
+	.quad 0x0000000000000000	/* 0xf0 - unused */
+	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
+.endm
+
+/**
+ * CPUID_GET_VENDOR_INFO - macro for obtaining cpuid vendor info
+ * @cpuid_level:  address to store max basic supported cpuid level
+ * @x86_vendor_id: buffer to store vendor id, must be at least 12 bytes
+ */
+.macro CPUID_GET_VENDOR_INFO cpuid_level, x86_vendor_id
+	/* get vendor info */
+	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
+	cpuid
+	movl %eax,\cpuid_level		# save CPUID level
+	movl %ebx,\x86_vendor_id	# lo 4 chars
+	movl %edx,\x86_vendor_id+4	# next 4 chars
+	movl %ecx,\x86_vendor_id+8	# last 4 chars
+.endm
+
+/**
+ * CPUID_GET_CPU_TYPE - macro for obtaining cpuid version info
+ * @family:  address to store family
+ * @model:  address to store model
+ * @mask:  address to store mask
+ * @capability:  address to store capabilities
+ */
+.macro CPUID_GET_CPU_TYPE family, model, mask, capability
+	movl $1,%eax		# Use the CPUID instruction to get CPU type
+	cpuid
+	movb %al,%cl		# save reg for future use
+	andb $0x0f,%ah		# mask processor family
+	movb %ah,\family
+	andb $0xf0,%al		# mask model
+	shrb $4,%al
+	movb %al,\model
+	andb $0x0f,%cl		# mask mask revision
+	movb %cl,\mask
+	movl %edx,\capability
+.endm
--- /dev/null
+++ linus-2.6/arch/i386/mach-xen/head.S
@@ -0,0 +1,122 @@
+
+
+.text
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+
+#include "../kernel/head-cpu.S"
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86		new_cpu_data+CPUINFO_x86
+#define X86_VENDOR	new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL	new_cpu_data+CPUINFO_x86_model
+#define X86_MASK	new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH	new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID	new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY	new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
+
+ENTRY(startup_32)
+	movl %esi,xen_start_info
+	cld
+
+	/* Set up the stack pointer */
+	movl $(init_thread_union+THREAD_SIZE),%esp
+
+	CPUID_GET_VENDOR_INFO X86_CPUID, X86_VENDOR_ID
+	CPUID_GET_CPU_TYPE X86, X86_MODEL, X86_MASK, X86_CAPABILITY
+
+	movb $1,X86_HARD_MATH
+
+	xorl %eax,%eax			# Clear FS/GS and LDT
+	movl %eax,%fs
+	movl %eax,%gs
+	cld			# gcc2 wants the direction flag cleared at all times
+
+	call start_kernel
+L6:
+	jmp L6			# main should never return here, but
+				# just in case, we know what happens.
+
+#define HYPERCALL_PAGE_OFFSET 0x1000
+.org HYPERCALL_PAGE_OFFSET
+ENTRY(hypercall_page)
+.skip 0x1000
+
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
+
+/*
+ * BSS section
+ */
+.section ".bss.page_aligned","w"
+ENTRY(swapper_pg_dir)
+	.fill 1024,4,0
+ENTRY(empty_zero_page)
+	.fill 4096,1,0
+
+/*
+ * This starts the data section.
+ */
+.data
+
+	ALIGN
+	.word 0				# 32 bit align gdt_desc.address
+	.globl cpu_gdt_descr
+cpu_gdt_descr:
+	.word GDT_SIZE
+	.long cpu_gdt_table
+
+	.fill NR_CPUS-1,8,0		# space for the other GDT descriptors
+
+/*
+ * The Global Descriptor Table contains 32 quadwords, per-CPU.
+ */
+	.align PAGE_SIZE_asm
+ENTRY(cpu_gdt_table)
+	CPU_GDT_TABLE
+	/* Be sure this is zeroed to avoid false validations in Xen */
+	.fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
+
+
+/*
+ * __xen_guest information
+ */
+.macro utoa value
+ .if (\value) < 0 || (\value) >= 0x10
+	utoa (((\value)>>4)&0x0fffffff)
+ .endif
+ .if ((\value) & 0xf) < 10
+  .byte '0' + ((\value) & 0xf)
+ .else
+  .byte 'A' + ((\value) & 0xf) - 10
+ .endif
+.endm
+
+.section __xen_guest
+	.ascii	"GUEST_OS=linux,GUEST_VER=2.6"
+	.ascii	",XEN_VER=xen-3.0"
+	.ascii	",VIRT_BASE=0x"
+		utoa __PAGE_OFFSET
+	.ascii	",HYPERCALL_PAGE=0x"
+		utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
+	.ascii  ",FEATURES=!writable_page_tables"
+	.ascii	         "|!auto_translated_physmap"
+#ifdef CONFIG_X86_PAE
+	.ascii	",PAE=yes"
+#else
+	.ascii	",PAE=no"
+#endif
+	.ascii	",LOADER=generic"
+	.byte	0

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 11/35] Add support for Xen to entry.S.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (9 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 10/35] Add a new head.S start-of-day file for booting on Xen Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 16:51   ` Andi Kleen
  2006-05-09  7:00 ` [RFC PATCH 12/35] Add start-of-day setup hooks to subarch Chris Wright
                   ` (24 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-entry.S --]
[-- Type: text/plain, Size: 7899 bytes --]

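- replace cli/sti with DISABLE_INTERRUPTS/ENABLE_INTERRUPTS macros
  (on Xen these set/clear evtchn_upcall_mask instead)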
- change test for user mode return to work for kernel mode in ring1
- check hypervisor saved event mask on return from exception
- add entry points for the hypervisor upcall handlers
- avoid math emulation check when running on Xen
- add nmi handler for running on Xen

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/entry.S |  137 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 120 insertions(+), 17 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/entry.S
+++ linus-2.6/arch/i386/kernel/entry.S
@@ -75,8 +75,38 @@ DF_MASK		= 0x00000400 
 NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
+#ifndef CONFIG_XEN
+#define DISABLE_INTERRUPTS	cli
+#define ENABLE_INTERRUPTS	sti
+#else
+#include <xen/interface/xen.h>
+
+EVENT_MASK	= 0x2E
+
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending		/* 0 */
+#define evtchn_upcall_mask		1
+
+#define sizeof_vcpu_shift		6
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO		movl TI_cpu(%ebp),%esi			; \
+				shl  $sizeof_vcpu_shift,%esi		; \
+				addl HYPERVISOR_shared_info,%esi
+#else
+#define GET_VCPU_INFO		movl HYPERVISOR_shared_info,%esi
+#endif
+
+#define __DISABLE_INTERRUPTS	movb $1,evtchn_upcall_mask(%esi)
+#define DISABLE_INTERRUPTS	GET_VCPU_INFO				; \
+				__DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS	GET_VCPU_INFO				; \
+				movb $0,evtchn_upcall_mask(%esi)
+#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(%esi)
+#endif
+
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli
+#define preempt_stop		DISABLE_INTERRUPTS
 #else
 #define preempt_stop
 #define resume_kernel		restore_nocheck
@@ -145,10 +175,10 @@ ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
-	testl $(VM_MASK | 3), %eax
+	testl $(VM_MASK | USER_MODE_MASK), %eax
 	jz resume_kernel
 ENTRY(resume_userspace)
- 	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -159,7 +189,7 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
-	cli
+	DISABLE_INTERRUPTS
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
@@ -179,7 +209,7 @@ need_resched:
 ENTRY(sysenter_entry)
 	movl TSS_sysenter_esp0(%esp),%esp
 sysenter_past_esp:
-	sti
+	ENABLE_INTERRUPTS
 	pushl $(__USER_DS)
 	pushl %ebp
 	pushfl
@@ -209,7 +239,7 @@ sysenter_past_esp:
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
-	cli
+	DISABLE_INTERRUPTS
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
@@ -217,7 +247,7 @@ sysenter_past_esp:
 	movl EIP(%esp), %edx
 	movl OLDESP(%esp), %ecx
 	xorl %ebp,%ebp
-	sti
+	ENABLE_INTERRUPTS
 	sysexit
 
 
@@ -240,7 +270,7 @@ syscall_call:
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)		# store the return value
 syscall_exit:
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -248,6 +278,7 @@ syscall_exit:
 	jne syscall_exit_work
 
 restore_all:
+#ifndef CONFIG_XEN
 	movl EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
 	# Warning: OLDSS(%esp) contains the wrong/random values if we
 	# are returning to the kernel.
@@ -258,12 +289,32 @@ restore_all:
 	cmpl $((4 << 8) | 3), %eax
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
+#else
+restore_nocheck:
+	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb CS(%esp), %al
+	andl $(VM_MASK | 3), %eax
+	cmpl $3, %eax
+	jne hypervisor_iret
+	ENABLE_INTERRUPTS
+	__TEST_PENDING
+	jz restore_regs_and_iret
+	__DISABLE_INTERRUPTS
+	jmp do_hypervisor_callback
+hypervisor_iret:
+	RESTORE_REGS
+	addl $4, %esp
+	jmp  hypercall_page + (__HYPERVISOR_iret * 32)
+#endif
+restore_regs_and_iret:
 	RESTORE_REGS
 	addl $4, %esp
 1:	iret
 .section .fixup,"ax"
 iret_exc:
-	sti
+#ifndef CONFIG_XEN
+	ENABLE_INTERRUPTS
+#endif
 	pushl $0			# no error code
 	pushl $do_iret_error
 	jmp error_code
@@ -273,6 +324,7 @@ iret_exc:
 	.long 1b,iret_exc
 .previous
 
+#ifndef CONFIG_XEN
 ldt_ss:
 	larl OLDSS(%esp), %eax
 	jnz restore_nocheck
@@ -285,7 +337,7 @@ ldt_ss:
 	 * CPUs, which we can try to work around to make
 	 * dosemu and wine happy. */
 	subl $8, %esp		# reserve space for switch16 pointer
-	cli
+	DISABLE_INTERRUPTS
 	movl %esp, %eax
 	/* Set up the 16bit stack frame with switch32 pointer on top,
 	 * and a switch16 pointer on top of the current frame. */
@@ -297,6 +349,7 @@ ldt_ss:
 	.align 4
 	.long 1b,iret_exc
 .previous
+#endif
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
@@ -305,7 +358,7 @@ work_pending:
 	jz work_notifysig
 work_resched:
 	call schedule
-	cli				# make sure we don't miss an interrupt
+	DISABLE_INTERRUPTS		# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -357,7 +410,7 @@ syscall_trace_entry:
 syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
-	sti				# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS		# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
 	movl $1, %edx
@@ -377,6 +430,7 @@ syscall_badsys:
 	movl $-ENOSYS,EAX(%esp)
 	jmp resume_userspace
 
+#ifndef CONFIG_XEN
 #define FIXUP_ESPFIX_STACK \
 	movl %esp, %eax; \
 	/* switch to 32bit stack using the pointer on top of 16bit stack */ \
@@ -435,6 +489,9 @@ ENTRY(name)				\
 
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
+#else
+#define UNWIND_ESPFIX_STACK
+#endif
 
 ENTRY(divide_error)
 	pushl $0			# no error code
@@ -466,6 +523,44 @@ error_code:
 	call *%edi
 	jmp ret_from_exception
 
+#ifdef CONFIG_XEN
+ENTRY(hypervisor_callback)
+	pushl %eax
+	SAVE_ALL
+do_hypervisor_callback:
+	push %esp
+	call evtchn_do_upcall
+	add  $4,%esp
+	jmp  ret_from_intr
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+1:	popl %ds
+2:	popl %es
+3:	popl %fs
+4:	popl %gs
+	subl $4,%esp
+	SAVE_ALL
+	jmp  ret_from_exception
+.section .fixup,"ax";	\
+6:	movl $0,(%esp);	\
+	jmp 1b;		\
+7:	movl $0,(%esp);	\
+	jmp 2b;		\
+8:	movl $0,(%esp);	\
+	jmp 3b;		\
+9:	movl $0,(%esp);	\
+	jmp 4b;		\
+.previous;		\
+.section __ex_table,"a";\
+	.align 4;	\
+	.long 1b,6b;	\
+	.long 2b,7b;	\
+	.long 3b,8b;	\
+	.long 4b,9b;	\
+.previous
+#endif
+
 ENTRY(coprocessor_error)
 	pushl $0
 	pushl $do_coprocessor_error
@@ -479,17 +574,19 @@ ENTRY(simd_coprocessor_error)
 ENTRY(device_not_available)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
+#ifndef CONFIG_XEN
 	movl %cr0, %eax
 	testl $0x4, %eax		# EM (math emulation bit)
-	jne device_not_available_emulate
-	preempt_stop
-	call math_state_restore
-	jmp ret_from_exception
-device_not_available_emulate:
+	je device_available_emulate
 	pushl $0			# temporary storage for ORIG_EIP
 	call math_emulate
 	addl $4, %esp
 	jmp ret_from_exception
+device_available_emulate:
+#endif
+	preempt_stop
+	call math_state_restore
+	jmp ret_from_exception
 
 /*
  * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -525,6 +622,8 @@ debug_stack_correct:
 	call do_debug
 	jmp ret_from_exception
 	.previous .text
+
+#ifndef CONFIG_XEN
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
  * a debug fault, and the debug fault hasn't yet been able to
@@ -595,6 +694,10 @@ nmi_16bit_stack:
 	.align 4
 	.long 1b,iret_exc
 .previous
+#else
+ENTRY(nmi)
+	jmp restore_all
+#endif
 
 KPROBE_ENTRY(int3)
 	pushl $-1			# mark this as an int

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 12/35] Add start-of-day setup hooks to subarch
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (10 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 11/35] Add support for Xen to entry.S Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 13/35] Support loading an initrd when running on Xen Chris Wright
                   ` (23 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-setup --]
[-- Type: text/plain, Size: 6100 bytes --]

Implement the start-of-day subarchitecture setup hooks for booting on
Xen. Add subarch macros for determining loader type and initrd
location.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/setup.c                    |    5 ++
 include/asm-i386/hypervisor.h               |    4 +
 include/asm-i386/mach-default/mach_setup.h  |   12 +++++
 include/asm-i386/mach-xen/mach_setup.h      |   13 ++++++
 include/asm-i386/mach-xen/setup_arch_post.h |   60 ++++++++++++++++++++++++++++
 include/asm-i386/mach-xen/setup_arch_pre.h  |   22 ++++++++++
 include/asm-i386/setup.h                    |    8 ++-
 7 files changed, 121 insertions(+), 3 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/setup.c
+++ linus-2.6/arch/i386/kernel/setup.c
@@ -458,6 +458,7 @@ static void __init print_memory_map(char
 	}
 }
 
+#ifndef HAVE_ARCH_E820_SANITIZE
 /*
  * Sanitize the BIOS e820 map.
  *
@@ -677,6 +678,7 @@ static int __init copy_e820_map(struct e
 	} while (biosmap++,--nr_map);
 	return 0;
 }
+#endif
 
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
@@ -1578,6 +1580,9 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &dummy_con;
 #endif
 #endif
+#ifdef ARCH_FINAL_SETUP
+	ARCH_FINAL_SETUP
+#endif
 }
 
 static __init int add_pcspkr(void)
--- linus-2.6.orig/include/asm-i386/hypervisor.h
+++ linus-2.6/include/asm-i386/hypervisor.h
@@ -56,6 +56,10 @@
 extern struct shared_info *HYPERVISOR_shared_info;
 extern struct start_info *xen_start_info;
 
+/* arch/i386/kernel/entry.S */
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+
 /* arch/i386/mach-xen/evtchn.c */
 /* Force a proper event-channel callback from Xen. */
 extern void force_evtchn_callback(void);
--- linus-2.6.orig/include/asm-i386/setup.h
+++ linus-2.6/include/asm-i386/setup.h
@@ -49,10 +49,10 @@ extern unsigned char boot_params[PARAM_S
 #define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA))
 #define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC))
 #define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF))
-#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
+#define LOADER_TYPE MACH_LOADER_TYPE
 #define KERNEL_START (*(unsigned long *) (PARAM+0x214))
-#define INITRD_START (*(unsigned long *) (PARAM+0x218))
-#define INITRD_SIZE (*(unsigned long *) (PARAM+0x21c))
+#define INITRD_START MACH_INITRD_START
+#define INITRD_SIZE MACH_INITRD_SIZE
 #define EDID_INFO   (*(struct edid_info *) (PARAM+0x140))
 #define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
 #define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF))
@@ -61,4 +61,6 @@ extern unsigned char boot_params[PARAM_S
 
 #endif /* __ASSEMBLY__ */
 
+#include <mach_setup.h>
+
 #endif /* _i386_SETUP_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_setup.h
@@ -0,0 +1,12 @@
+#ifndef __ASM_MACH_SETUP_H
+#define __ASM_MACH_SETUP_H
+
+#ifndef __ASSEMBLY__
+
+#define MACH_LOADER_TYPE (*(unsigned char *) (PARAM+0x210))
+#define MACH_INITRD_START (*(unsigned long *) (PARAM+0x218))
+#define MACH_INITRD_SIZE (*(unsigned long *) (PARAM+0x21c))
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_MACH_SETUP_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_setup.h
@@ -0,0 +1,13 @@
+#ifndef __ASM_MACH_SETUP_H
+#define __ASM_MACH_SETUP_H
+
+#ifndef __ASSEMBLY__
+
+#define MACH_LOADER_TYPE 0x6e6558	/* "Xen" */
+#define MACH_INITRD_START \
+	(xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0)
+#define MACH_INITRD_SIZE (xen_start_info->mod_len)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_MACH_SETUP_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_post.h
@@ -0,0 +1,60 @@
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ *
+ * Description:
+ *	This is included late in kernel/setup.c so that it can make
+ *	use of all of the static functions.
+ **/
+
+#include <xen/interface/physdev.h>
+
+static char * __init machine_specific_memory_setup(void)
+{
+	unsigned long max_pfn = xen_start_info->nr_pages;
+
+	e820.nr_map = 0;
+	add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM);
+
+	return "Xen";
+}
+
+static void __init machine_specific_arch_setup(void)
+{
+	struct physdev_op op;
+
+	HYPERVISOR_shared_info =
+		(struct shared_info *)__va(xen_start_info->shared_info);
+	memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+	HYPERVISOR_set_callbacks(
+	    __KERNEL_CS, (unsigned long)hypervisor_callback,
+	    __KERNEL_CS, (unsigned long)failsafe_callback);
+
+	init_pg_tables_end = __pa(xen_start_info->pt_base) +
+		PFN_PHYS(xen_start_info->nr_pt_frames);
+
+	op.cmd = PHYSDEVOP_SET_IOPL;
+	op.u.set_iopl.iopl = 1;
+	HYPERVISOR_physdev_op(&op);
+
+#ifdef CONFIG_ACPI
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+		acpi_disabled = 1;
+		acpi_ht = 0;
+	}
+#endif
+
+	memcpy(saved_command_line, xen_start_info->cmd_line,
+	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+}
+
+static void __init machine_specific_arch_final_setup(void)
+{
+	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+		extern int console_use_vt;
+		console_use_vt = 0;
+		conswitchp = NULL;
+	}
+}
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_pre.h
@@ -0,0 +1,22 @@
+
+#include <xen/interface/xen.h>
+#include <asm/hypervisor.h>
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+
+/*
+ * Point at the empty zero page to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info =
+	(struct shared_info *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+#define ARCH_SETUP machine_specific_arch_setup();
+#define ARCH_FINAL_SETUP machine_specific_arch_final_setup();
+
+static void __init machine_specific_arch_setup(void);
+static void __init machine_specific_arch_final_setup(void);
+
+#define HAVE_ARCH_E820_SANITIZE

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 13/35] Support loading an initrd when running on Xen
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (11 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 12/35] Add start-of-day setup hooks to subarch Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 14/35] Subarch support for CPUID instruction Chris Wright
                   ` (22 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-setup-initrd --]
[-- Type: text/plain, Size: 1940 bytes --]

Due to the initial physical memory layout when booting on Xen, the initrd
image ends up below min_low_pfn (as registered with the bootstrap memory
allocator).  Add Xen subarch support to enable initrd_below_start_ok flag,
and disable initrd_reserve_bootmem.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/setup.c                    |    4 +++-
 include/asm-i386/mach-xen/setup_arch_post.h |    6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

--- linus-2.6.orig/arch/i386/kernel/setup.c
+++ linus-2.6/arch/i386/kernel/setup.c
@@ -1211,6 +1211,7 @@ void __init zone_sizes_init(void)
 extern unsigned long __init setup_memory(void);
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
+int initrd_reserve_bootmem = 1;
 
 void __init setup_bootmem_allocator(void)
 {
@@ -1271,7 +1272,8 @@ void __init setup_bootmem_allocator(void
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (LOADER_TYPE && INITRD_START) {
 		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
-			reserve_bootmem(INITRD_START, INITRD_SIZE);
+			if (initrd_reserve_bootmem)
+				reserve_bootmem(INITRD_START, INITRD_SIZE);
 			initrd_start =
 				INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
 			initrd_end = initrd_start+INITRD_SIZE;
--- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_post.h
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_post.h
@@ -48,6 +48,12 @@ static void __init machine_specific_arch
 	memcpy(saved_command_line, xen_start_info->cmd_line,
 	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
 	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+	if (INITRD_START &&
+	    INITRD_START + PAGE_OFFSET < min_low_pfn << PAGE_SHIFT) {
+		initrd_reserve_bootmem = 0;
+		initrd_below_start_ok = 1;
+	}
 }
 
 static void __init machine_specific_arch_final_setup(void)

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 14/35] Subarch support for CPUID instruction
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (12 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 13/35] Support loading an initrd when running on Xen Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 15/35] subarch support for controlling interrupt delivery Chris Wright
                   ` (21 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-cpuid --]
[-- Type: text/plain, Size: 3516 bytes --]

Allow subarchitectures to modify the CPUID instruction.  This allows
the subarch to provide a limited set of CPUID feature flags during CPU
identification.  Add a subarch implementation for Xen that traps to the
hypervisor where unsupported feature flags can be hidden from guests.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/head-cpu.S                    |    4 ++--
 include/asm-i386/mach-default/mach_processor.h |    7 +++++++
 include/asm-i386/mach-xen/mach_processor.h     |    9 +++++++++
 include/asm-i386/processor.h                   |   13 +++++++------
 4 files changed, 25 insertions(+), 8 deletions(-)

--- linus-2.6.orig/include/asm-i386/processor.h
+++ linus-2.6/include/asm-i386/processor.h
@@ -21,6 +21,7 @@
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <mach_processor.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -148,7 +149,7 @@ static inline void detect_ht(struct cpui
  */
 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
 {
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (*eax),
 		  "=b" (*ebx),
 		  "=c" (*ecx),
@@ -160,7 +161,7 @@ static inline void cpuid(unsigned int op
 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
 	       	int *edx)
 {
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (*eax),
 		  "=b" (*ebx),
 		  "=c" (*ecx),
@@ -175,7 +176,7 @@ static inline unsigned int cpuid_eax(uns
 {
 	unsigned int eax;
 
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (eax)
 		: "0" (op)
 		: "bx", "cx", "dx");
@@ -185,7 +186,7 @@ static inline unsigned int cpuid_ebx(uns
 {
 	unsigned int eax, ebx;
 
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (eax), "=b" (ebx)
 		: "0" (op)
 		: "cx", "dx" );
@@ -195,7 +196,7 @@ static inline unsigned int cpuid_ecx(uns
 {
 	unsigned int eax, ecx;
 
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (eax), "=c" (ecx)
 		: "0" (op)
 		: "bx", "dx" );
@@ -205,7 +206,7 @@ static inline unsigned int cpuid_edx(uns
 {
 	unsigned int eax, edx;
 
-	__asm__("cpuid"
+	__asm__(CPUID_STR
 		: "=a" (eax), "=d" (edx)
 		: "0" (op)
 		: "bx", "cx");
--- linus-2.6.orig/arch/i386/kernel/head-cpu.S
+++ linus-2.6/arch/i386/kernel/head-cpu.S
@@ -59,7 +59,7 @@
 .macro CPUID_GET_VENDOR_INFO cpuid_level, x86_vendor_id
 	/* get vendor info */
 	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
-	cpuid
+	CPUID
 	movl %eax,\cpuid_level		# save CPUID level
 	movl %ebx,\x86_vendor_id	# lo 4 chars
 	movl %edx,\x86_vendor_id+4	# next 4 chars
@@ -75,7 +75,7 @@
  */
 .macro CPUID_GET_CPU_TYPE family, model, mask, capability
 	movl $1,%eax		# Use the CPUID instruction to get CPU type
-	cpuid
+	CPUID
 	movb %al,%cl		# save reg for future use
 	andb $0x0f,%ah		# mask processor family
 	movb %ah,\family
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_processor.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_MACH_PROCESSOR_H
+#define __ASM_MACH_PROCESSOR_H
+
+#define CPUID cpuid
+#define CPUID_STR "cpuid"
+
+#endif /* __ASM_MACH_PROCESSOR_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_processor.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_MACH_PROCESSOR_H
+#define __ASM_MACH_PROCESSOR_H
+
+#include <xen/interface/arch-x86_32.h>
+
+#define CPUID XEN_CPUID
+#define CPUID_STR XEN_CPUID
+
+#endif /* __ASM_MACH_PROCESSOR_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (13 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 14/35] Subarch support for CPUID instruction Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09  7:00 ` [RFC PATCH 16/35] subarch support for interrupt and exception gates Chris Wright
                   ` (20 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-interrupt-control --]
[-- Type: text/plain, Size: 6188 bytes --]

Abstract the code that controls interrupt delivery, and add a separate
subarch implementation for Xen that manipulates a shared-memory event
delivery mask.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/mach-default/mach_system.h |   24 ++++++
 include/asm-i386/mach-xen/mach_system.h     |  103 ++++++++++++++++++++++++++++
 include/asm-i386/system.h                   |   20 -----
 3 files changed, 128 insertions(+), 19 deletions(-)

--- linus-2.6.orig/include/asm-i386/system.h
+++ linus-2.6/include/asm-i386/system.h
@@ -490,25 +490,7 @@ static inline unsigned long long __cmpxc
 
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 
-/* interrupt control.. */
-#define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
-#define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
-#define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
-/* used in the idle loop; sti takes one instruction cycle to complete */
-#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
-/* used when interrupts are already enabled or to shutdown the processor */
-#define halt()			__asm__ __volatile__("hlt": : :"memory")
-
-#define irqs_disabled()			\
-({					\
-	unsigned long flags;		\
-	local_save_flags(flags);	\
-	!(flags & (1<<9));		\
-})
-
-/* For spinlocks etc */
-#define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
+#include <mach_system.h>
 
 /*
  * disable hlt during certain critical i/o operations
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_system.h
@@ -0,0 +1,24 @@
+#ifndef __ASM_MACH_SYSTEM_H
+#define __ASM_MACH_SYSTEM_H
+
+/* interrupt control.. */
+#define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
+#define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
+#define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
+#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+/* used in the idle loop; sti takes one instruction cycle to complete */
+#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()			__asm__ __volatile__("hlt": : :"memory")
+
+#define irqs_disabled()			\
+({					\
+	unsigned long flags;		\
+	local_save_flags(flags);	\
+	!(flags & (1<<9));		\
+})
+
+/* For spinlocks etc */
+#define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
+
+#endif /* __ASM_MACH_SYSTEM_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_system.h
@@ -0,0 +1,103 @@
+#ifndef __ASM_MACH_SYSTEM_H
+#define __ASM_MACH_SYSTEM_H
+
+#ifdef __KERNEL__
+
+#include <asm/hypervisor.h>
+
+#ifdef CONFIG_SMP
+#define __vcpu_id smp_processor_id()
+#else
+#define __vcpu_id 0
+#endif
+
+/* interrupt control.. */
+
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
+ */
+
+#define __cli()								\
+do {									\
+	struct vcpu_info *_vcpu;					\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	_vcpu->evtchn_upcall_mask = 1;					\
+	preempt_enable_no_resched();					\
+	barrier();							\
+} while (0)
+
+#define __sti()								\
+do {									\
+	struct vcpu_info *_vcpu;					\
+	barrier();							\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	_vcpu->evtchn_upcall_mask = 0;					\
+	barrier(); /* unmask then check (avoid races) */		\
+	if (unlikely(_vcpu->evtchn_upcall_pending))			\
+		force_evtchn_callback();				\
+	preempt_enable();						\
+} while (0)
+
+#define __save_flags(x)							\
+do {									\
+	struct vcpu_info *_vcpu;					\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	(x) = _vcpu->evtchn_upcall_mask;				\
+	preempt_enable();						\
+} while (0)
+
+#define __restore_flags(x)						\
+do {									\
+	struct vcpu_info *_vcpu;					\
+	barrier();							\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
+		barrier(); /* unmask then check (avoid races) */	\
+		if (unlikely(_vcpu->evtchn_upcall_pending))		\
+			force_evtchn_callback();			\
+		preempt_enable();					\
+	} else								\
+		preempt_enable_no_resched();				\
+} while (0)
+
+#define safe_halt()		((void)0)
+#define halt()			((void)0)
+
+#define __save_and_cli(x)						\
+do {									\
+	struct vcpu_info *_vcpu;					\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	(x) = _vcpu->evtchn_upcall_mask;				\
+	_vcpu->evtchn_upcall_mask = 1;					\
+	preempt_enable_no_resched();					\
+	barrier();							\
+} while (0)
+
+#define local_irq_save(x)	__save_and_cli(x)
+#define local_irq_restore(x)	__restore_flags(x)
+#define local_save_flags(x)	__save_flags(x)
+#define local_irq_disable()	__cli()
+#define local_irq_enable()	__sti()
+
+/* Cannot use preempt_enable() here as we would recurse in preempt_schedule(). */
+#define irqs_disabled()							\
+({	int ___x;							\
+	struct vcpu_info *_vcpu;					\
+	preempt_disable();						\
+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
+	___x = (_vcpu->evtchn_upcall_mask != 0);			\
+	preempt_enable_no_resched();					\
+	___x; })
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_MACH_SYSTEM_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 16/35] subarch support for interrupt and exception gates
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (14 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 15/35] subarch support for controlling interrupt delivery Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 11:09   ` Andi Kleen
  2006-05-13 12:27   ` Andrew Morton
  2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
                   ` (19 subsequent siblings)
  35 siblings, 2 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-idt --]
[-- Type: text/plain, Size: 5213 bytes --]

Abstract the code that sets up interrupt and exception gates, and
add a separate subarch implementation for Xen.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/traps.c                   |   49 ---------------------------
 include/asm-i386/mach-default/mach_idt.h   |   52 +++++++++++++++++++++++++++++
 include/asm-i386/mach-xen/mach_idt.h       |   50 +++++++++++++++++++++++++++
 include/asm-i386/mach-xen/setup_arch_pre.h |    2 +
 4 files changed, 105 insertions(+), 48 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/traps.c
+++ linus-2.6/arch/i386/kernel/traps.c
@@ -1086,54 +1086,7 @@ void __init trap_init_f00f_bug(void)
 }
 #endif
 
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
-  int __d0, __d1; \
-  __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
-	"movw %4,%%dx\n\t" \
-	"movl %%eax,%0\n\t" \
-	"movl %%edx,%1" \
-	:"=m" (*((long *) (gate_addr))), \
-	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
-	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
-	 "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-void set_intr_gate(unsigned int n, void *addr)
-{
-	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
-}
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
-	_set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
-}
-
-static void __init set_trap_gate(unsigned int n, void *addr)
-{
-	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
-}
-
-static void __init set_system_gate(unsigned int n, void *addr)
-{
-	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
-}
-
-static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
-	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
-}
-
+#include <mach_idt.h>
 
 void __init trap_init(void)
 {
--- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_pre.h
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_pre.h
@@ -5,6 +5,8 @@
 struct start_info *xen_start_info;
 EXPORT_SYMBOL(xen_start_info);
 
+struct trap_info xen_trap_table[257];
+
 /*
  * Point at the empty zero page to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_idt.h
@@ -0,0 +1,52 @@
+#ifndef __ASM_MACH_IDT_H
+#define __ASM_MACH_IDT_H
+
+#define _set_gate(gate_addr,type,dpl,addr,seg) \
+do { \
+  int __d0, __d1; \
+  __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
+	"movw %4,%%dx\n\t" \
+	"movl %%eax,%0\n\t" \
+	"movl %%edx,%1" \
+	:"=m" (*((long *) (gate_addr))), \
+	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
+	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
+	 "3" ((char *) (addr)),"2" ((seg) << 16)); \
+} while (0)
+
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+void set_intr_gate(unsigned int n, void *addr)
+{
+	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+	_set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+}
+
+static void __init set_trap_gate(unsigned int n, void *addr)
+{
+	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+}
+
+static void __init set_system_gate(unsigned int n, void *addr)
+{
+	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+}
+
+static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+}
+
+#endif /* __ASM_MACH_IDT_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_idt.h
@@ -0,0 +1,50 @@
+#ifndef __ASM_MACH_IDT_H
+#define __ASM_MACH_IDT_H
+
+static inline void _set_gate(unsigned int vector, uint8_t type, uint8_t dpl,
+			     void *addr, uint16_t seg)
+{
+	struct trap_info *t = xen_trap_table;
+
+	BUG_ON(vector > 256);
+
+	while (t->address && t->vector != vector)
+		t++;
+
+	t->vector = vector;
+	t->cs = seg;
+	TI_SET_DPL(t, dpl);
+	if (type == 14 || vector == 7)
+		TI_SET_IF(t, 1);
+	t->address = (unsigned long)addr;
+}
+
+void set_intr_gate(unsigned int n, void *addr)
+{
+	_set_gate(n, 14, 0, addr, __KERNEL_CS);
+}
+
+/*
+ * This routine sets up an interrupt gate at descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+	_set_gate(n, 14, 3, addr, __KERNEL_CS);
+}
+
+static void __init set_trap_gate(unsigned int n, void *addr)
+{
+	_set_gate(n, 15, 0, addr, __KERNEL_CS);
+}
+
+static void __init set_system_gate(unsigned int n, void *addr)
+{
+	_set_gate(n, 15, 3, addr, __KERNEL_CS);
+}
+
+static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+	/* _set_gate(n, 5, 0, 0, (gdt_entry<<3)); */
+}
+
+#endif /* __ASM_MACH_IDT_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (15 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 16/35] subarch support for interrupt and exception gates Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:16   ` Pavel Machek
                     ` (2 more replies)
  2006-05-09  7:00 ` [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen Chris Wright
                   ` (18 subsequent siblings)
  35 siblings, 3 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-segments --]
[-- Type: text/plain, Size: 5780 bytes --]

1. We clear FS/GS before changing TLS entries and switching LDT, as
otherwise the hypervisor will fail to restore thread-local values on
return to the guest kernel and we take a slow exception path.

2. We allow for the fact that the guest kernel may not run in ring 0.
This requires some abstraction in a few places when setting %cs or
checking privilege level (user vs kernel).

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/process.c                   |    4 +++-
 arch/i386/mm/fault.c                         |    8 +++++---
 include/asm-i386/mach-default/mach_segment.h |    8 ++++++++
 include/asm-i386/mach-default/mach_system.h  |    2 ++
 include/asm-i386/mach-xen/mach_segment.h     |    9 +++++++++
 include/asm-i386/mach-xen/mach_system.h      |    2 ++
 include/asm-i386/mach-xen/setup_arch_post.h  |    2 ++
 include/asm-i386/ptrace.h                    |    6 ++++--
 include/asm-i386/segment.h                   |    2 ++
 9 files changed, 37 insertions(+), 6 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/process.c
+++ linus-2.6/arch/i386/kernel/process.c
@@ -347,7 +347,7 @@ int kernel_thread(int (*fn)(void *), voi
 	regs.xes = __USER_DS;
 	regs.orig_eax = -1;
 	regs.eip = (unsigned long) kernel_thread_helper;
-	regs.xcs = __KERNEL_CS;
+	regs.xcs = get_kernel_cs();
 	regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
 
 	/* Ok, create the new process.. */
@@ -647,6 +647,8 @@ struct task_struct fastcall * __switch_t
 	 */
 	savesegment(fs, prev->fs);
 	savesegment(gs, prev->gs);
+	clearsegment(fs);
+	clearsegment(gs);
 
 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
--- linus-2.6.orig/arch/i386/mm/fault.c
+++ linus-2.6/arch/i386/mm/fault.c
@@ -28,6 +28,8 @@
 #include <asm/desc.h>
 #include <asm/kdebug.h>
 
+#include <mach_segment.h>
+
 extern void die(const char *,struct pt_regs *,long);
 
 /*
@@ -78,14 +80,14 @@ static inline unsigned long get_segment_
 	u32 seg_ar, seg_limit, base, *desc;
 
 	/* The standard kernel/user address space limit. */
-	*eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
+	*eip_limit = (seg & USER_MODE_MASK) ? USER_DS.seg : KERNEL_DS.seg;
 
 	/* Unlikely, but must come before segment checks. */
 	if (unlikely((regs->eflags & VM_MASK) != 0))
 		return eip + (seg << 4);
 	
 	/* By far the most common cases. */
-	if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+	if (likely(seg == __USER_CS || seg == get_kernel_cs()))
 		return eip;
 
 	/* Check the segment exists, is within the current LDT/GDT size,
@@ -400,7 +402,7 @@ good_area:
 	switch (error_code & 3) {
 		default:	/* 3: write, present */
 #ifdef TEST_VERIFY_AREA
-			if (regs->cs == KERNEL_CS)
+			if (regs->cs == get_kernel_cs())
 				printk("WP fault at %08lx\n", regs->eip);
 #endif
 			/* fall through */
--- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
+++ linus-2.6/include/asm-i386/mach-default/mach_system.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_MACH_SYSTEM_H
 #define __ASM_MACH_SYSTEM_H
 
+#define clearsegment(seg)
+
 /* interrupt control.. */
 #define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
 #define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
--- linus-2.6.orig/include/asm-i386/mach-xen/mach_system.h
+++ linus-2.6/include/asm-i386/mach-xen/mach_system.h
@@ -5,6 +5,8 @@
 
 #include <asm/hypervisor.h>
 
+#define clearsegment(seg) loadsegment(seg, 0)
+
 #ifdef CONFIG_SMP
 #define __vcpu_id smp_processor_id()
 #else
--- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_post.h
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_post.h
@@ -26,6 +26,8 @@ static void __init machine_specific_arch
 		(struct shared_info *)__va(xen_start_info->shared_info);
 	memset(empty_zero_page, 0, sizeof(empty_zero_page));
 
+	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+
 	HYPERVISOR_set_callbacks(
 	    __KERNEL_CS, (unsigned long)hypervisor_callback,
 	    __KERNEL_CS, (unsigned long)failsafe_callback);
--- linus-2.6.orig/include/asm-i386/ptrace.h
+++ linus-2.6/include/asm-i386/ptrace.h
@@ -1,6 +1,8 @@
 #ifndef _I386_PTRACE_H
 #define _I386_PTRACE_H
 
+#include <mach_segment.h>
+
 #define EBX 0
 #define ECX 1
 #define EDX 2
@@ -73,11 +75,11 @@ extern void send_sigtrap(struct task_str
  */
 static inline int user_mode(struct pt_regs *regs)
 {
-	return (regs->xcs & 3) != 0;
+	return (regs->xcs & USER_MODE_MASK) != 0;
 }
 static inline int user_mode_vm(struct pt_regs *regs)
 {
-	return ((regs->xcs & 3) | (regs->eflags & VM_MASK)) != 0;
+	return ((regs->xcs & USER_MODE_MASK) | (regs->eflags & VM_MASK)) != 0;
 }
 #define instruction_pointer(regs) ((regs)->eip)
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
--- linus-2.6.orig/include/asm-i386/segment.h
+++ linus-2.6/include/asm-i386/segment.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_SEGMENT_H
 #define _ASM_SEGMENT_H
 
+#include <mach_segment.h>
+
 /*
  * The layout of the per-CPU GDT under Linux:
  *
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_segment.h
@@ -0,0 +1,8 @@
+#ifndef __ASM_MACH_SEGMENT_H
+#define __ASM_MACH_SEGMENT_H
+
+#define USER_MODE_MASK 3
+
+#define get_kernel_cs() __KERNEL_CS
+
+#endif /* __ASM_MACH_SEGMENT_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_segment.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_MACH_SEGMENT_H
+#define __ASM_MACH_SEGMENT_H
+
+#define USER_MODE_MASK 2
+
+#define get_kernel_cs() \
+	(__KERNEL_CS + (xen_feature(XENFEAT_supervisor_mode_kernel) ? 0 : 1))
+
+#endif /* __ASM_MACH_SEGMENT_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (16 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:21   ` Pavel Machek
  2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09  7:00 ` [RFC PATCH 19/35] subarch support for control register accesses Chris Wright
                   ` (17 subsequent siblings)
  35 siblings, 2 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-desc --]
[-- Type: text/plain, Size: 8200 bytes --]

Move the macros which handle the GDT, IDT and LDT into a subarch include
file and add implementations for running on Xen.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/desc.h                   |   65 ++--------------------------
 include/asm-i386/mach-default/mach_desc.h |   67 +++++++++++++++++++++++++++++
 include/asm-i386/mach-xen/mach_desc.h     |   69 ++++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+), 60 deletions(-)

--- linus-2.6.orig/include/asm-i386/desc.h
+++ linus-2.6/include/asm-i386/desc.h
@@ -33,18 +33,7 @@ static inline struct desc_struct *get_cp
 	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
-#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
-
-#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
-#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
-
-#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
-#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
-#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
-#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
+#include <mach_desc.h>
 
 /*
  * This is the ldt that every process will get unless we need
@@ -53,30 +42,6 @@ static inline struct desc_struct *get_cp
 extern struct desc_struct default_ldt[];
 extern void set_intr_gate(unsigned int irq, void * addr);
 
-#define _set_tssldt_desc(n,addr,limit,type) \
-__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
-	"movw %w1,2(%2)\n\t" \
-	"rorl $16,%1\n\t" \
-	"movb %b1,4(%2)\n\t" \
-	"movb %4,5(%2)\n\t" \
-	"movb $0,6(%2)\n\t" \
-	"movb %h1,7(%2)\n\t" \
-	"rorl $16,%1" \
-	: "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
-
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
-{
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
-		offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
-}
-
-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
-
-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
-{
-	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
-}
-
 #define LDT_entry_a(info) \
 	((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
 
@@ -102,30 +67,11 @@ static inline void set_ldt_desc(unsigned
 	(info)->seg_not_present	== 1	&& \
 	(info)->useable		== 0	)
 
-static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
-{
-	__u32 *lp = (__u32 *)((char *)ldt + entry*8);
-	*lp = entry_a;
-	*(lp+1) = entry_b;
-}
-
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
-	C(0); C(1); C(2);
-#undef C
-}
-
 static inline void clear_LDT(void)
 {
 	int cpu = get_cpu();
 
-	set_ldt_desc(cpu, &default_ldt[0], 5);
-	load_LDT_desc();
+	__set_ldt(cpu, DEFAULT_LDT, DEFAULT_LDT_SIZE);
 	put_cpu();
 }
 
@@ -138,12 +84,11 @@ static inline void load_LDT_nolock(mm_co
 	int count = pc->size;
 
 	if (likely(!count)) {
-		segments = &default_ldt[0];
-		count = 5;
+		segments = DEFAULT_LDT;
+		count = DEFAULT_LDT_SIZE;
 	}
 		
-	set_ldt_desc(cpu, segments, count);
-	load_LDT_desc();
+	__set_ldt(cpu, segments, count);
 }
 
 static inline void load_LDT(mm_context_t *pc)
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_desc.h
@@ -0,0 +1,67 @@
+#ifndef __ASM_MACH_DESC_H
+#define __ASM_MACH_DESC_H
+
+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
+
+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
+
+#define _set_tssldt_desc(n,addr,limit,type) \
+__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
+	"movw %w1,2(%2)\n\t" \
+	"rorl $16,%1\n\t" \
+	"movb %b1,4(%2)\n\t" \
+	"movb %4,5(%2)\n\t" \
+	"movb $0,6(%2)\n\t" \
+	"movb %h1,7(%2)\n\t" \
+	"rorl $16,%1" \
+	: "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
+
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
+{
+	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
+		offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
+}
+
+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
+{
+	_set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
+}
+
+#define DEFAULT_LDT &default_ldt[0]
+#define DEFAULT_LDT_SIZE 5
+static inline void __set_ldt(unsigned int cpu, void *addr, unsigned int size)
+{
+	set_ldt_desc(cpu, addr, size);
+	load_LDT_desc();
+}
+
+static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
+{
+	__u32 *lp = (__u32 *)((char *)ldt + entry*8);
+	*lp = entry_a;
+	*(lp+1) = entry_b;
+}
+
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+	C(0); C(1); C(2);
+#undef C
+}
+
+#endif /* __ASM_MACH_DESC_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_desc.h
@@ -0,0 +1,69 @@
+#ifndef __ASM_MACH_DESC_H
+#define __ASM_MACH_DESC_H
+
+extern struct trap_info xen_trap_table[];
+
+#define load_TR_desc()
+
+#define load_gdt(dtr) do {						\
+	struct Xgt_desc_struct *gdt_descr = (dtr);			\
+	unsigned long frames[16];					\
+	unsigned long va;						\
+	int f;								\
+									\
+	for (va = gdt_descr->address, f = 0;				\
+	     va < gdt_descr->address + gdt_descr->size;			\
+	     va += PAGE_SIZE, f++) {					\
+		frames[f] = virt_to_mfn(va);				\
+		make_lowmem_page_readonly(				\
+			(void *)va, XENFEAT_writable_descriptor_tables); \
+	}								\
+	if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))		\
+		BUG();							\
+} while (0)
+
+#define load_idt(dtr) HYPERVISOR_set_trap_table(xen_trap_table)
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
+
+#define set_tss_desc(cpu,addr)
+
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
+{
+}
+
+#define DEFAULT_LDT NULL
+#define DEFAULT_LDT_SIZE 0
+static inline void __set_ldt(unsigned int cpu, void *addr, unsigned int size)
+{
+	struct mmuext_op op;
+	op.cmd = MMUEXT_SET_LDT;
+	op.arg1.linear_addr = (unsigned long)addr;
+	op.arg2.nr_ents = size;
+	BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#define write_ldt_entry(ldt, entry, entry_a, entry_b) do {	\
+        __u32 *lp = (__u32 *)((char *)ldt + entry * 8);		\
+        maddr_t mach_lp = arbitrary_virt_to_machine(lp);	\
+        HYPERVISOR_update_descriptor(				\
+                mach_lp, (u64)entry_a | ((u64)entry_b<<32));	\
+} while (0)
+
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
+	C(0); C(1); C(2);
+#undef C
+}
+
+#endif /* __ASM_MACH_DESC_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 19/35] subarch support for control register accesses
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (17 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 20/35] subarch stack pointer update Chris Wright
                   ` (16 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-processor --]
[-- Type: text/plain, Size: 5527 bytes --]

Abstract the code that accesses the control registers, and
add a separate subarch implementation for Xen.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
TODO:
- better sharing to avoid unneeded duplication (i.e. read_cr4_safe, stts)

 include/asm-i386/mach-default/mach_system.h |   58 ++++++++++++++++++++++++++++
 include/asm-i386/mach-xen/mach_system.h     |   53 +++++++++++++++++++++++++
 include/asm-i386/system.h                   |   58 ----------------------------
 3 files changed, 111 insertions(+), 58 deletions(-)

--- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
+++ linus-2.6/include/asm-i386/mach-default/mach_system.h
@@ -3,6 +3,64 @@
 
 #define clearsegment(seg)
 
+/*
+ * Clear and set 'TS' bit respectively
+ */
+#define clts() __asm__ __volatile__ ("clts")
+#define read_cr0() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr0,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr0(x) \
+	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
+
+#define read_cr2() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr2,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr2(x) \
+	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
+
+#define read_cr3() ({ \
+	unsigned int __dummy; \
+	__asm__ ( \
+		"movl %%cr3,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr3(x) \
+	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
+
+#define read_cr4() ({ \
+	unsigned int __dummy; \
+	__asm__( \
+		"movl %%cr4,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+
+#define read_cr4_safe() ({			      \
+	unsigned int __dummy;			      \
+	/* This could fault if %cr4 does not exist */ \
+	__asm__("1: movl %%cr4, %0		\n"   \
+		"2:				\n"   \
+		".section __ex_table,\"a\"	\n"   \
+		".long 1b,2b			\n"   \
+		".previous			\n"   \
+		: "=r" (__dummy): "0" (0));	      \
+	__dummy;				      \
+})
+
+#define write_cr4(x) \
+	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
+#define stts() write_cr0(8 | read_cr0())
+
 /* interrupt control.. */
 #define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
 #define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
--- linus-2.6.orig/include/asm-i386/mach-xen/mach_system.h
+++ linus-2.6/include/asm-i386/mach-xen/mach_system.h
@@ -13,6 +13,59 @@
 #define __vcpu_id 0
 #endif
 
+/*
+ * Clear and set 'TS' bit respectively
+ */
+#define clts() __asm__ __volatile__ ("clts")
+#define read_cr0() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr0,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr0(x) \
+	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
+
+#define read_cr2() \
+	(HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2)
+#define write_cr2(x) \
+	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
+
+#define read_cr3() ({ \
+	unsigned int __dummy; \
+	__asm__ ( \
+		"movl %%cr3,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr3(x) \
+	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
+
+#define read_cr4() ({ \
+	unsigned int __dummy; \
+	__asm__( \
+		"movl %%cr4,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+
+#define read_cr4_safe() ({			      \
+	unsigned int __dummy;			      \
+	/* This could fault if %cr4 does not exist */ \
+	__asm__("1: movl %%cr4, %0		\n"   \
+		"2:				\n"   \
+		".section __ex_table,\"a\"	\n"   \
+		".long 1b,2b			\n"   \
+		".previous			\n"   \
+		: "=r" (__dummy): "0" (0));	      \
+	__dummy;				      \
+})
+
+#define write_cr4(x) \
+	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
+#define stts() write_cr0(8 | read_cr0())
+
 /* interrupt control.. */
 
 /*
--- linus-2.6.orig/include/asm-i386/system.h
+++ linus-2.6/include/asm-i386/system.h
@@ -83,64 +83,6 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
 #define savesegment(seg, value) \
 	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define read_cr0() ({ \
-	unsigned int __dummy; \
-	__asm__ __volatile__( \
-		"movl %%cr0,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr0(x) \
-	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
-
-#define read_cr2() ({ \
-	unsigned int __dummy; \
-	__asm__ __volatile__( \
-		"movl %%cr2,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr2(x) \
-	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
-
-#define read_cr3() ({ \
-	unsigned int __dummy; \
-	__asm__ ( \
-		"movl %%cr3,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-#define write_cr3(x) \
-	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
-
-#define read_cr4() ({ \
-	unsigned int __dummy; \
-	__asm__( \
-		"movl %%cr4,%0\n\t" \
-		:"=r" (__dummy)); \
-	__dummy; \
-})
-
-#define read_cr4_safe() ({			      \
-	unsigned int __dummy;			      \
-	/* This could fault if %cr4 does not exist */ \
-	__asm__("1: movl %%cr4, %0		\n"   \
-		"2:				\n"   \
-		".section __ex_table,\"a\"	\n"   \
-		".long 1b,2b			\n"   \
-		".previous			\n"   \
-		: "=r" (__dummy): "0" (0));	      \
-	__dummy;				      \
-})
-
-#define write_cr4(x) \
-	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
-#define stts() write_cr0(8 | read_cr0())
-
 #endif	/* __KERNEL__ */
 
 #define wbinvd() \

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 20/35] subarch stack pointer update
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (18 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 19/35] subarch support for control register accesses Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 21/35] subarch TLB support Chris Wright
                   ` (15 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-tss --]
[-- Type: text/plain, Size: 1607 bytes --]

Register the new kernel ('ring 0') stack pointer with the hypervisor
during context switch.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/mach-default/mach_processor.h |    7 +++++++
 include/asm-i386/mach-xen/mach_processor.h     |    8 ++++++++
 include/asm-i386/processor.h                   |    1 +
 3 files changed, 16 insertions(+)

--- linus-2.6.orig/include/asm-i386/mach-default/mach_processor.h
+++ linus-2.6/include/asm-i386/mach-default/mach_processor.h
@@ -4,4 +4,11 @@
 #define CPUID cpuid
 #define CPUID_STR "cpuid"
 
+#ifndef __ASSEMBLY__
+static inline void mach_update_kernel_stack(unsigned long esp0,
+					    unsigned short ss0)
+{
+}
+#endif
+
 #endif /* __ASM_MACH_PROCESSOR_H */
--- linus-2.6.orig/include/asm-i386/mach-xen/mach_processor.h
+++ linus-2.6/include/asm-i386/mach-xen/mach_processor.h
@@ -6,4 +6,12 @@
 #define CPUID XEN_CPUID
 #define CPUID_STR XEN_CPUID
 
+#ifndef __ASSEMBLY__
+static inline void mach_update_kernel_stack(unsigned long esp0,
+					    unsigned short ss0)
+{
+	HYPERVISOR_stack_switch(ss0, esp0);
+}
+#endif
+
 #endif /* __ASM_MACH_PROCESSOR_H */
--- linus-2.6.orig/include/asm-i386/processor.h
+++ linus-2.6/include/asm-i386/processor.h
@@ -500,6 +500,7 @@ static inline void load_esp0(struct tss_
 		tss->ss1 = thread->sysenter_cs;
 		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 	}
+	mach_update_kernel_stack(tss->esp0, tss->ss0);
 }
 
 #define start_thread(regs, new_eip, new_esp) do {		\

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 21/35] subarch TLB support
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (19 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 20/35] subarch stack pointer update Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen) Chris Wright
                   ` (14 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-tlbflush --]
[-- Type: text/plain, Size: 5196 bytes --]

Paravirtualize TLB flushes by using the flush interfaces provided by
the hypervisor. These hide the details of cross-CPU shootdowns and
allow significant optimisations (for example, by avoiding shooting
down on virtual CPUs that are descheduled). This is considerably
faster in most cases than performing virtual IPIs in the guest kernel.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/asm-i386/mach-default/mach_tlbflush.h |   59 ++++++++++++++++++++++++++
 include/asm-i386/mach-xen/mach_tlbflush.h     |   25 +++++++++++
 include/asm-i386/tlbflush.h                   |   55 ------------------------
 3 files changed, 85 insertions(+), 54 deletions(-)

--- linus-2.6.orig/include/asm-i386/tlbflush.h
+++ linus-2.6/include/asm-i386/tlbflush.h
@@ -5,64 +5,11 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
-#define __flush_tlb()							\
-	do {								\
-		unsigned int tmpreg;					\
-									\
-		__asm__ __volatile__(					\
-			"movl %%cr3, %0;              \n"		\
-			"movl %0, %%cr3;  # flush TLB \n"		\
-			: "=r" (tmpreg)					\
-			:: "memory");					\
-	} while (0)
-
-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-#define __flush_tlb_global()						\
-	do {								\
-		unsigned int tmpreg, cr4, cr4_orig;			\
-									\
-		__asm__ __volatile__(					\
-			"movl %%cr4, %2;  # turn off PGE     \n"	\
-			"movl %2, %1;                        \n"	\
-			"andl %3, %1;                        \n"	\
-			"movl %1, %%cr4;                     \n"	\
-			"movl %%cr3, %0;                     \n"	\
-			"movl %0, %%cr3;  # flush TLB        \n"	\
-			"movl %2, %%cr4;  # turn PGE back on \n"	\
-			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
-			: "i" (~X86_CR4_PGE)				\
-			: "memory");					\
-	} while (0)
-
 extern unsigned long pgkern_mask;
 
-# define __flush_tlb_all()						\
-	do {								\
-		if (cpu_has_pge)					\
-			__flush_tlb_global();				\
-		else							\
-			__flush_tlb();					\
-	} while (0)
-
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)
 
-#define __flush_tlb_single(addr) \
-	__asm__ __volatile__("invlpg %0": :"m" (*(char *) addr))
-
-#ifdef CONFIG_X86_INVLPG
-# define __flush_tlb_one(addr) __flush_tlb_single(addr)
-#else
-# define __flush_tlb_one(addr)						\
-	do {								\
-		if (cpu_has_invlpg)					\
-			__flush_tlb_single(addr);			\
-		else							\
-			__flush_tlb();					\
-	} while (0)
-#endif
+#include <mach_tlbflush.h>
 
 /*
  * TLB flushing:
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-default/mach_tlbflush.h
@@ -0,0 +1,59 @@
+#ifndef __ASM_MACH_TLBFLUSH_H
+#define __ASM_MACH_TLBFLUSH_H
+
+#define __flush_tlb()							\
+	do {								\
+		unsigned int tmpreg;					\
+									\
+		__asm__ __volatile__(					\
+			"movl %%cr3, %0;              \n"		\
+			"movl %0, %%cr3;  # flush TLB \n"		\
+			: "=r" (tmpreg)					\
+			:: "memory");					\
+	} while (0)
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+#define __flush_tlb_global()						\
+	do {								\
+		unsigned int tmpreg, cr4, cr4_orig;			\
+									\
+		__asm__ __volatile__(					\
+			"movl %%cr4, %2;  # turn off PGE     \n"	\
+			"movl %2, %1;                        \n"	\
+			"andl %3, %1;                        \n"	\
+			"movl %1, %%cr4;                     \n"	\
+			"movl %%cr3, %0;                     \n"	\
+			"movl %0, %%cr3;  # flush TLB        \n"	\
+			"movl %2, %%cr4;  # turn PGE back on \n"	\
+			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
+			: "i" (~X86_CR4_PGE)				\
+			: "memory");					\
+	} while (0)
+
+#define __flush_tlb_all()						\
+	do {								\
+		if (cpu_has_pge)					\
+			__flush_tlb_global();				\
+		else							\
+			__flush_tlb();					\
+	} while (0)
+
+#define __flush_tlb_single(addr) \
+	__asm__ __volatile__("invlpg %0": :"m" (*(char *) addr))
+
+#ifdef CONFIG_X86_INVLPG
+# define __flush_tlb_one(addr) __flush_tlb_single(addr)
+#else
+# define __flush_tlb_one(addr)						\
+	do {								\
+		if (cpu_has_invlpg)					\
+			__flush_tlb_single(addr);			\
+		else							\
+			__flush_tlb();					\
+	} while (0)
+#endif
+
+#endif /* __ASM_MACH_TLBFLUSH_H */
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/mach_tlbflush.h
@@ -0,0 +1,25 @@
+#ifndef __ASM_MACH_TLBFLUSH_H
+#define __ASM_MACH_TLBFLUSH_H
+
+static inline void xen_tlb_flush(void)
+{
+        struct mmuext_op op;
+        op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+static inline void xen_invlpg(unsigned long ptr)
+{
+        struct mmuext_op op;
+        op.cmd = MMUEXT_INVLPG_LOCAL;
+        op.arg1.linear_addr = ptr & PAGE_MASK;
+        BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#define __flush_tlb() xen_tlb_flush()
+#define __flush_tlb_global() xen_tlb_flush()
+#define __flush_tlb_all() xen_tlb_flush()
+#define __flush_tlb_single(addr) xen_invlpg(addr)
+#define __flush_tlb_one(addr) __flush_tlb_single(addr)
+
+#endif /* __ASM_MACH_TLBFLUSH_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 22/35] subarch support for idle loop (NO_IDLE_HZ for Xen)
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (20 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 21/35] subarch TLB support Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 13:21   ` Andi Kleen
  2006-05-09  7:00 ` [RFC PATCH 23/35] Increase x86 interrupt vector range Chris Wright
                   ` (13 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-idle --]
[-- Type: text/plain, Size: 1810 bytes --]

Paravirtualize the idle loop to explicitly trap to the hypervisor when
blocking, and to use the NO_IDLE_HZ functionality introduced by s390
to inform the rcu subsystem that the CPU is quiescent.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/xen/Kconfig                         |    8 ++++++++
 include/asm-i386/mach-xen/setup_arch_post.h |   24 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

--- linus-2.6.orig/drivers/xen/Kconfig
+++ linus-2.6/drivers/xen/Kconfig
@@ -12,6 +12,14 @@ config XEN
 
 if XEN
 
+config NO_IDLE_HZ
+	bool
+	default y
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps Xen to detect that the Linux system is idle, reducing
+	  the overhead of idle systems.
+
 config XEN_SHADOW_MODE
 	bool
 	default y
--- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_post.h
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_post.h
@@ -8,6 +8,11 @@
 
 #include <xen/interface/physdev.h>
 
+extern void stop_hz_timer(void);
+extern void start_hz_timer(void);
+
+void xen_idle(void);
+
 static char * __init machine_specific_memory_setup(void)
 {
 	unsigned long max_pfn = xen_start_info->nr_pages;
@@ -65,4 +70,23 @@ static void __init machine_specific_arch
 		console_use_vt = 0;
 		conswitchp = NULL;
 	}
+
+	pm_idle = xen_idle;
+}
+
+void xen_idle(void)
+{
+	local_irq_disable();
+
+	if (need_resched())
+		local_irq_enable();
+	else {
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+		stop_hz_timer();
+		/* Blocking includes an implicit local_irq_enable(). */
+		HYPERVISOR_sched_op(SCHEDOP_block, 0);
+		start_hz_timer();
+		set_thread_flag(TIF_POLLING_NRFLAG);
+	}
 }

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 23/35] Increase x86 interrupt vector range
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (21 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen) Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 24/35] Add support for Xen event channels Chris Wright
                   ` (12 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: x86-increase-interrupt-vector-range --]
[-- Type: text/plain, Size: 3302 bytes --]

Remove the limit of 256 interrupt vectors by changing the value
stored in orig_{e,r}ax from (vector - 256) to the bitwise complement
of the interrupt vector (~vector).
The orig_{e,r}ax needs to be < 0 to allow the signal code to
distinguish between return from interrupt and return from syscall.
With this change applied, NR_IRQS can be > 256.

Xen extends the IRQ numbering space to include room for dynamically
allocated virtual interrupts (in the range 256-511), which requires a
more permissive interface to do_IRQ.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/entry.S    |    4 ++--
 arch/i386/kernel/irq.c      |    4 ++--
 arch/x86_64/kernel/entry.S  |    2 +-
 arch/x86_64/kernel/irq.c    |    4 ++--
 arch/x86_64/kernel/smp.c    |    4 ++--
 include/asm-x86_64/hw_irq.h |    2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/entry.S
+++ linus-2.6/arch/i386/kernel/entry.S
@@ -464,7 +464,7 @@ vector=0
 ENTRY(irq_entries_start)
 .rept NR_IRQS
 	ALIGN
-1:	pushl $vector-256
+1:	pushl $~(vector)
 	jmp common_interrupt
 .data
 	.long 1b
@@ -481,7 +481,7 @@ common_interrupt:
 
 #define BUILD_INTERRUPT(name, nr)	\
 ENTRY(name)				\
-	pushl $nr-256;			\
+	pushl $~(nr);			\
 	SAVE_ALL			\
 	movl %esp,%eax;			\
 	call smp_/**/name;		\
--- linus-2.6.orig/arch/i386/kernel/irq.c
+++ linus-2.6/arch/i386/kernel/irq.c
@@ -53,8 +53,8 @@ static union irq_ctx *softirq_ctx[NR_CPU
  */
 fastcall unsigned int do_IRQ(struct pt_regs *regs)
 {	
-	/* high bits used in ret_from_ code */
-	int irq = regs->orig_eax & 0xff;
+	/* high bit used in ret_from_ code */
+	int irq = ~regs->orig_eax;
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
--- linus-2.6.orig/arch/x86_64/kernel/entry.S
+++ linus-2.6/arch/x86_64/kernel/entry.S
@@ -601,7 +601,7 @@ retint_kernel:	
  */		
 	.macro apicinterrupt num,func
 	INTR_FRAME
-	pushq $\num-256
+	pushq $~(\num)
 	CFI_ADJUST_CFA_OFFSET 8
 	interrupt \func
 	jmp ret_from_intr
--- linus-2.6.orig/arch/x86_64/kernel/irq.c
+++ linus-2.6/arch/x86_64/kernel/irq.c
@@ -91,8 +91,8 @@ skip:
  */
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 {	
-	/* high bits used in ret_from_ code  */
-	unsigned irq = regs->orig_rax & 0xff;
+	/* high bit used in ret_from_ code  */
+	unsigned irq = ~regs->orig_rax;
 
 	exit_idle();
 	irq_enter();
--- linus-2.6.orig/arch/x86_64/kernel/smp.c
+++ linus-2.6/arch/x86_64/kernel/smp.c
@@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt
 
 	cpu = smp_processor_id();
 	/*
-	 * orig_rax contains the interrupt vector - 256.
+	 * orig_rax contains the negated interrupt vector.
 	 * Use that to determine where the sender put the data.
 	 */
-	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
 	f = &per_cpu(flush_state, sender);
 
 	if (!cpu_isset(cpu, f->flush_cpumask))
--- linus-2.6.orig/include/asm-x86_64/hw_irq.h
+++ linus-2.6/include/asm-x86_64/hw_irq.h
@@ -127,7 +127,7 @@ asmlinkage void IRQ_NAME(nr); \
 __asm__( \
 "\n.p2align\n" \
 "IRQ" #nr "_interrupt:\n\t" \
-	"push $" #nr "-256 ; " \
+	"push $~(" #nr ") ; " \
 	"jmp common_interrupt");
 
 #if defined(CONFIG_X86_IO_APIC)

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 24/35] Add support for Xen event channels.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (22 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 23/35] Increase x86 interrupt vector range Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-12 21:41   ` Pavel Machek
  2006-05-13 12:27   ` Andrew Morton
  2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
                   ` (11 subsequent siblings)
  35 siblings, 2 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: evtchn --]
[-- Type: text/plain, Size: 32073 bytes --]

Support Xen event channels instead of the i8259 PIC.

Event channels are used to inject events into the kernel, either from
the hypervisor or from another VM.  The injected events are mapped to
interrupts.

If an event needs to be injected, the hypervisor causes an upcall into
the kernel.  The upcall handler then scans the event pending bitmap
and calls do_IRQ for each pending event.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/Makefile               |    6 
 drivers/xen/core/evtchn.c               |  887 ++++++++++++++++++++++++++++++++
 include/asm-i386/hw_irq.h               |    4 
 include/asm-i386/mach-xen/irq_vectors.h |  109 +++
 include/xen/evtchn.h                    |  116 ++++
 5 files changed, 1120 insertions(+), 2 deletions(-)

--- linus-2.6.orig/arch/i386/kernel/Makefile
+++ linus-2.6/arch/i386/kernel/Makefile
@@ -5,7 +5,7 @@
 extra-y := head.o init_task.o vmlinux.lds
 
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
-		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
+		ptrace.o time.o ioport.o ldt.o setup.o hw_irq.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o bootflag.o \
 		quirks.o i8237.o topology.o alternative.o
 
@@ -42,6 +42,10 @@ EXTRA_AFLAGS   := -traditional
 
 obj-$(CONFIG_SCx200)		+= scx200.o
 
+hw_irq-y			:= i8259.o
+
+hw_irq-$(CONFIG_XEN)		:= ../../../drivers/xen/core/evtchn.o
+
 # vsyscall.o contains the vsyscall DSO images as __initdata.
 # We must build both images before we can assemble it.
 # Note: kbuild does not track this dependency due to usage of .incbin
--- linus-2.6.orig/include/asm-i386/hw_irq.h
+++ linus-2.6/include/asm-i386/hw_irq.h
@@ -68,7 +68,9 @@ extern atomic_t irq_mis_count;
 
 #define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
 
-#if defined(CONFIG_X86_IO_APIC)
+#if defined(CONFIG_X86_XEN)
+extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i);
+#elif defined(CONFIG_X86_IO_APIC)
 static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
 {
 	if (IO_APIC_IRQ(i))
--- /dev/null
+++ linus-2.6/drivers/xen/core/evtchn.c
@@ -0,0 +1,887 @@
+/******************************************************************************
+ * evtchn.c
+ * 
+ * Communication via Xen event channels.
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/ptrace.h>
+#include <asm/synch_bitops.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/physdev.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <linux/mc146818rtc.h> /* RTC_IRQ */
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> event-channel mappings. */
+static int evtchn_to_irq[NR_EVENT_CHANNELS];
+
+/* Packed IRQ information: binding type, sub-type index, and event channel. */
+static u32 irq_info[NR_IRQS];
+
+/* Binding types. */
+enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+
+/* Constructor for packed IRQ information. */
+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+	return ((type << 24) | (index << 16) | evtchn);
+}
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND	mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+/*
+ * Accessors for packed IRQ information.
+ */
+
+static inline unsigned int evtchn_from_irq(int irq)
+{
+	return (u16)(irq_info[irq]);
+}
+
+static inline unsigned int index_from_irq(int irq)
+{
+	return (u8)(irq_info[irq] >> 16);
+}
+
+static inline unsigned int type_from_irq(int irq)
+{
+	return (u8)(irq_info[irq] >> 24);
+}
+
+/* IRQ <-> VIRQ mapping. */
+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]);
+
+/* IRQ <-> IPI mapping. */
+#ifndef NR_IPIS
+#define NR_IPIS 1
+#endif
+DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Bitmap, one bit per PIRQ: PIRQs that need Xen to be notified on unmask. */
+static unsigned long pirq_needs_unmask_notify[BITS_TO_LONGS(NR_PIRQS)];
+
+#ifdef CONFIG_SMP
+
+static u8 cpu_evtchn[NR_EVENT_CHANNELS];
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+					   struct shared_info *sh,
+					   unsigned int idx)
+{
+	return (sh->evtchn_pending[idx] &
+		cpu_evtchn_mask[cpu][idx] &
+		~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+	clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
+	set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+	cpu_evtchn[chn] = cpu;
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+	/* By default all event channels notify CPU#0. */
+	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+	memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return cpu_evtchn[evtchn];
+}
+
+#else
+
+static inline unsigned long active_evtchns(unsigned int cpu,
+					   struct shared_info *sh,
+					   unsigned int idx)
+{
+	return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]);
+}
+
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+}
+
+static void init_evtchn_cpu_bindings(void)
+{
+}
+
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+	return 0;
+}
+
+#endif
+
+/* Upcall to generic IRQ layer. */
+#ifdef CONFIG_X86
+extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
+#if defined (__i386__)
+static inline void exit_idle(void) {}
+#define IRQ_REG orig_eax
+#elif defined (__x86_64__)
+#include <asm/idle.h>
+#define IRQ_REG orig_rax
+#endif
+#define do_IRQ(irq, regs) do {		\
+	(regs)->IRQ_REG = ~(irq);	\
+	do_IRQ((regs));			\
+} while (0)
+#endif
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)	((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+	(void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+/* NB. Interrupts are disabled on entry. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
+{
+	unsigned long  l1, l2;
+	unsigned int   l1i, l2i, port;
+	int            irq, cpu = smp_processor_id();
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct vcpu_info *vcpu_info = &s->vcpu_info[cpu];
+
+	vcpu_info->evtchn_upcall_pending = 0;
+
+	/* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+	l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+	while (l1 != 0) {
+		l1i = __ffs(l1);
+		l1 &= ~(1UL << l1i);
+
+		while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
+			l2i = __ffs(l2);
+
+			port = (l1i * BITS_PER_LONG) + l2i;
+			if ((irq = evtchn_to_irq[port]) != -1)
+				do_IRQ(irq, regs);
+			else {
+				exit_idle();
+#ifdef CONFIG_XEN_EVTCHN_DEVICE
+				evtchn_device_upcall(port);
+#else
+				mask_evtchn(port);
+#endif
+			}
+		}
+	}
+}
+
+static int find_unbound_irq(void)
+{
+	int irq;
+
+	for (irq = 0; irq < NR_IRQS; irq++)
+		if (irq_bindcount[irq] == 0)
+			break;
+
+	if (irq == NR_IRQS) {
+		printk(KERN_ERR "No available IRQ to bind to: increase NR_IRQS!\n");
+		irq = -EINVAL;
+	}
+
+	return irq;
+}
+
+static int bind_evtchn_to_irq(unsigned int evtchn)
+{
+	int irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	if ((irq = evtchn_to_irq[evtchn]) == -1) {
+		irq = find_unbound_irq();
+		if (irq < 0)
+			goto out;
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+	}
+
+	irq_bindcount[irq]++;
+out:
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+/*
+ * Map VIRQ @virq on VCPU @cpu to an IRQ, asking Xen for an event channel the
+ * first time the pair is bound, and take a reference on that IRQ.
+ * Returns the IRQ, or the negative value from find_unbound_irq().
+ */
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_bind_virq };
+	int port, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(virq_to_irq, cpu)[virq];
+	if (irq == -1) {
+		irq = find_unbound_irq();
+		if (irq < 0) {
+			/* Nothing free: leave without touching any count. */
+			spin_unlock(&irq_mapping_update_lock);
+			return irq;
+		}
+
+		op.u.bind_virq.virq = virq;
+		op.u.bind_virq.vcpu = cpu;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+		port = op.u.bind_virq.port;
+
+		/* Record the port <-> IRQ mapping in both directions. */
+		evtchn_to_irq[port] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, port);
+
+		per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+		bind_evtchn_to_cpu(port, cpu);
+	}
+
+	irq_bindcount[irq]++;
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+/*
+ * Map IPI @ipi on VCPU @cpu to an IRQ, asking Xen for an event channel the
+ * first time the pair is bound, and take a reference on that IRQ.
+ * Returns the IRQ, or the negative value from find_unbound_irq().
+ */
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_bind_ipi };
+	int port, irq;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	irq = per_cpu(ipi_to_irq, cpu)[ipi];
+	if (irq == -1) {
+		irq = find_unbound_irq();
+		if (irq < 0) {
+			/* Nothing free: leave without touching any count. */
+			spin_unlock(&irq_mapping_update_lock);
+			return irq;
+		}
+
+		op.u.bind_ipi.vcpu = cpu;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+		port = op.u.bind_ipi.port;
+
+		/* Record the port <-> IRQ mapping in both directions. */
+		evtchn_to_irq[port] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, port);
+
+		per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+		bind_evtchn_to_cpu(port, cpu);
+	}
+
+	irq_bindcount[irq]++;
+	spin_unlock(&irq_mapping_update_lock);
+
+	return irq;
+}
+
+/*
+ * Drop one reference on @irq. When the bind count reaches zero and the IRQ
+ * still has a valid event channel, close the channel and erase every mapping
+ * that refers to it.
+ */
+static void unbind_from_irq(unsigned int irq)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_close };
+	/* NOTE(review): sampled before the lock is taken -- confirm this is safe. */
+	int evtchn = evtchn_from_irq(irq);
+
+	spin_lock(&irq_mapping_update_lock);
+
+	/* The count is decremented even when the channel is not valid. */
+	if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) {
+		op.u.close.port = evtchn;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+
+		/* VIRQ and IPI bindings also have a per-CPU reverse lookup. */
+		switch (type_from_irq(irq)) {
+		case IRQT_VIRQ:
+			per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+				[index_from_irq(irq)] = -1;
+			break;
+		case IRQT_IPI:
+			per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
+				[index_from_irq(irq)] = -1;
+			break;
+		default:
+			break;
+		}
+
+		/* Closed ports are implicitly re-bound to VCPU0. */
+		bind_evtchn_to_cpu(evtchn, 0);
+
+		/* Finally forget the port <-> IRQ association itself. */
+		evtchn_to_irq[evtchn] = -1;
+		irq_info[irq] = IRQ_UNBOUND;
+	}
+
+	spin_unlock(&irq_mapping_update_lock);
+}
+
+/*
+ * Bind event channel @evtchn to an IRQ and install @handler on it with
+ * request_irq().
+ *
+ * Returns the bound IRQ on success. On failure returns either the negative
+ * value from bind_evtchn_to_irq() or the non-zero result of request_irq();
+ * in the latter case the binding is released again before returning.
+ */
+int bind_evtchn_to_irqhandler(
+	unsigned int evtchn,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id)
+{
+	/*
+	 * Must be signed: bind_evtchn_to_irq() reports failure as a negative
+	 * value, and an unsigned variable can never compare below zero, so the
+	 * error would fall through to request_irq()/unbind_from_irq().
+	 */
+	int irq;
+	int retval;
+
+	irq = bind_evtchn_to_irq(evtchn);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+/*
+ * Bind VIRQ @virq on VCPU @cpu to an IRQ and install @handler on it with
+ * request_irq().
+ *
+ * Returns the bound IRQ on success. On failure returns either the negative
+ * value from bind_virq_to_irq() or the non-zero result of request_irq();
+ * in the latter case the binding is released again before returning.
+ */
+int bind_virq_to_irqhandler(
+	unsigned int virq,
+	unsigned int cpu,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id)
+{
+	/*
+	 * Must be signed: bind_virq_to_irq() reports failure as a negative
+	 * value, and an unsigned variable can never compare below zero, so the
+	 * error would fall through to request_irq()/unbind_from_irq().
+	 */
+	int irq;
+	int retval;
+
+	irq = bind_virq_to_irq(virq, cpu);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+/*
+ * Bind IPI @ipi on VCPU @cpu to an IRQ and install @handler on it with
+ * request_irq().
+ *
+ * Returns the bound IRQ on success. On failure returns either the negative
+ * value from bind_ipi_to_irq() or the non-zero result of request_irq();
+ * in the latter case the binding is released again before returning.
+ */
+int bind_ipi_to_irqhandler(
+	unsigned int ipi,
+	unsigned int cpu,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id)
+{
+	/*
+	 * Must be signed: bind_ipi_to_irq() reports failure as a negative
+	 * value, and an unsigned variable can never compare below zero, so the
+	 * error would fall through to request_irq()/unbind_from_irq().
+	 */
+	int irq;
+	int retval;
+
+	irq = bind_ipi_to_irq(ipi, cpu);
+	if (irq < 0)
+		return irq;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if (retval != 0) {
+		unbind_from_irq(irq);
+		return retval;
+	}
+
+	return irq;
+}
+EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler);
+
+/*
+ * Undo a bind_*_to_irqhandler() call: remove the handler registered for
+ * @dev_id with free_irq(), then drop the binding reference on @irq.
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+	free_irq(irq, dev_id);
+	unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+#ifdef CONFIG_SMP
+/* Empty callback used as the NOP IPI payload in rebind_irq_to_cpu(). */
+static void do_nothing_function(void *ign)
+{
+}
+#endif
+
+/*
+ * Rebind an evtchn so that it gets delivered to a specific cpu.
+ *
+ * @irq:  IRQ whose event channel should be moved.
+ * @tcpu: target VCPU.
+ *
+ * Does nothing if @irq currently has no valid event channel.
+ */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_bind_vcpu };
+	int evtchn;
+
+	spin_lock(&irq_mapping_update_lock);
+
+	evtchn = evtchn_from_irq(irq);
+	if (!VALID_EVTCHN(evtchn)) {
+		spin_unlock(&irq_mapping_update_lock);
+		return;
+	}
+
+	/* Send future instances of this interrupt to other vcpu. */
+	op.u.bind_vcpu.port = evtchn;
+	op.u.bind_vcpu.vcpu = tcpu;
+
+	/*
+	 * If this fails, it usually just indicates that we're dealing with a
+	 * virq or IPI channel, which don't actually need to be rebound. Ignore
+	 * it, but don't do the xenlinux-level rebind in that case.
+	 */
+	if (HYPERVISOR_event_channel_op(&op) >= 0)
+		bind_evtchn_to_cpu(evtchn, tcpu);
+
+	spin_unlock(&irq_mapping_update_lock);
+
+	/*
+	 * Now send the new target processor a NOP IPI. When this returns, it
+	 * will check for any pending interrupts, and so service any that got
+	 * delivered to the wrong processor by mistake.
+	 *
+	 * XXX: The only time this is called with interrupts disabled is from
+	 * the hotplug/hotunplug path. In that case, all cpus are stopped with
+	 * interrupts disabled, and the missed interrupts will be picked up
+	 * when they start again. This is kind of a hack.
+	 */
+	if (!irqs_disabled())
+		smp_call_function(do_nothing_function, NULL, 0, 0);
+}
+
+
+/* Affinity hook: steer @irq to the first CPU present in @dest. */
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+	rebind_irq_to_cpu(irq, first_cpu(dest));
+}
+
+/*
+ * Interface to generic handling in irq.c
+ */
+
+/* Startup hook: unmask the IRQ's event channel if it has one. Returns 0. */
+static unsigned int startup_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return 0;
+
+	unmask_evtchn(port);
+	return 0;
+}
+
+/* Shutdown hook: mask the IRQ's event channel if it has one. */
+static void shutdown_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	mask_evtchn(port);
+}
+
+/* Enable hook: unmask the IRQ's event channel if it has one. */
+static void enable_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	unmask_evtchn(port);
+}
+
+/* Disable hook: mask the IRQ's event channel if it has one. */
+static void disable_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	mask_evtchn(port);
+}
+
+/* Ack hook: mask the event channel, then clear its pending bit. */
+static void ack_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	mask_evtchn(port);
+	clear_evtchn(port);
+}
+
+/* End hook: unmask the event channel unless the IRQ is marked disabled. */
+static void end_dynirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+	if (irq_desc[irq].status & IRQ_DISABLED)
+		return;
+
+	unmask_evtchn(port);
+}
+
+/*
+ * Interrupt-type descriptor that init_IRQ() installs on the dynamic IRQ range.
+ * NOTE(review): the initialisers are positional, so their order must match
+ * the field order of struct hw_interrupt_type (declared elsewhere) -- confirm.
+ */
+static struct hw_interrupt_type dynirq_type = {
+	"Dynamic-irq",
+	startup_dynirq,
+	shutdown_dynirq,
+	enable_dynirq,
+	disable_dynirq,
+	ack_dynirq,
+	end_dynirq,
+	set_affinity_irq
+};
+
+/*
+ * Issue PHYSDEVOP_IRQ_UNMASK_NOTIFY if @pirq is flagged in
+ * pirq_needs_unmask_notify; otherwise do nothing.
+ */
+static inline void pirq_unmask_notify(int pirq)
+{
+	struct physdev_op op;
+
+	if (likely(!test_bit(pirq, &pirq_needs_unmask_notify[0])))
+		return;
+
+	op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY;
+	(void)HYPERVISOR_physdev_op(&op);
+}
+
+/*
+ * Ask Xen for the status of @pirq and record in pirq_needs_unmask_notify
+ * whether it reports PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY.
+ */
+static inline void pirq_query_unmask(int pirq)
+{
+	struct physdev_op op;
+	op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY;
+	op.u.irq_status_query.irq = pirq;
+	/*
+	 * NOTE(review): the hypercall result is discarded and 'op' is not
+	 * pre-initialised, so 'flags' below is only meaningful if the call
+	 * filled it in -- confirm the call cannot fail here.
+	 */
+	(void)HYPERVISOR_physdev_op(&op);
+	/* Start from "no notify needed", then set the bit if Xen asks for it. */
+	clear_bit(pirq, &pirq_needs_unmask_notify[0]);
+	if (op.u.irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY)
+		set_bit(pirq, &pirq_needs_unmask_notify[0]);
+}
+
+/*
+ * On startup, if there is no action associated with the IRQ then we are
+ * probing. In this case we should not share with others as it will confuse us.
+ */
+#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
+
+/*
+ * Startup hook for physical IRQs: bind @irq to an event channel via
+ * EVTCHNOP_bind_pirq if it is not bound already, then unmask it.
+ * Always returns 0, including when the bind hypercall fails.
+ */
+static unsigned int startup_pirq(unsigned int irq)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_bind_pirq };
+	int evtchn = evtchn_from_irq(irq);
+
+	/* Already bound: only the unmask step is needed. */
+	if (VALID_EVTCHN(evtchn))
+		goto out;
+
+	op.u.bind_pirq.pirq  = irq;
+	/* NB. We are happy to share unless we are probing. */
+	op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
+	if (HYPERVISOR_event_channel_op(&op) != 0) {
+		/* Stay quiet while probing; otherwise report the failure. */
+		if (!probing_irq(irq))
+			printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
+			       irq);
+		return 0;
+	}
+	evtchn = op.u.bind_pirq.port;
+
+	/* Refresh the cached "needs unmask notify" state for this PIRQ. */
+	pirq_query_unmask(irq_to_pirq(irq));
+
+	/* Record the new binding: delivered to VCPU 0, mapped both ways. */
+	bind_evtchn_to_cpu(evtchn, 0);
+	evtchn_to_irq[evtchn] = irq;
+	irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn);
+
+ out:
+	unmask_evtchn(evtchn);
+	pirq_unmask_notify(irq_to_pirq(irq));
+
+	return 0;
+}
+
+/*
+ * Shutdown hook for physical IRQs: mask and close the IRQ's event channel,
+ * then drop the port <-> IRQ mappings. No-op if the IRQ has no valid channel.
+ */
+static void shutdown_pirq(unsigned int irq)
+{
+	struct evtchn_op op = { .cmd = EVTCHNOP_close };
+	int port = evtchn_from_irq(irq);
+
+	if (VALID_EVTCHN(port)) {
+		mask_evtchn(port);
+
+		op.u.close.port = port;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+
+		bind_evtchn_to_cpu(port, 0);
+		evtchn_to_irq[port] = -1;
+		irq_info[irq] = IRQ_UNBOUND;
+	}
+}
+
+/* Enable hook: unmask the event channel and send any required notify. */
+static void enable_pirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	unmask_evtchn(port);
+	pirq_unmask_notify(irq_to_pirq(irq));
+}
+
+/* Disable hook: mask the IRQ's event channel if it has one. */
+static void disable_pirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	mask_evtchn(port);
+}
+
+/* Ack hook: mask the event channel, then clear its pending bit. */
+static void ack_pirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	mask_evtchn(port);
+	clear_evtchn(port);
+}
+
+/*
+ * End hook: unless the IRQ is marked disabled, unmask its event channel and
+ * send any required unmask notify.
+ */
+static void end_pirq(unsigned int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+	if (irq_desc[irq].status & IRQ_DISABLED)
+		return;
+
+	unmask_evtchn(port);
+	pirq_unmask_notify(irq_to_pirq(irq));
+}
+
+/*
+ * Interrupt-type descriptor that init_IRQ() installs on the physical IRQ
+ * range.
+ * NOTE(review): the initialisers are positional, so their order must match
+ * the field order of struct hw_interrupt_type (declared elsewhere) -- confirm.
+ */
+static struct hw_interrupt_type pirq_type = {
+	"Phys-irq",
+	startup_pirq,
+	shutdown_pirq,
+	enable_pirq,
+	disable_pirq,
+	ack_pirq,
+	end_pirq,
+	set_affinity_irq
+};
+
+/*
+ * Re-raise IRQ @i by setting the pending bit of its event channel. The
+ * channel must currently be masked (enforced with BUG_ON). No-op if the IRQ
+ * has no valid channel.
+ */
+void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
+{
+	struct shared_info *shared = HYPERVISOR_shared_info;
+	int port = evtchn_from_irq(i);
+
+	if (VALID_EVTCHN(port)) {
+		BUG_ON(!test_bit(port, &shared->evtchn_mask[0]));
+		synch_set_bit(port, &shared->evtchn_pending[0]);
+	}
+}
+
+/*
+ * Notify the remote end of the event channel bound to @irq. Silently does
+ * nothing when the IRQ has no valid channel.
+ */
+void notify_remote_via_irq(int irq)
+{
+	int port = evtchn_from_irq(irq);
+
+	if (!VALID_EVTCHN(port))
+		return;
+
+	notify_remote_via_evtchn(port);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+/* Mask event channel @port by setting its bit in the shared mask array. */
+void mask_evtchn(int port)
+{
+	synch_set_bit(port, &HYPERVISOR_shared_info->evtchn_mask[0]);
+}
+EXPORT_SYMBOL_GPL(mask_evtchn);
+
+/*
+ * Unmask event channel @port. Ports bound to another CPU are unmasked with a
+ * hypercall; local ports are unmasked directly in shared memory, and an
+ * upcall is forced if an event was already pending on the port.
+ */
+void unmask_evtchn(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	unsigned int cpu = smp_processor_id();
+	struct vcpu_info *vcpu_info = &s->vcpu_info[cpu];
+
+	/* Slow path (hypercall) if this is a non-local port. */
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_op op = { .cmd = EVTCHNOP_unmask,
+				   .u.unmask.port = port };
+		(void)HYPERVISOR_event_channel_op(&op);
+		return;
+	}
+
+	synch_clear_bit(port, &s->evtchn_mask[0]);
+
+	/*
+	 * The following is basically the equivalent of 'hw_resend_irq'. Just
+	 * like a real IO-APIC we 'lose the interrupt edge' if the channel is
+	 * masked.
+	 */
+	/*
+	 * If the port is pending and its selector bit was not already set,
+	 * flag an upcall and trigger it now unless upcalls are masked.
+	 */
+	if (test_bit(port, &s->evtchn_pending[0]) &&
+	    !synch_test_and_set_bit(port / BITS_PER_LONG,
+				    &vcpu_info->evtchn_pending_sel)) {
+		vcpu_info->evtchn_upcall_pending = 1;
+		if (!vcpu_info->evtchn_upcall_mask)
+			force_evtchn_callback();
+	}
+}
+EXPORT_SYMBOL_GPL(unmask_evtchn);
+
+/*
+ * Rebuild the IRQ <-> event-channel state: mask every channel, discard all
+ * existing channel bindings, then obtain fresh channels from Xen for the
+ * VIRQs and IPIs bound on CPU 0 and unmask them.
+ *
+ * BUGs if any PIRQ is still bound or if a secondary CPU still has a VIRQ or
+ * IPI binding.
+ */
+void irq_resume(void)
+{
+	struct evtchn_op op;
+	int cpu, pirq, virq, ipi, irq, evtchn;
+
+	init_evtchn_cpu_bindings();
+
+	/* New event-channel space is not 'live' yet. */
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+		mask_evtchn(evtchn);
+
+	/* Check that no PIRQs are still bound. */
+	for (pirq = 0; pirq < NR_PIRQS; pirq++)
+		BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND);
+
+	/* Secondary CPUs must have no VIRQ or IPI bindings. */
+	for (cpu = 1; cpu < NR_CPUS; cpu++) {
+		for (virq = 0; virq < NR_VIRQS; virq++)
+			BUG_ON(per_cpu(virq_to_irq, cpu)[virq] != -1);
+		for (ipi = 0; ipi < NR_IPIS; ipi++)
+			BUG_ON(per_cpu(ipi_to_irq, cpu)[ipi] != -1);
+	}
+
+	/* No IRQ <-> event-channel mappings. */
+	for (irq = 0; irq < NR_IRQS; irq++)
+		irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+		evtchn_to_irq[evtchn] = -1;
+
+	/* Primary CPU: rebind VIRQs automatically. */
+	for (virq = 0; virq < NR_VIRQS; virq++) {
+		if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1)
+			continue;
+
+		/* After the zap above the info word must carry channel 0. */
+		BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0));
+
+		/* Get a new binding from Xen. */
+		memset(&op, 0, sizeof(op));
+		op.cmd              = EVTCHNOP_bind_virq;
+		op.u.bind_virq.virq = virq;
+		op.u.bind_virq.vcpu = 0;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+		evtchn = op.u.bind_virq.port;
+
+		/* Record the new mapping. */
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+		/* Ready for use. */
+		unmask_evtchn(evtchn);
+	}
+
+	/* Primary CPU: rebind IPIs automatically. */
+	for (ipi = 0; ipi < NR_IPIS; ipi++) {
+		if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1)
+			continue;
+
+		/* After the zap above the info word must carry channel 0. */
+		BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0));
+
+		/* Get a new binding from Xen. */
+		memset(&op, 0, sizeof(op));
+		op.cmd = EVTCHNOP_bind_ipi;
+		op.u.bind_ipi.vcpu = 0;
+		BUG_ON(HYPERVISOR_event_channel_op(&op) != 0);
+		evtchn = op.u.bind_ipi.port;
+
+		/* Record the new mapping. */
+		evtchn_to_irq[evtchn] = irq;
+		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+		/* Ready for use. */
+		unmask_evtchn(evtchn);
+	}
+}
+
+/*
+ * Intentionally empty.
+ * NOTE(review): presumably kept so that callers of the native 8259A setup
+ * routine still link in a Xen build -- confirm.
+ */
+void init_8259A(int auto_eoi)
+{
+}
+
+/*
+ * Intentionally empty.
+ * NOTE(review): presumably kept so that callers of the native ISA IRQ setup
+ * routine still link in a Xen build -- confirm.
+ */
+void __init init_ISA_irqs (void)
+{
+}
+
+/*
+ * Boot-time IRQ setup: reset every IRQ/event-channel mapping table to its
+ * unbound state, mask all event channels, and install the dynirq_type and
+ * pirq_type descriptors on their respective IRQ ranges.
+ */
+void __init init_IRQ(void)
+{
+	int i;
+	int cpu;
+
+	irq_ctx_init(0);
+
+	spin_lock_init(&irq_mapping_update_lock);
+
+	init_evtchn_cpu_bindings();
+
+	/* No VIRQ or IPI bindings. */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		for (i = 0; i < NR_VIRQS; i++)
+			per_cpu(virq_to_irq, cpu)[i] = -1;
+		for (i = 0; i < NR_IPIS; i++)
+			per_cpu(ipi_to_irq, cpu)[i] = -1;
+	}
+
+	/* No event-channel -> IRQ mappings. */
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		evtchn_to_irq[i] = -1;
+		mask_evtchn(i); /* No event channels are 'live' right now. */
+	}
+
+	/* No IRQ -> event-channel mappings. */
+	for (i = 0; i < NR_IRQS; i++)
+		irq_info[i] = IRQ_UNBOUND;
+
+	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+	for (i = 0; i < NR_DYNIRQS; i++) {
+		irq_bindcount[dynirq_to_irq(i)] = 0;
+
+		irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
+		irq_desc[dynirq_to_irq(i)].action  = NULL;
+		irq_desc[dynirq_to_irq(i)].depth   = 1;
+		irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
+	}
+
+	/* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
+	for (i = 0; i < NR_PIRQS; i++) {
+		/* A count of 1 keeps find_unbound_irq() from handing these out. */
+		irq_bindcount[pirq_to_irq(i)] = 1;
+
+#ifdef RTC_IRQ
+		/* If not domain 0, force our RTC driver to fail its probe. */
+		/* The RTC IRQ's descriptor is left untouched in that case. */
+		if ((i == RTC_IRQ) &&
+		    !(xen_start_info->flags & SIF_INITDOMAIN))
+			continue;
+#endif
+
+		irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
+		irq_desc[pirq_to_irq(i)].action  = NULL;
+		irq_desc[pirq_to_irq(i)].depth   = 1;
+		irq_desc[pirq_to_irq(i)].handler = &pirq_type;
+	}
+}
--- /dev/null
+++ linus-2.6/include/asm-i386/mach-xen/irq_vectors.h
@@ -0,0 +1,109 @@
+/*
+ * This file should contain #defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ *	FIRST_EXTERNAL_VECTOR:
+ *		The first free place for external interrupts
+ *
+ *	SYSCALL_VECTOR:
+ *		The IRQ vector a syscall makes the user to kernel transition
+ *		under.
+ *
+ *	TIMER_IRQ:
+ *		The IRQ number the timer interrupt comes in at.
+ *
+ *	NR_IRQS:
+ *		The total number of interrupt vectors (including all the
+ *		architecture specific interrupts) needed.
+ *
+ */			
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR	0x20
+
+#define SYSCALL_VECTOR		0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR	0xff
+#define ERROR_APIC_VECTOR	0xfe
+
+#define THERMAL_APIC_VECTOR	0xf0
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR	0x31
+#define FIRST_SYSTEM_VECTOR	0xef
+
+/*
+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
+ * Right now the APIC is mostly only used for SMP.
+ * 256 vectors is an architectural limit. (we can have
+ * more than 256 devices theoretically, but they will
+ * have to use shared interrupts)
+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
+ * the usable vector space is 0x20-0xff (224 vectors)
+ */
+
+#define RESCHEDULE_VECTOR	0
+#define CALL_FUNCTION_VECTOR	1
+#define NR_IPIS			2
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FPU_IRQ			13
+
+#define	FIRST_VM86_IRQ		3
+#define LAST_VM86_IRQ		15
+#define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
+
+/*
+ * The flat IRQ space is divided into two regions:
+ *  1. A one-to-one mapping of real physical IRQs. This space is only used
+ *     if we have physical device-access privilege. This region is at the 
+ *     start of the IRQ space so that existing device drivers do not need
+ *     to be modified to translate physical IRQ numbers into our IRQ space.
+ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ *     are bound using the provided bind/unbind functions.
+ */
+
+#define PIRQ_BASE		0
+#define PIRQ_BITS		8
+#define NR_PIRQS		(1 << PIRQ_BITS)
+
+#define DYNIRQ_BASE		(PIRQ_BASE + NR_PIRQS)
+#define DYNIRQ_BITS		8
+#define NR_DYNIRQS		(1 << DYNIRQ_BITS)
+
+#define NR_IRQS			(NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQ_VECTORS		NR_IRQS
+
+#define pirq_to_irq(_x)		((_x) + PIRQ_BASE)
+#define irq_to_pirq(_x)		((_x) - PIRQ_BASE)
+
+#define dynirq_to_irq(_x)	((_x) + DYNIRQ_BASE)
+#define irq_to_dynirq(_x)	((_x) - DYNIRQ_BASE)
+
+#endif /* _ASM_IRQ_VECTORS_H */
--- /dev/null
+++ linus-2.6/include/xen/evtchn.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * evtchn.h
+ * 
+ * Communication via Xen event channels.
+ * Also definitions for the device that demuxes notifications to userspace.
+ * 
+ * Copyright (c) 2004-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_EVTCHN_H__
+#define __ASM_EVTCHN_H__
+
+#include <linux/config.h>
+#include <linux/interrupt.h>
+#include <asm/hypervisor.h>
+#include <asm/ptrace.h>
+#include <asm/synch_bitops.h>
+#include <xen/interface/event_channel.h>
+#include <linux/smp.h>
+
+/*
+ * LOW-LEVEL DEFINITIONS
+ */
+
+/*
+ * Dynamically bind an event source to an IRQ-like callback handler.
+ * On some platforms this may not be implemented via the Linux IRQ subsystem.
+ * The IRQ argument passed to the callback handler is the same as returned
+ * from the bind call. It may not correspond to a Linux IRQ number.
+ * Returns IRQ or negative errno.
+ * UNBIND: Takes IRQ to unbind from; automatically closes the event channel.
+ */
+extern int bind_evtchn_to_irqhandler(
+	unsigned int evtchn,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id);
+extern int bind_virq_to_irqhandler(
+	unsigned int virq,
+	unsigned int cpu,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id);
+extern int bind_ipi_to_irqhandler(
+	unsigned int ipi,
+	unsigned int cpu,
+	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	unsigned long irqflags,
+	const char *devname,
+	void *dev_id);
+
+/*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (even for bindings
+ * made with bind_evtchn_to_irqhandler()).
+ */
+extern void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+extern void irq_resume(void);
+
+/* Entry point for notifications into Linux subsystems. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
+
+/* Entry point for notifications into the userland character device. */
+extern void evtchn_device_upcall(int port);
+
+extern void mask_evtchn(int port);
+extern void unmask_evtchn(int port);
+
+/* Clear the pending bit of event channel @port in the shared-info page. */
+static inline void clear_evtchn(int port)
+{
+	synch_clear_bit(port, &HYPERVISOR_shared_info->evtchn_pending[0]);
+}
+
+/*
+ * Send a notification on event channel @port using EVTCHNOP_send. The
+ * hypercall result is deliberately ignored.
+ */
+static inline void notify_remote_via_evtchn(int port)
+{
+	struct evtchn_op op;
+
+	/* Terminate with ';' -- the original ',' only worked as a comma operator. */
+	op.cmd         = EVTCHNOP_send;
+	op.u.send.port = port;
+	(void)HYPERVISOR_event_channel_op(&op);
+}
+
+/*
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently dropped.
+ */
+extern void notify_remote_via_irq(int irq);
+
+#endif /* __ASM_EVTCHN_H__ */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (23 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 24/35] Add support for Xen event channels Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 16:23   ` Daniel Walker
                     ` (2 more replies)
  2006-05-09  7:00 ` [RFC PATCH 26/35] Add Xen subarch reboot support Chris Wright
                   ` (10 subsequent siblings)
  35 siblings, 3 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: time --]
[-- Type: text/plain, Size: 29162 bytes --]

Add support for Xen time abstractions. To avoid expensive traps into
the hypervisor, the passage of time is extrapolated from the local TSC
and a set of timestamps and scaling factors exported to the guest via
shared memory. Xen also provides a periodic interrupt facility which
is used to drive updates of xtime and jiffies, and perform the usual
process accounting and profiling.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/Makefile |    6 
 drivers/xen/core/time.c   | 1045 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1050 insertions(+), 1 deletion(-)

--- linus-2.6.orig/arch/i386/kernel/Makefile
+++ linus-2.6/arch/i386/kernel/Makefile
@@ -9,8 +9,11 @@ obj-y	:= process.o semaphore.o signal.o 
 		pci-dma.o i386_ksyms.o i387.o bootflag.o \
 		quirks.o i8237.o topology.o alternative.o
 
+timers-y			:= timers/
+timers-$(CONFIG_XEN)		:=
+
 obj-y				+= cpu/
-obj-y				+= timers/
+obj-y				+= $(timers-y)
 obj-y				+= acpi/
 obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca.o
@@ -45,6 +48,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 hw_irq-y			:= i8259.o
 
 hw_irq-$(CONFIG_XEN)		:= ../../../drivers/xen/core/evtchn.o
+time-$(CONFIG_XEN)		:= ../../../drivers/xen/core/time.o
 
 # vsyscall.o contains the vsyscall DSO images as __initdata.
 # We must build both images before we can assemble it.
--- /dev/null
+++ linus-2.6/drivers/xen/core/time.c
@@ -0,0 +1,1045 @@
+/*
+ *  time.c
+ *
+ *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02    Alan Modra
+ *	fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26    Markus Kuhn
+ *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ *      precision CMOS clock update
+ * 1996-05-03    Ingo Molnar
+ *      fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10	Updated NTP code according to technical memorandum Jan '96
+ *		"A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05    (Various)
+ *	More robust do_fast_gettimeoffset() algorithm implemented
+ *	(works with APM, Cyrix 6x86MX and Centaur C6),
+ *	monotonic gettimeofday() with fast_get_timeoffset(),
+ *	drift-proof precision TSC calibration on boot
+ *	(C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
+ *	Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
+ *	ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
+ * 1998-12-16    Andrea Arcangeli
+ *	Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
+ *	because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
+ *	Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *	serialize accesses to xtime/lost_ticks).
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/efi.h>
+#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/msr.h>
+#include <asm/delay.h>
+#include <asm/mpspec.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+#include <asm/sections.h>
+
+#include "mach_time.h"
+
+#include <linux/timex.h>
+#include <linux/config.h>
+
+#include <asm/hpet.h>
+
+#include <asm/arch_hooks.h>
+
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+
+#if defined (__i386__)
+#include <asm/i8259.h>
+#endif
+
+int pit_latch_buggy;              /* extern */
+
+#if defined(__x86_64__)
+unsigned long vxtime_hz = PIT_TICK_RATE;
+struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+#endif
+
+unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
+EXPORT_SYMBOL(cpu_khz);
+
+extern unsigned long wall_jiffies;
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+#if defined (__i386__)
+#include <asm/i8253.h>
+#endif
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+extern struct init_timer_opts timer_tsc_init;
+extern struct timer_opts timer_tsc;
+#define timer_none timer_tsc
+struct timer_opts *cur_timer __read_mostly = &timer_tsc;
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+	u32 tsc_to_nsec_mul;   /* Copy of Xen's tsc_to_system_mul.  */
+	u32 tsc_to_usec_mul;   /* tsc_to_nsec_mul / 1000.           */
+	int tsc_shift;         /* Shift applied before multiplying. */
+	u32 version;           /* Xen's version when snapshotted.   */
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timespec shadow_tv;
+static u32 shadow_tv_version;
+
+/*
+ * Keep track of last time we did processing/updating of jiffies and xtime.
+ * Two counters share this name: a single file-wide value and a per-CPU one
+ * declared through DEFINE_PER_CPU.
+ */
+static u64 processed_system_time;   /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
+#define NS_PER_TICK (1000000000LL/HZ)
+
+/*
+ * Bring *nsec into the range [0, NSEC_PER_SEC) by moving whole seconds
+ * between it and *sec.
+ */
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+	/* Too many nanoseconds: carry the excess into seconds. */
+	for (; *nsec >= NSEC_PER_SEC; (*sec)++)
+		*nsec -= NSEC_PER_SEC;
+
+	/* Negative nanoseconds: borrow from seconds. */
+	for (; *nsec < 0; (*sec)--)
+		*nsec += NSEC_PER_SEC;
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+/*
+ * Handler for the "independent_wallclock" boot option: the mere presence of
+ * the option sets the flag; @str is not examined.
+ */
+static int __init __independent_wallclock(char *str)
+{
+	independent_wallclock = 1;
+	return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+
+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+/*
+ * Handler for the "permitted_clock_jitter=" boot option: parses @str with
+ * simple_strtoul() (base 0, i.e. auto-detected) and stores the result.
+ */
+static int __init __permitted_clock_jitter(char *str)
+{
+	permitted_clock_jitter = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
+
+int tsc_disable __devinitdata = 0;
+
+/*
+ * Busy-wait until the low word of the TSC has advanced by at least @loops
+ * counts, pausing with rep_nop() before each sample.
+ */
+static void delay_tsc(unsigned long loops)
+{
+	unsigned long start, cur;
+
+	rdtscl(start);
+	for (;;) {
+		rep_nop();
+		rdtscl(cur);
+		if ((cur - start) >= loops)
+			break;
+	}
+}
+
+struct timer_opts timer_tsc = {
+	.name = "tsc",
+	.delay = delay_tsc,
+};
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ *
+ * @delta:    value to scale.
+ * @mul_frac: multiplier, treated as a 32.32 fixed-point fraction: the
+ *            product is shifted right by 32 bits.
+ * @shift:    pre-shift applied to @delta; a negative value shifts right,
+ *            otherwise left.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	/*
+	 * Two 32x32->64 multiplies: low(delta)*mul_frac, whose upper half is
+	 * kept as a carry-in, then high(delta)*mul_frac, with the carry-in
+	 * added. The result in edx:eax is (delta * mul_frac) >> 32.
+	 */
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#else
+	/* 64x64->128 multiply, then take bits 32..95 of the product. */
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#endif
+
+	return product;
+}
+
+#if defined (__i386__)
+/* Store the low word of the TSC in *timer_val. Always returns 0. */
+int read_current_timer(unsigned long *timer_val)
+{
+	rdtscl(*timer_val);
+	return 0;
+}
+#endif
+
+/*
+ * Derive cpu_khz from the TSC scaling parameters that Xen publishes for
+ * VCPU 0: divide (10^6 << 32) by tsc_to_system_mul, then undo tsc_shift.
+ */
+void init_cpu_khz(void)
+{
+	u64 __cpu_khz = 1000000ULL << 32;
+	struct vcpu_time_info *info;
+	info = &HYPERVISOR_shared_info->vcpu_info[0].time;
+	/* do_div() leaves the quotient in __cpu_khz. */
+	do_div(__cpu_khz, info->tsc_to_system_mul);
+	/* Apply the inverse of the shift that scale_delta() applies. */
+	if (info->tsc_shift < 0)
+		cpu_khz = __cpu_khz << -info->tsc_shift;
+	else
+		cpu_khz = __cpu_khz >> info->tsc_shift;
+}
+
+/*
+ * Scale the TSC ticks elapsed since @shadow's tsc_timestamp with the
+ * shadow's tsc_to_nsec_mul and tsc_shift.
+ */
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+	u64 tsc;
+
+	rdtscll(tsc);
+	return scale_delta(tsc - shadow->tsc_timestamp,
+			   shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Scale the TSC ticks elapsed since @shadow's tsc_timestamp with the
+ * shadow's tsc_to_usec_mul and tsc_shift.
+ */
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+	u64 tsc;
+
+	rdtscll(tsc);
+	return scale_delta(tsc - shadow->tsc_timestamp,
+			   shadow->tsc_to_usec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Set xtime from the wall-clock base (@sec, @nsec) plus the processed system
+ * time, and adjust wall_to_monotonic by the amount xtime moved.
+ */
+static void __update_wallclock(time_t sec, long nsec)
+{
+	long wtm_nsec, xtime_nsec;
+	time_t wtm_sec, xtime_sec;
+	u64 tmp, wc_nsec;
+
+	/* Adjust wall-clock time base based on wall_jiffies ticks. */
+	wc_nsec = processed_system_time;
+	wc_nsec += sec * (u64)NSEC_PER_SEC;
+	wc_nsec += nsec;
+	wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
+
+	/* Split wallclock base into seconds and nanoseconds. */
+	tmp = wc_nsec;
+	/* do_div() returns the remainder and leaves the quotient in tmp. */
+	xtime_nsec = do_div(tmp, 1000000000);
+	xtime_sec  = (time_t)tmp;
+
+	/* Shift wall_to_monotonic by (old xtime - new xtime). */
+	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
+	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
+
+	set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
+	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+	ntp_clear();
+}
+
+/*
+ * Copy the wallclock base (wc_sec/wc_nsec) and its version from the shared
+ * info page into shadow_tv / shadow_tv_version, then apply it to xtime
+ * unless independent_wallclock is set.
+ */
+static void update_wallclock(void)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+
+	/*
+	 * Retry while the version is odd or differs from the value read
+	 * before the copy (presumably odd means an update is in progress).
+	 */
+	do {
+		shadow_tv_version = s->wc_version;
+		rmb();
+		shadow_tv.tv_sec  = s->wc_sec;
+		shadow_tv.tv_nsec = s->wc_nsec;
+		rmb();
+	} while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
+
+	if (!independent_wallclock)
+		__update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area.
+ *
+ * Source is the current CPU's vcpu_info[].time entry in the shared info
+ * page; destination is that CPU's per-cpu shadow_time.
+ */
+static void get_time_values_from_xen(void)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct vcpu_time_info *src;
+	struct shadow_time_info *dst;
+
+	src = &s->vcpu_info[smp_processor_id()].time;
+	dst = &per_cpu(shadow_time, smp_processor_id());
+
+	/*
+	 * Retry while the source version is odd or changed during the copy
+	 * (presumably odd means Xen is mid-update).
+	 */
+	do {
+		dst->version = src->version;
+		rmb();
+		dst->tsc_timestamp     = src->tsc_timestamp;
+		dst->system_timestamp  = src->system_time;
+		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+		dst->tsc_shift         = src->tsc_shift;
+		rmb();
+	} while ((src->version & 1) | (dst->version ^ src->version));
+
+	/* Derived locally: microsecond multiplier for get_usec_offset(). */
+	dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+}
+
+/*
+ * Returns non-zero if the given CPU's shadow_time still carries the same
+ * version as Xen's vcpu_info[cpu].time, i.e. the shadow copy is current.
+ */
+static inline int time_values_up_to_date(int cpu)
+{
+	struct vcpu_time_info *src;
+	struct shadow_time_info *dst;
+
+	src = &HYPERVISOR_shared_info->vcpu_info[cpu].time;
+	dst = &per_cpu(shadow_time, cpu);
+
+	rmb();
+	return (dst->version == src->version);
+}
+
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with.  It is required for NMI access to the
+ * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+
+/* Routines for accessing the CMOS RAM/RTC. */
+
+/*
+ * Read one CMOS/RTC register: write the index to RTC_PORT(0), then read
+ * the value from RTC_PORT(1), bracketed by lock_cmos_prefix()/suffix().
+ */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+	unsigned char val;
+	lock_cmos_prefix(addr);
+	outb_p(addr, RTC_PORT(0));
+	val = inb_p(RTC_PORT(1));
+	lock_cmos_suffix(addr);
+	return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+/*
+ * Write one CMOS/RTC register: write the index to RTC_PORT(0), then the
+ * value to RTC_PORT(1), bracketed by lock_cmos_prefix()/suffix().
+ * Note the argument order: value first, register index second.
+ */
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+	lock_cmos_prefix(addr);
+	outb_p(addr, RTC_PORT(0));
+	outb_p(val, RTC_PORT(1));
+	lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+/*
+ * This version of gettimeofday has microsecond resolution
+ * and better than microsecond precision on fast x86 machines with TSC.
+ *
+ * The result is xtime plus three corrections: the TSC-derived offset since
+ * this CPU's shadow snapshot, ticks not yet folded into xtime
+ * (jiffies - wall_jiffies), and the gap between the shadow snapshot and
+ * processed_system_time.  The computation is retried if xtime_lock was
+ * written or the shadow snapshot changed while it ran.
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	unsigned long seq;
+	unsigned long usec, sec;
+	unsigned long max_ntp_tick;
+	s64 nsec;
+	unsigned int cpu;
+	struct shadow_time_info *shadow;
+	u32 local_time_version;
+
+	cpu = get_cpu();
+	shadow = &per_cpu(shadow_time, cpu);
+
+	do {
+		unsigned long lost;
+
+		/* Remember which snapshot this pass is based on. */
+		local_time_version = shadow->version;
+		seq = read_seqbegin(&xtime_lock);
+
+		usec = get_usec_offset(shadow);
+		lost = jiffies - wall_jiffies;
+
+		/*
+		 * If time_adjust is negative then NTP is slowing the clock
+		 * so make sure not to go into next possible interval.
+		 * Better to lose some accuracy than have time go backwards..
+		 */
+		if (unlikely(time_adjust < 0)) {
+			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
+			usec = min(usec, max_ntp_tick);
+
+			if (lost)
+				usec += lost * max_ntp_tick;
+		}
+		else if (unlikely(lost))
+			usec += lost * (USEC_PER_SEC / HZ);
+
+		sec = xtime.tv_sec;
+		usec += (xtime.tv_nsec / NSEC_PER_USEC);
+
+		/* System time covered by the snapshot but not yet by xtime. */
+		nsec = shadow->system_timestamp - processed_system_time;
+		__normalize_time(&sec, &nsec);
+		usec += (long)nsec / NSEC_PER_USEC;
+
+		if (unlikely(!time_values_up_to_date(cpu))) {
+			/*
+			 * We may have blocked for a long time,
+			 * rendering our calculations invalid
+			 * (e.g. the time delta may have
+			 * overflowed). Detect that and recalculate
+			 * with fresh values.
+			 */
+			get_time_values_from_xen();
+			/* 'continue' jumps to the loop condition below. */
+			continue;
+		}
+	} while (read_seqretry(&xtime_lock, seq) ||
+		 (local_time_version != shadow->version));
+
+	put_cpu();
+
+	/* Carry whole seconds out of the accumulated microseconds. */
+	while (usec >= USEC_PER_SEC) {
+		usec -= USEC_PER_SEC;
+		sec++;
+	}
+
+	tv->tv_sec = sec;
+	tv->tv_usec = usec;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+/*
+ * Set the wall clock to *tv.  Returns -EINVAL if tv_nsec is outside
+ * [0, NSEC_PER_SEC), otherwise 0.
+ *
+ * NOTE(review): the clock is only modified when independent_wallclock is
+ * set; otherwise this returns 0 and calls clock_was_set() without having
+ * changed xtime -- confirm that is intended for this patch set.
+ */
+int do_settimeofday(struct timespec *tv)
+{
+	time_t sec;
+	s64 nsec;
+	unsigned int cpu;
+	struct shadow_time_info *shadow;
+
+	/* The unsigned cast makes negative tv_nsec fail this check too. */
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	cpu = get_cpu();
+	shadow = &per_cpu(shadow_time, cpu);
+
+	write_seqlock_irq(&xtime_lock);
+
+	/*
+	 * Ensure we don't get blocked for a long time so that our time delta
+	 * overflows. If that were to happen then our shadow time values would
+	 * be stale, so we can retry with fresh ones.
+	 */
+	for (;;) {
+		nsec = tv->tv_nsec - get_nsec_offset(shadow);
+		if (time_values_up_to_date(cpu))
+			break;
+		get_time_values_from_xen();
+	}
+	sec = tv->tv_sec;
+	__normalize_time(&sec, &nsec);
+
+	if (independent_wallclock) {
+		/*
+		 * sec/nsec becomes the requested time minus current system
+		 * time; __update_wallclock() adds processed_system_time back.
+		 */
+		nsec -= shadow->system_timestamp;
+		__normalize_time(&sec, &nsec);
+		__update_wallclock(sec, nsec);
+	}
+
+	write_sequnlock_irq(&xtime_lock);
+
+	put_cpu();
+
+	clock_was_set();
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/*
+ * Write nowtime to the hardware clock through the EFI or mach_ backend,
+ * under rtc_lock.  Returns 0 without touching the hardware when
+ * independent_wallclock is set or the domain lacks SIF_INITDOMAIN;
+ * otherwise returns the backend's result.
+ */
+static int set_rtc_mmss(unsigned long nowtime)
+{
+	int retval;
+
+	WARN_ON(irqs_disabled());
+
+	if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
+		return 0;
+
+	/*
+	 * NOTE(review): the original remark below conflicts with the WARN_ON
+	 * above and with spin_lock_irq()/spin_unlock_irq(), which expect
+	 * interrupts to be enabled on entry -- confirm and reword.
+	 */
+	/* gets recalled with irq locally disabled */
+	spin_lock_irq(&rtc_lock);
+	if (efi_enabled)
+		retval = efi_set_rtc_mmss(nowtime);
+	else
+		retval = mach_set_rtc_mmss(nowtime);
+	spin_unlock_irq(&rtc_lock);
+
+	return retval;
+}
+
+/* monotonic_clock(): returns # of nanoseconds passed since time_init()
+ *		Note: This function is required to return accurate
+ *		time even in the absence of multiple timer ticks.
+ *
+ *		Computed as this CPU's shadow system_timestamp plus the
+ *		TSC-derived offset since that snapshot; recomputed if the
+ *		snapshot is refreshed while the sum is being formed.
+ */
+unsigned long long monotonic_clock(void)
+{
+	int cpu = get_cpu();
+	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+	u64 time;
+	u32 local_time_version;
+
+	do {
+		local_time_version = shadow->version;
+		barrier();
+		time = shadow->system_timestamp + get_nsec_offset(shadow);
+		/* Refreshing changes shadow->version, forcing another pass. */
+		if (!time_values_up_to_date(cpu))
+			get_time_values_from_xen();
+		barrier();
+	} while (local_time_version != shadow->version);
+
+	put_cpu();
+
+	return time;
+}
+EXPORT_SYMBOL(monotonic_clock);
+
+/* Scheduler clock: simply forwards to monotonic_clock(). */
+unsigned long long sched_clock(void)
+{
+	return monotonic_clock();
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
+/*
+ * Return the program counter to charge a profiling sample to.  Normally
+ * this is the interrupted PC; if that PC lies inside a lock function, an
+ * address taken from the stack is returned instead (presumably the lock
+ * function's caller).
+ */
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+#ifdef __x86_64__
+	/* Assume the lock function has either no stack frame or only a single word.
+	   This checks if the address on the stack looks like a kernel text address.
+	   There is a small window for false hits, but in that case the tick
+	   is just accounted to the spinlock function.
+	   Better would be to write these functions in assembler again
+	   and check exactly. */
+	if (in_lock_functions(pc)) {
+		char *v = *(char **)regs->rsp;
+		if ((v >= _stext && v <= _etext) ||
+			(v >= _sinittext && v <= _einittext) ||
+			(v >= (char *)MODULES_VADDR  && v <= (char *)MODULES_END))
+			return (unsigned long)v;
+		/* Top word was not a text address: use the next stack word. */
+		return ((unsigned long *)regs->rsp)[1];
+	}
+#else
+	/* i386: use the word stored 4 bytes above the frame pointer. */
+	if (in_lock_functions(pc))
+		return *(unsigned long *)(regs->ebp + 4);
+#endif
+
+	return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+/*
+ * Timer interrupt handler (bound to VIRQ_TIMER elsewhere in this file).
+ *
+ * Refreshes this CPU's shadow time, calls do_timer() once for every whole
+ * NS_PER_TICK of system time not yet processed globally, picks up a changed
+ * Xen wallclock, then splits this CPU's own unprocessed time into stolen,
+ * blocked and user/system ticks before running the per-CPU tick work.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	s64 delta, delta_cpu, stolen, blocked;
+	u64 sched_time;
+	int i, cpu = smp_processor_id();
+	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+	write_seqlock(&xtime_lock);
+
+	do {
+		get_time_values_from_xen();
+
+		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
+		delta = delta_cpu =
+			shadow->system_timestamp + get_nsec_offset(shadow);
+		delta     -= processed_system_time;
+		delta_cpu -= per_cpu(processed_system_time, cpu);
+
+		/*
+		 * Obtain a consistent snapshot of stolen/blocked cycles. We
+		 * can use state_entry_time to detect if we get preempted here.
+		 */
+		do {
+			sched_time = runstate->state_entry_time;
+			barrier();
+			stolen = runstate->time[RUNSTATE_runnable] +
+				runstate->time[RUNSTATE_offline] -
+				per_cpu(processed_stolen_time, cpu);
+			blocked = runstate->time[RUNSTATE_blocked] -
+				per_cpu(processed_blocked_time, cpu);
+			barrier();
+		} while (sched_time != runstate->state_entry_time);
+	} while (!time_values_up_to_date(cpu));
+
+	if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
+	     unlikely(delta_cpu < -(s64)permitted_clock_jitter))
+	    && printk_ratelimit()) {
+		printk("Timer ISR/%d: Time went backwards: "
+		       "delta=%lld delta_cpu=%lld shadow=%lld "
+		       "off=%lld processed=%lld cpu_processed=%lld\n",
+		       cpu, delta, delta_cpu, shadow->system_timestamp,
+		       (s64)get_nsec_offset(shadow),
+		       processed_system_time,
+		       per_cpu(processed_system_time, cpu));
+		/*
+		 * NOTE(review): this walks ids 0..num_online_cpus()-1, which
+		 * equals the online set only if online CPU ids are contiguous
+		 * from 0 -- confirm, or iterate over the online mask instead.
+		 */
+		for (i = 0; i < num_online_cpus(); i++)
+			printk(" %d: %lld\n", i,
+			       per_cpu(processed_system_time, i));
+	}
+
+	/* System-wide jiffy work. */
+	while (delta >= NS_PER_TICK) {
+		delta -= NS_PER_TICK;
+		processed_system_time += NS_PER_TICK;
+		do_timer(regs);
+	}
+
+	/* Xen published a new wallclock base since we last copied it. */
+	if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+		update_wallclock();
+		clock_was_set();
+	}
+
+	write_sequnlock(&xtime_lock);
+
+	/*
+	 * Account stolen ticks.
+	 * HACK: Passing NULL to account_steal_time()
+	 * ensures that the ticks are accounted as stolen.
+	 */
+	if ((stolen > 0) && (delta_cpu > 0)) {
+		delta_cpu -= stolen;
+		if (unlikely(delta_cpu < 0))
+			stolen += delta_cpu; /* clamp local-time progress */
+		/*
+		 * Converts stolen from ns to whole ticks in place.
+		 * NOTE(review): stolen is s64; confirm do_div() accepts a
+		 * signed dividend on every supported architecture.
+		 */
+		do_div(stolen, NS_PER_TICK);
+		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+		account_steal_time(NULL, (cputime_t)stolen);
+	}
+
+	/*
+	 * Account blocked ticks.
+	 * HACK: Passing idle_task to account_steal_time()
+	 * ensures that the ticks are accounted as idle/wait.
+	 */
+	if ((blocked > 0) && (delta_cpu > 0)) {
+		delta_cpu -= blocked;
+		if (unlikely(delta_cpu < 0))
+			blocked += delta_cpu; /* clamp local-time progress */
+		do_div(blocked, NS_PER_TICK);
+		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
+		account_steal_time(idle_task(cpu), (cputime_t)blocked);
+	}
+
+	/* Account user/system ticks. */
+	if (delta_cpu > 0) {
+		do_div(delta_cpu, NS_PER_TICK);
+		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+		if (user_mode(regs))
+			account_user_time(current, (cputime_t)delta_cpu);
+		else
+			account_system_time(current, HARDIRQ_OFFSET,
+					    (cputime_t)delta_cpu);
+	}
+
+	/* Local timer processing (see update_process_times()). */
+	run_local_timers();
+	if (rcu_pending(cpu))
+		rcu_check_callbacks(cpu, user_mode(regs));
+	scheduler_tick();
+	run_posix_cpu_timers(current);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Zero the given CPU's runstate area, register it with Xen through
+ * VCPUOP_register_runstate_memory_area, and seed the processed
+ * blocked/stolen counters from the values then found in the area, so
+ * timer_interrupt() only accounts time accrued after this point.
+ */
+static void init_missing_ticks_accounting(int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+	memset(runstate, 0, sizeof(*runstate));
+
+	area.addr.v = runstate;
+	/* NOTE(review): the hypercall's return value is not checked. */
+	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+	per_cpu(processed_blocked_time, cpu) =
+		runstate->time[RUNSTATE_blocked];
+	per_cpu(processed_stolen_time, cpu) =
+		runstate->time[RUNSTATE_runnable] +
+		runstate->time[RUNSTATE_offline];
+}
+
+/*
+ * Read the hardware clock through the EFI or mach_ backend while holding
+ * rtc_lock, and return the backend's value.
+ * not static: needed by APM
+ */
+unsigned long get_cmos_time(void)
+{
+	unsigned long retval;
+
+	spin_lock(&rtc_lock);
+
+	if (efi_enabled)
+		retval = efi_get_time();
+	else
+		retval = mach_get_cmos_time();
+
+	spin_unlock(&rtc_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(get_cmos_time);
+
+static void sync_cmos_clock(unsigned long dummy);
+
+static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+
+/*
+ * Timer callback for sync_cmos_timer: when NTP reports the clock as
+ * synchronised, try to write the current time to the hardware clock and
+ * re-arm the timer.  The 'dummy' argument is unused.
+ */
+static void sync_cmos_clock(unsigned long dummy)
+{
+	struct timeval now, next;
+	int fail = 1;	/* stays 1 unless set_rtc_mmss() runs and returns 0 */
+
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+	 * called as close as possible to 500 ms before the new second starts.
+	 * This code is run on a timer.  If the clock is set, that timer
+	 * may not expire at the correct time.  Thus, we adjust...
+	 */
+	if (!ntp_synced())
+		/*
+		 * Not synced, exit, do not restart a timer (if one is
+		 * running, let it run out).
+		 */
+		return;
+
+	do_gettimeofday(&now);
+	/* Only write when within half a tick of the target window. */
+	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
+	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
+		fail = set_rtc_mmss(now.tv_sec);
+
+	/* Time until the next USEC_AFTER point within a second. */
+	next.tv_usec = USEC_AFTER - now.tv_usec;
+	if (next.tv_usec <= 0)
+		next.tv_usec += USEC_PER_SEC;
+
+	/* After a successful write wait 659 more seconds; else retry soon. */
+	if (!fail)
+		next.tv_sec = 659;
+	else
+		next.tv_sec = 0;
+
+	if (next.tv_usec >= USEC_PER_SEC) {
+		next.tv_sec++;
+		next.tv_usec -= USEC_PER_SEC;
+	}
+	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
+}
+
+/* Arm sync_cmos_timer to fire one jiffy from now. */
+void notify_arch_cmos_timer(void)
+{
+	mod_timer(&sync_cmos_timer, jiffies + 1);
+}
+
+static long clock_cmos_diff, sleep_start;
+
+static struct timer_opts *last_timer;
+/*
+ * sysdev suspend hook: record the offset between the hardware clock and
+ * system time, note when sleep began, and swap cur_timer for timer_none
+ * after giving the previous timer a chance to suspend.  Always returns 0.
+ */
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
+{
+	/*
+	 * Estimate time zone so that set_time can update the clock
+	 */
+	clock_cmos_diff = -get_cmos_time();
+	clock_cmos_diff += get_seconds();
+	sleep_start = get_cmos_time();
+	last_timer = cur_timer;
+	cur_timer = &timer_none;
+	if (last_timer->suspend)
+		last_timer->suspend(state);
+	return 0;
+}
+
+/*
+ * sysdev resume hook: restore xtime from the hardware clock plus the
+ * offset saved by timer_suspend(), advance jiffies and wall_jiffies by the
+ * time slept, and reinstate the timer that was active before suspend.
+ * Always returns 0.
+ */
+static int timer_resume(struct sys_device *dev)
+{
+	unsigned long flags;
+	unsigned long sec;
+	unsigned long sleep_length;
+
+#ifdef CONFIG_HPET_TIMER
+	if (is_hpet_enabled())
+		hpet_reenable();
+#endif
+	sec = get_cmos_time() + clock_cmos_diff;
+	/* Sleep duration in jiffies, at the hardware clock's resolution. */
+	sleep_length = (get_cmos_time() - sleep_start) * HZ;
+	write_seqlock_irqsave(&xtime_lock, flags);
+	xtime.tv_sec = sec;
+	xtime.tv_nsec = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	/* NOTE(review): jiffies/wall_jiffies are updated outside xtime_lock. */
+	jiffies += sleep_length;
+	wall_jiffies += sleep_length;
+	if (last_timer->resume)
+		last_timer->resume();
+	cur_timer = last_timer;
+	last_timer = NULL;
+	touch_softlockup_watchdog();
+	return 0;
+}
+
+static struct sysdev_class timer_sysclass = {
+	.resume = timer_resume,
+	.suspend = timer_suspend,
+	set_kset_name("timer"),
+};
+
+
+/* XXX this driverfs stuff should probably go elsewhere later -john */
+static struct sys_device device_timer = {
+	.id	= 0,
+	.cls	= &timer_sysclass,
+};
+
+/*
+ * Register timer_sysclass and, if that succeeds, device_timer.
+ * Returns the first error encountered, or 0.
+ */
+static int time_init_device(void)
+{
+	int error = sysdev_class_register(&timer_sysclass);
+	if (!error)
+		error = sysdev_register(&device_timer);
+	return error;
+}
+
+device_initcall(time_init_device);
+
+#ifdef CONFIG_HPET_TIMER
+extern void (*late_time_init)(void);
+/*
+ * Duplicate of time_init() below, with hpet_enable part added
+ *
+ * NOTE(review): unlike the time_init() in this file, this path performs
+ * none of the Xen setup (get_time_values_from_xen(), update_wallclock(),
+ * init_cpu_khz(), timer VIRQ binding); it seeds xtime from the hardware
+ * clock and selects cur_timer instead -- confirm this is intended when
+ * CONFIG_HPET_TIMER is enabled.
+ */
+static void __init hpet_time_init(void)
+{
+	xtime.tv_sec = get_cmos_time();
+	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+	set_normalized_timespec(&wall_to_monotonic,
+		-xtime.tv_sec, -xtime.tv_nsec);
+
+	if ((hpet_enable() >= 0) && hpet_use_timer) {
+		printk("Using HPET for base-timer\n");
+	}
+
+	cur_timer = select_timer();
+	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+
+	time_init_hook();
+}
+#endif
+
+/* Dynamically-mapped IRQ. */
+DEFINE_PER_CPU(int, timer_irq);
+
+extern void (*late_time_init)(void);
+/*
+ * Bind VIRQ_TIMER on CPU 0 to timer_interrupt() under the name "timer0"
+ * and record the resulting IRQ in per_cpu(timer_irq, 0).  A negative
+ * result from the bind is treated as fatal (BUG_ON).
+ */
+static void setup_cpu0_timer_irq(void)
+{
+	per_cpu(timer_irq, 0) =
+		bind_virq_to_irqhandler(
+			VIRQ_TIMER,
+			0,
+			timer_interrupt,
+			SA_INTERRUPT,
+			"timer0",
+			NULL);
+	BUG_ON(per_cpu(timer_irq, 0) < 0);
+}
+
+/*
+ * Boot-time clock setup: take the first shadow snapshot from Xen, seed
+ * the global and CPU 0 processed_system_time from it, initialise
+ * stolen/blocked accounting and the wallclock, compute cpu_khz, and defer
+ * binding of the timer VIRQ to late_time_init.
+ *
+ * With CONFIG_HPET_TIMER and an HPET-capable system this returns early
+ * and leaves everything to hpet_time_init() instead.
+ */
+void __init time_init(void)
+{
+#ifdef CONFIG_HPET_TIMER
+	if (is_hpet_capable()) {
+		/*
+		 * HPET initialization needs to do memory-mapped io. So, let
+		 * us do a late initialization after mem_init().
+		 */
+		late_time_init = hpet_time_init;
+		return;
+	}
+#endif
+	get_time_values_from_xen();
+
+	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+	per_cpu(processed_system_time, 0) = processed_system_time;
+	init_missing_ticks_accounting(0);
+
+	update_wallclock();
+
+	init_cpu_khz();
+	printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
+	       cpu_khz / 1000, cpu_khz % 1000);
+
+#if defined(__x86_64__)
+	vxtime.mode = VXTIME_TSC;
+	vxtime.quot = (1000000L << 32) / vxtime_hz;
+	vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+	sync_core();
+	rdtscll(vxtime.last_tsc);
+#endif
+
+	/* Cannot request_irq() until kmem is initialised. */
+	late_time_init = setup_cpu0_timer_irq;
+}
+
+/*
+ * Convert jiffies to system time.
+ *
+ * Returns processed_system_time plus (j - jiffies) ticks expressed in
+ * nanoseconds, read consistently under xtime_lock.  A target that is not
+ * at least one tick in the future is treated as one tick ahead.
+ */
+u64 jiffies_to_st(unsigned long j)
+{
+	unsigned long seq;
+	long delta;
+	u64 st;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		delta = j - jiffies;
+		/* NB. The next check can trigger in some wrap-around cases,
+		 * but that's ok: we'll just end up with a shorter timeout. */
+		if (delta < 1)
+			delta = 1;
+		st = processed_system_time + (delta * (u64)NS_PER_TICK);
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return st;
+}
+EXPORT_SYMBOL(jiffies_to_st);
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ *
+ * stop_hz_timer() marks this CPU in nohz_cpu_mask and asks Xen for a
+ * single timer event at the system time of the next pending kernel timer,
+ * or one jiffy ahead if RCU or softirq work is pending.
+ */
+void stop_hz_timer(void)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned long j;
+
+	/* We must do this /before/ checking rcu_pending(). */
+	cpu_set(cpu, nohz_cpu_mask);
+	smp_mb();
+
+	/* Leave ourselves in 'tick mode' if rcu or softirq pending. */
+	if (rcu_pending(cpu) || local_softirq_pending()) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		j = jiffies + 1;
+	} else {
+		j = next_timer_interrupt();
+	}
+
+	/* A failing set_timer_op hypercall is treated as fatal. */
+	BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
+}
+
+/* Leave tickless mode: clear this CPU's bit in nohz_cpu_mask. */
+void start_hz_timer(void)
+{
+	cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
+/*
+ * Re-initialise time state after resume: recompute cpu_khz, take a fresh
+ * shadow snapshot, reseed the global and CPU 0 processed_system_time,
+ * restart stolen/blocked accounting and reload the wallclock.
+ *
+ * No locking required. We are only CPU running, and interrupts are off.
+ */
+void time_resume(void)
+{
+	init_cpu_khz();
+
+	get_time_values_from_xen();
+
+	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+	per_cpu(processed_system_time, 0) = processed_system_time;
+	init_missing_ticks_accounting(0);
+
+	update_wallclock();
+}
+
+#ifdef CONFIG_SMP
+static char timer_name[NR_CPUS][15];
+
+/*
+ * Set up timekeeping for a secondary CPU: seed its processed_system_time
+ * from CPU 0's shadow timestamp, initialise its stolen/blocked accounting,
+ * and bind VIRQ_TIMER on that CPU to timer_interrupt() as "timer<cpu>".
+ * Must not be called for CPU 0 (BUG_ON).
+ */
+void local_setup_timer(unsigned int cpu)
+{
+	int seq;
+
+	BUG_ON(cpu == 0);
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
+		per_cpu(processed_system_time, cpu) =
+			per_cpu(shadow_time, 0).system_timestamp;
+		init_missing_ticks_accounting(cpu);
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* timer_name[cpu] is 15 bytes: "timer" + decimal cpu id + NUL. */
+	sprintf(timer_name[cpu], "timer%d", cpu);
+	per_cpu(timer_irq, cpu) =
+		bind_virq_to_irqhandler(
+			VIRQ_TIMER,
+			cpu,
+			timer_interrupt,
+			SA_INTERRUPT,
+			timer_name[cpu],
+			NULL);
+	BUG_ON(per_cpu(timer_irq, cpu) < 0);
+}
+
+/*
+ * Undo local_setup_timer(): unbind the given secondary CPU's timer IRQ.
+ * Must not be called for CPU 0 (BUG_ON).
+ */
+void local_teardown_timer(unsigned int cpu)
+{
+	BUG_ON(cpu == 0);
+	unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL);
+}
+#endif
+
+/*
+ * /proc/sys/xen: This really belongs in another file. It can stay here for
+ * now however.
+ */
+static ctl_table xen_subtable[] = {
+	{
+		.ctl_name	= 1,
+		.procname	= "independent_wallclock",
+		.data		= &independent_wallclock,
+		.maxlen		= sizeof(independent_wallclock),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.ctl_name	= 2,
+		.procname	= "permitted_clock_jitter",
+		.data		= &permitted_clock_jitter,
+		.maxlen		= sizeof(permitted_clock_jitter),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax
+	},
+	{ 0 }
+};
+static ctl_table xen_table[] = {
+	{
+		.ctl_name	= 123,
+		.procname	= "xen",
+		.mode		= 0555,
+		.child		= xen_subtable},
+	{ 0 }
+};
+/*
+ * Register xen_table (the "xen" sysctl directory and its entries) at
+ * initcall time.  The registration result is deliberately discarded and 0
+ * is always returned.
+ */
+static int __init xen_sysctl_init(void)
+{
+	(void)register_sysctl_table(xen_table, 0);
+	return 0;
+}
+__initcall(xen_sysctl_init);

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 26/35] Add Xen subarch reboot support
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (24 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 17:02   ` Andi Kleen
  2006-05-09  7:00 ` [RFC PATCH 27/35] Add nosegneg capability to the vsyscall page notes Chris Wright
                   ` (9 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: reboot --]
[-- Type: text/plain, Size: 6861 bytes --]

Add remote reboot capability, so that a virtual machine can be
rebooted, halted or 'powered off' by external management tools.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
TODO:
 - move poweroff and halt to generic similar to c_a_d

 arch/i386/kernel/Makefile |    1
 drivers/xen/core/reboot.c |  232 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+)

 arch/i386/kernel/Makefile |    1 
 drivers/xen/core/reboot.c |  232 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 233 insertions(+)

--- linus-2.6.orig/arch/i386/kernel/Makefile
+++ linus-2.6/arch/i386/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_SCx200)		+= scx200.o
 hw_irq-y			:= i8259.o
 
 hw_irq-$(CONFIG_XEN)		:= ../../../drivers/xen/core/evtchn.o
+reboot-$(CONFIG_XEN)		:= ../../../drivers/xen/core/reboot.o
 time-$(CONFIG_XEN)		:= ../../../drivers/xen/core/time.o
 
 # vsyscall.o contains the vsyscall DSO images as __initdata.
--- /dev/null
+++ linus-2.6/drivers/xen/core/reboot.c
@@ -0,0 +1,232 @@
+#define __KERNEL_SYSCALLS__
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stringify.h>
+#include <linux/syscalls.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/xencons.h>
+
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+#endif
+
+extern void ctrl_alt_del(void);
+
+#define SHUTDOWN_INVALID  -1
+#define SHUTDOWN_POWEROFF  0
+#define SHUTDOWN_SUSPEND   2
+/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
+ * report a crash, not be instructed to crash!
+ * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
+ * the distinction when we return the reason code to them.
+ */
+#define SHUTDOWN_HALT      4
+
+/*
+ * Flush the Xen console, then ask the hypervisor to shut this domain down
+ * with reason SHUTDOWN_reboot.
+ */
+void machine_emergency_restart(void)
+{
+	/* We really want to get pending console data out before we die. */
+	xencons_force_flush();
+	HYPERVISOR_sched_op(SCHEDOP_shutdown, SHUTDOWN_reboot);
+}
+
+/* Restart: the command string is ignored; same as an emergency restart. */
+void machine_restart(char * __unused)
+{
+	machine_emergency_restart();
+}
+
+/* Halt is implemented as a power-off. */
+void machine_halt(void)
+{
+	machine_power_off();
+}
+
+/*
+ * Flush the Xen console, then ask the hypervisor to shut this domain down
+ * with reason SHUTDOWN_poweroff.
+ */
+void machine_power_off(void)
+{
+	/* We really want to get pending console data out before we die. */
+	xencons_force_flush();
+	HYPERVISOR_sched_op(SCHEDOP_shutdown, SHUTDOWN_poweroff);
+}
+
+int reboot_thru_bios = 0;	/* for dmi_scan.c */
+EXPORT_SYMBOL(machine_restart);
+EXPORT_SYMBOL(machine_halt);
+EXPORT_SYMBOL(machine_power_off);
+
+
+/******************************************************************************
+ * Stop/pickle callback handling.
+ */
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+static void __shutdown_handler(void *unused);
+static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
+
+/*
+ * Body of the kernel thread started for a shutdown request.  For poweroff
+ * and halt it tries to exec /sbin/poweroff and, if the exec fails, powers
+ * off through sys_reboot().  If control returns, further requests are
+ * accepted again.  Always returns 0.
+ */
+static int shutdown_process(void *__unused)
+{
+	static char *envp[] = { "HOME=/", "TERM=linux",
+				"PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+	static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
+	int want_poweroff;
+
+	want_poweroff = (shutting_down == SHUTDOWN_POWEROFF) ||
+			(shutting_down == SHUTDOWN_HALT);
+
+	/* Prefer an orderly userspace poweroff; fall back to the syscall. */
+	if (want_poweroff &&
+	    execve(poweroff_argv[0], poweroff_argv, envp) < 0)
+		sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
+			   LINUX_REBOOT_CMD_POWER_OFF, NULL);
+
+	/* could try again */
+	shutting_down = SHUTDOWN_INVALID;
+
+	return 0;
+}
+
+/*
+ * Deferred half of a shutdown request; runs from the shutdown_work item.
+ * Starts the kernel thread that carries out a poweroff/halt and re-queues
+ * itself after HZ/2 if the thread could not be created.
+ *
+ * Suspend is not implemented in this file.  A "suspend" request used to
+ * leave shutting_down stuck at SHUTDOWN_SUSPEND, so shutdown_handler()
+ * ignored every later request (including poweroff).  Reject it explicitly
+ * and accept new requests again.
+ */
+static void __shutdown_handler(void *unused)
+{
+	int err;
+
+	if (shutting_down == SHUTDOWN_SUSPEND) {
+		printk(KERN_WARNING "Ignoring suspend request: "
+		       "suspend is not supported\n");
+		shutting_down = SHUTDOWN_INVALID; /* accept later requests */
+		return;
+	}
+
+	/* kernel_thread() yields the new pid, or a negative errno. */
+	err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES);
+	if (err < 0) {
+		printk(KERN_WARNING "Error creating shutdown process (%d): "
+		       "retrying...\n", -err);
+		schedule_delayed_work(&shutdown_work, HZ/2);
+	}
+}
+
+/*
+ * xenbus watch callback for "control/shutdown".  Reads the requested
+ * action, clears the node to acknowledge it (both inside one transaction,
+ * retried on -EAGAIN), and then either records the request in
+ * shutting_down and schedules shutdown_work, or calls ctrl_alt_del() for
+ * "reboot".  Requests arriving while one is already pending are ignored.
+ */
+static void shutdown_handler(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	char *str;
+	xenbus_transaction_t xbt;
+	int err;
+
+	if (shutting_down != SHUTDOWN_INVALID)
+		return;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+	/* Ignore read errors and empty reads. */
+	if (XENBUS_IS_ERR_READ(str)) {
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	/* Acknowledge the request by emptying the node. */
+	xenbus_write(xbt, "control", "shutdown", "");
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN) {
+		kfree(str);
+		goto again;
+	}
+
+	if (strcmp(str, "poweroff") == 0)
+		shutting_down = SHUTDOWN_POWEROFF;
+	else if (strcmp(str, "reboot") == 0)
+		ctrl_alt_del();
+	else if (strcmp(str, "suspend") == 0)
+		shutting_down = SHUTDOWN_SUSPEND;
+	else if (strcmp(str, "halt") == 0)
+		shutting_down = SHUTDOWN_HALT;
+	else {
+		printk("Ignoring shutdown request: %s\n", str);
+		shutting_down = SHUTDOWN_INVALID;
+	}
+
+	/* "reboot" leaves shutting_down unchanged, so no work is queued. */
+	if (shutting_down != SHUTDOWN_INVALID)
+		schedule_work(&shutdown_work);
+
+	kfree(str);
+}
+
+/*
+ * xenbus watch callback for "control/sysrq".  Reads a single key from the
+ * node, acknowledges it by writing '\0' back (both inside one transaction,
+ * retried on -EAGAIN), and hands the key to handle_sysrq() when
+ * CONFIG_MAGIC_SYSRQ is enabled.
+ *
+ * xenbus_scanf() reports failure as a negative errno, so the previous
+ * "!xenbus_scanf(...)" test could never fire and a failed read fell
+ * through to a commit.  Failures now abort the transaction.  -ENOENT
+ * (node absent) and -ERANGE (empty value, e.g. our own acknowledgement)
+ * are expected and stay silent, matching the old observable behaviour.
+ */
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+			  unsigned int len)
+{
+	char sysrq_key = '\0';
+	xenbus_transaction_t xbt;
+	int err;
+
+ again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		return;
+	err = xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key);
+	if (err <= 0) {
+		if (err != -ENOENT && err != -ERANGE)
+			printk(KERN_ERR "Unable to read sysrq code in "
+			       "control/sysrq\n");
+		xenbus_transaction_end(xbt, 1);
+		return;
+	}
+
+	/* Acknowledge a real key by resetting the node. */
+	if (sysrq_key != '\0')
+		xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+
+#ifdef CONFIG_MAGIC_SYSRQ
+	if (sysrq_key != '\0')
+		handle_sysrq(sysrq_key, NULL, NULL);
+#endif
+}
+
+static struct xenbus_watch shutdown_watch = {
+	.node = "control/shutdown",
+	.callback = shutdown_handler
+};
+
+static struct xenbus_watch sysrq_watch = {
+	.node ="control/sysrq",
+	.callback = sysrq_handler
+};
+
+/*
+ * Xenstore notifier callback: register the shutdown and sysrq watches.
+ * A failed registration is logged but does not stop the other watch from
+ * being registered.  Always returns NOTIFY_DONE.
+ */
+static int setup_shutdown_watcher(struct notifier_block *notifier,
+				  unsigned long event,
+				  void *data)
+{
+	if (register_xenbus_watch(&shutdown_watch))
+		printk(KERN_ERR "Failed to set shutdown watcher\n");
+
+	if (register_xenbus_watch(&sysrq_watch))
+		printk(KERN_ERR "Failed to set sysrq watcher\n");
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Initcall: register a xenstore notifier whose callback,
+ * setup_shutdown_watcher(), installs the control/shutdown and
+ * control/sysrq watches.  Always returns 0.
+ */
+static int __init setup_shutdown_event(void)
+{
+	/* static: the notifier block must outlive this function. */
+	static struct notifier_block xenstore_notifier = {
+		.notifier_call = setup_shutdown_watcher
+	};
+	register_xenstore_notifier(&xenstore_notifier);
+	return 0;
+}
+
+subsys_initcall(setup_shutdown_event);

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 27/35] Add nosegneg capability to the vsyscall page notes
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (25 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 26/35] Add Xen subarch reboot support Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 28/35] add support for Xen feature queries Chris Wright
                   ` (8 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: i386-vsyscall-note --]
[-- Type: text/plain, Size: 2145 bytes --]

Add the "nosegneg" fake capability to the vsyscall page notes. This is
used by the runtime linker to select a glibc version which then
disables negative-offset accesses to the thread-local segment via
%gs. These accesses require emulation in Xen (because segments are
truncated to protect the hypervisor address space) and avoiding them
provides a measurable performance boost.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/vsyscall-note.S |   29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

--- linus-2.6.orig/arch/i386/kernel/vsyscall-note.S
+++ linus-2.6/arch/i386/kernel/vsyscall-note.S
@@ -3,6 +3,7 @@
  * Here we can supply some information useful to userland.
  */
 
+#include <linux/config.h>
 #include <linux/uts.h>
 #include <linux/version.h>
 
@@ -23,3 +24,31 @@
 	ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0)
 	.long LINUX_VERSION_CODE
 	ASM_ELF_NOTE_END
+
+#ifdef CONFIG_XEN
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently.  This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg.  Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ *	hwcap 0 nosegneg
+ * to match the mapping of bit to name that we give here.
+ *
+ * NOTE(review): the entry emitted below names bit 1, while the text above
+ * says "hwcap 0" and the mask passed to NOTE_KERNELCAP_BEGIN is 1 --
+ * confirm which bit number is intended and make the three agree.
+ */
+/* Opens a "GNU" type-2 note in .note.kernelcap; payload starts with ncaps, mask. */
+#define NOTE_KERNELCAP_BEGIN(ncaps, mask) \
+	ASM_ELF_NOTE_BEGIN(".note.kernelcap", "a", "GNU", 2) \
+	.long ncaps, mask
+/* One capability entry: a bit-number byte followed by a NUL-terminated name. */
+#define NOTE_KERNELCAP(bit, name) \
+	.byte bit; .asciz name
+#define NOTE_KERNELCAP_END ASM_ELF_NOTE_END
+
+NOTE_KERNELCAP_BEGIN(1, 1)
+NOTE_KERNELCAP(1, "nosegneg")
+NOTE_KERNELCAP_END
+#endif
+

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 28/35] add support for Xen feature queries
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (26 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 27/35] Add nosegneg capability to the vsyscall page notes Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-12 21:56   ` Pavel Machek
  2006-05-09  7:00 ` [RFC PATCH 29/35] Add the Xen virtual console driver Chris Wright
                   ` (7 subsequent siblings)
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: xen-features --]
[-- Type: text/plain, Size: 3210 bytes --]

Add support for parsing and interpreting hypervisor feature
flags. These allow the kernel to determine what features are provided
by the underlying hypervisor. For example, whether page tables need to
be write protected explicitly by the kernel, and whether the kernel
(appears to) run in ring 0 rather than ring 1. This information allows
the kernel to improve performance by avoiding unnecessary actions.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/Makefile                            |    1 
 drivers/xen/Makefile                        |    3 ++
 drivers/xen/core/Makefile                   |    3 ++
 drivers/xen/core/features.c                 |   29 ++++++++++++++++++++++++++++
 include/asm-i386/mach-xen/setup_arch_post.h |    2 +
 include/xen/features.h                      |   20 +++++++++++++++++++
 6 files changed, 58 insertions(+)

--- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_post.h
+++ linus-2.6/include/asm-i386/mach-xen/setup_arch_post.h
@@ -27,6 +27,8 @@ static void __init machine_specific_arch
 {
 	struct physdev_op op;
 
+	setup_xen_features();
+
 	HYPERVISOR_shared_info =
 		(struct shared_info *)__va(xen_start_info->shared_info);
 	memset(empty_zero_page, 0, sizeof(empty_zero_page));
--- linus-2.6.orig/drivers/Makefile
+++ linus-2.6/drivers/Makefile
@@ -31,6 +31,7 @@ obj-y				+= base/ block/ misc/ mfd/ net/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_PPC_PMAC)		+= macintosh/
+obj-$(CONFIG_XEN)		+= xen/
 obj-$(CONFIG_IDE)		+= ide/
 obj-$(CONFIG_FC4)		+= fc4/
 obj-$(CONFIG_SCSI)		+= scsi/
--- /dev/null
+++ linus-2.6/include/xen/features.h
@@ -0,0 +1,20 @@
+/******************************************************************************
+ * features.h
+ *
+ * Query the features reported by Xen.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __ASM_XEN_FEATURES_H__
+#define __ASM_XEN_FEATURES_H__
+
+#include <xen/interface/version.h>
+
+extern void setup_xen_features(void);
+
+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
+
+#define xen_feature(flag)	(xen_features[flag])
+
+#endif /* __ASM_XEN_FEATURES_H__ */
--- /dev/null
+++ linus-2.6/drivers/xen/Makefile
@@ -0,0 +1,3 @@
+
+obj-y	+= core/
+
--- /dev/null
+++ linus-2.6/drivers/xen/core/Makefile
@@ -0,0 +1,3 @@
+
+obj-y	:= features.o
+
--- /dev/null
+++ linus-2.6/drivers/xen/core/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+EXPORT_SYMBOL_GPL(xen_features);
+
+void setup_xen_features(void)
+{
+	struct xen_feature_info fi;
+	int i, j;
+
+	for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+		fi.submap_idx = i;	/* select which 32-bit submap to query */
+		if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+			break;	/* remaining flags keep their zero defaults */
+		for (j = 0; j < 32; j++)	/* 1U: 1 << 31 overflows int */
+			xen_features[i*32+j] = !!(fi.submap & (1U << j));
+	}
+}

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (27 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 28/35] add support for Xen feature queries Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 13:26   ` Andi Kleen
  2006-05-13 12:27   ` Andrew Morton
  2006-05-09  7:00 ` [RFC PATCH 30/35] Add apply_to_page_range() function Chris Wright
                   ` (6 subsequent siblings)
  35 siblings, 2 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: xen-console --]
[-- Type: text/plain, Size: 22374 bytes --]

This provides a bootstrap and ongoing emergency console which is
intended to be available from very early during boot and at all times
thereafter, in contrast with alternatives such as UDP-based syslogd,
or logging in via ssh. The protocol is based on a simple shared-memory
ring buffer.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/char/tty_io.c              |    7 
 drivers/xen/Makefile               |    1 
 drivers/xen/console/Makefile       |    2 
 drivers/xen/console/console.c      |  637 +++++++++++++++++++++++++++++++++++++
 drivers/xen/console/xencons_ring.c |  141 ++++++++
 include/xen/xencons.h              |   14 
 6 files changed, 801 insertions(+), 1 deletion(-)

--- linus-2.6.orig/drivers/char/tty_io.c
+++ linus-2.6/drivers/char/tty_io.c
@@ -132,6 +132,8 @@ LIST_HEAD(tty_drivers);			/* linked list
    vt.c for deeply disgusting hack reasons */
 DEFINE_MUTEX(tty_mutex);
 
+int console_use_vt = 1;
+
 #ifdef CONFIG_UNIX98_PTYS
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
 extern int pty_limit;		/* Config limit on Unix98 ptys */
@@ -2060,7 +2062,7 @@ retry_open:
 		goto got_driver;
 	}
 #ifdef CONFIG_VT
-	if (device == MKDEV(TTY_MAJOR,0)) {
+	if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) {
 		extern struct tty_driver *console_driver;
 		driver = console_driver;
 		index = fg_console;
@@ -3258,6 +3260,8 @@ static int __init tty_init(void)
 #endif
 
 #ifdef CONFIG_VT
+	if (!console_use_vt)
+		goto out_vt;
 	cdev_init(&vc0_cdev, &console_fops);
 	if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) ||
 	    register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0)
@@ -3266,6 +3270,7 @@ static int __init tty_init(void)
 	class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0");
 
 	vty_init();
+ out_vt:
 #endif
 	return 0;
 }
--- linus-2.6.orig/drivers/xen/Makefile
+++ linus-2.6/drivers/xen/Makefile
@@ -1,3 +1,4 @@
 
 obj-y	+= core/
+obj-y	+= console/
 
--- /dev/null
+++ linus-2.6/drivers/xen/console/Makefile
@@ -0,0 +1,2 @@
+
+obj-y	:= console.o xencons_ring.o
--- /dev/null
+++ linus-2.6/drivers/xen/console/console.c
@@ -0,0 +1,637 @@
+/******************************************************************************
+ * console.c
+ * 
+ * Virtual console driver.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/bootmem.h>
+#include <linux/sysrq.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/uaccess.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/xencons.h>
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Modes:
+ *  'xencons=off'  [XC_OFF]:     Console is disabled.
+ *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
+ *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
+ *                 [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
+ * 
+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
+ * warnings from standard distro startup scripts.
+ */
+static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT;
+static int xc_num = -1;
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static unsigned long sysrq_requested;
+extern int sysrq_enabled;
+#endif
+
+static int __init xencons_setup(char *str)
+{
+	char *q;
+	int n;
+
+	if (!strncmp(str, "ttyS", 4))
+		xc_mode = XC_SERIAL;
+	else if (!strncmp(str, "tty", 3))
+		xc_mode = XC_TTY;
+	else if (!strncmp(str, "off", 3))
+		xc_mode = XC_OFF;
+
+	switch (xc_mode) {
+	case XC_SERIAL:
+		n = simple_strtol(str+4, &q, 10);
+		if (q > (str + 4))
+			xc_num = n;
+		break;
+	case XC_TTY:
+		n = simple_strtol(str+3, &q, 10);
+		if (q > (str + 3))
+			xc_num = n;
+		break;
+	default:
+		break;
+	}
+
+	return 1;
+}
+__setup("xencons=", xencons_setup);
+
+/* The kernel and user-land drivers share a common transmit buffer. */
+static unsigned int wbuf_size = 4096;
+#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
+static char *wbuf;
+static unsigned int wc, wp; /* write_cons, write_prod */
+
+static int __init xencons_bufsz_setup(char *str)
+{
+	unsigned int goal = simple_strtoul(str, NULL, 0);
+	/* Grow by doubling; stop at 2^31 so the shift cannot wrap to 0. */
+	while ((wbuf_size < goal) && (wbuf_size < (1U << 31)))
+		wbuf_size <<= 1;
+	return 1;
+}
+__setup("xencons_bufsz=", xencons_bufsz_setup);
+
+/* This lock protects accesses to the common transmit buffer. */
+static DEFINE_SPINLOCK(xencons_lock);
+
+/* Common transmit-kick routine. */
+static void __xencons_tx_flush(void);
+
+static struct tty_driver *xencons_driver;
+
+/******************** Kernel console driver ********************************/
+
+static void kcons_write(
+	struct console *c, const char *s, unsigned int count)
+{
+	int           i = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+
+	while (i < count) {
+		for (; i < count; i++) {
+			if ((wp - wc) >= (wbuf_size - 1))
+				break;
+			if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
+				wbuf[WBUF_MASK(wp++)] = '\r';
+		}
+
+		__xencons_tx_flush();
+	}
+
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void kcons_write_dom0(
+	struct console *c, const char *s, unsigned int count)
+{
+	int rc;
+
+	while ((count > 0) &&
+	       ((rc = HYPERVISOR_console_io(
+			CONSOLEIO_write, count, (char *)s)) > 0)) {
+		count -= rc;
+		s += rc;
+	}
+}
+
+static struct tty_driver *kcons_device(struct console *c, int *index)
+{
+	*index = 0;
+	return xencons_driver;
+}
+
+static struct console kcons_info = {
+	.device	= kcons_device,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+};
+
+static int __init xen_console_init(void)
+{
+	if (xen_init() < 0)
+		return 0;
+
+	if (xen_start_info->flags & SIF_INITDOMAIN) {
+		if (xc_mode == XC_DEFAULT)
+			xc_mode = XC_SERIAL;
+		kcons_info.write = kcons_write_dom0;
+		if (xc_mode == XC_SERIAL)
+			kcons_info.flags |= CON_ENABLED;
+	} else {
+		if (xc_mode == XC_DEFAULT)
+			xc_mode = XC_TTY;
+		kcons_info.write = kcons_write;
+	}
+
+	switch (xc_mode) {
+	case XC_SERIAL:
+		strcpy(kcons_info.name, "ttyS");
+		if (xc_num == -1)
+			xc_num = 0;
+		break;
+
+	case XC_TTY:
+		strcpy(kcons_info.name, "tty");
+		if (xc_num == -1)
+			xc_num = 1;
+		break;
+
+	default:
+		return 0;
+	}
+
+	wbuf = alloc_bootmem(wbuf_size);
+
+	register_console(&kcons_info);
+
+	return 0;
+}
+console_initcall(xen_console_init);
+
+/*** Useful function for console debugging -- goes straight to Xen. ***/
+asmlinkage int xprintk(const char *fmt, ...)
+{
+	va_list args;
+	int printk_len;
+	static char printk_buf[1024];
+
+	/* Format into the buffer; vscnprintf() returns the bytes stored. */
+	va_start(args, fmt);
+	printk_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+	va_end(args);
+
+	/* Send the processed output directly to Xen. */
+	kcons_write_dom0(NULL, printk_buf, printk_len);
+
+	return 0;
+}
+
+/*** Forcibly flush console data before dying. ***/
+void xencons_force_flush(void)
+{
+	int sz, sent;
+
+	/* Emergency console is synchronous, so there's nothing to flush. */
+	if (xen_start_info->flags & SIF_INITDOMAIN)
+		return;
+
+	/* Spin until flushed to the daemon, one contiguous wbuf run at a time. */
+	while (wc != wp) {
+		sz = wp - wc;
+		if (sz > (wbuf_size - WBUF_MASK(wc)))
+			sz = wbuf_size - WBUF_MASK(wc);
+		sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+		if (sent > 0)
+			wc += sent;
+	}
+}
+
+
+/******************** User-space console driver (/dev/console) ************/
+
+#define DRV(_d)         (_d)
+#define TTY_INDEX(_tty) ((_tty)->index)
+
+static struct termios *xencons_termios[MAX_NR_CONSOLES];
+static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
+static struct tty_struct *xencons_tty;
+static int xencons_priv_irq;
+static char x_char;
+
+void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
+{
+	int           i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	if (xencons_tty == NULL)
+		goto out;
+
+	for (i = 0; i < len; i++) {
+#ifdef CONFIG_MAGIC_SYSRQ
+		if (sysrq_enabled) {
+			if (buf[i] == '\x0f') { /* ^O */
+				sysrq_requested = jiffies;
+				continue; /* don't print the sysrq key */
+			} else if (sysrq_requested) {
+				unsigned long sysrq_timeout =
+					sysrq_requested + HZ*2;
+				sysrq_requested = 0;
+				if (time_before(jiffies, sysrq_timeout)) {
+					spin_unlock_irqrestore(
+						&xencons_lock, flags);
+					handle_sysrq(
+						buf[i], regs, xencons_tty);
+					spin_lock_irqsave(
+						&xencons_lock, flags);
+					continue;
+				}
+			}
+		}
+#endif
+		tty_insert_flip_char(xencons_tty, buf[i], 0);
+	}
+	tty_flip_buffer_push(xencons_tty);
+
+ out:
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void __xencons_tx_flush(void)
+{
+	int sent, sz, work_done = 0;
+
+	if (x_char) {
+		if (xen_start_info->flags & SIF_INITDOMAIN)
+			kcons_write_dom0(NULL, &x_char, 1);
+		else
+			while (x_char)
+				if (xencons_ring_send(&x_char, 1) == 1)
+					break;
+		x_char = 0;
+		work_done = 1;
+	}
+
+	while (wc != wp) {
+		sz = wp - wc;
+		if (sz > (wbuf_size - WBUF_MASK(wc)))
+			sz = wbuf_size - WBUF_MASK(wc);
+		if (xen_start_info->flags & SIF_INITDOMAIN) {
+			kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
+			wc += sz;
+		} else {
+			sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+			if (sent == 0)
+				break;
+			wc += sent;
+		}
+		work_done = 1;
+	}
+
+	if (work_done && (xencons_tty != NULL)) {
+		wake_up_interruptible(&xencons_tty->write_wait);
+		if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) &&
+		    (xencons_tty->ldisc.write_wakeup != NULL))
+			(xencons_tty->ldisc.write_wakeup)(xencons_tty);
+	}
+}
+
+void xencons_tx(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	__xencons_tx_flush();
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/* Privileged receive callback and transmit kicker. */
+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
+                                          struct pt_regs *regs)
+{
+	static char rbuf[16];
+	int         l;
+
+	while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
+		xencons_rx(rbuf, l, regs);
+
+	xencons_tx();
+
+	return IRQ_HANDLED;
+}
+
+static int xencons_write_room(struct tty_struct *tty)
+{
+	return wbuf_size - (wp - wc);
+}
+
+static int xencons_chars_in_buffer(struct tty_struct *tty)
+{
+	return wp - wc;
+}
+
+static void xencons_send_xchar(struct tty_struct *tty, char ch)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	x_char = ch;
+	__xencons_tx_flush();
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void xencons_throttle(struct tty_struct *tty)
+{
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	if (I_IXOFF(tty))
+		xencons_send_xchar(tty, STOP_CHAR(tty));
+}
+
+static void xencons_unthrottle(struct tty_struct *tty)
+{
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	if (I_IXOFF(tty)) {
+		if (x_char != 0)
+			x_char = 0;
+		else
+			xencons_send_xchar(tty, START_CHAR(tty));
+	}
+}
+
+static void xencons_flush_buffer(struct tty_struct *tty)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	wc = wp = 0;
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static inline int __xencons_put_char(int ch)
+{
+	char _ch = (char)ch;
+	if ((wp - wc) == wbuf_size)
+		return 0;
+	wbuf[WBUF_MASK(wp++)] = _ch;
+	return 1;
+}
+
+static int xencons_write(
+	struct tty_struct *tty,
+	const unsigned char *buf,
+	int count)
+{
+	int i;
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return count;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+
+	for (i = 0; i < count; i++)
+		if (!__xencons_put_char(buf[i]))
+			break;
+
+	if (i != 0)
+		__xencons_tx_flush();
+
+	spin_unlock_irqrestore(&xencons_lock, flags);
+
+	return i;
+}
+
+static void xencons_put_char(struct tty_struct *tty, u_char ch)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	(void)__xencons_put_char(ch);
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void xencons_flush_chars(struct tty_struct *tty)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	__xencons_tx_flush();
+	spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	unsigned long orig_jiffies = jiffies;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	while (DRV(tty->driver)->chars_in_buffer(tty)) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(1);
+		if (signal_pending(current))
+			break;
+		if (timeout && time_after(jiffies, orig_jiffies + timeout))
+			break;
+	}
+
+	set_current_state(TASK_RUNNING);
+}
+
+static int xencons_open(struct tty_struct *tty, struct file *filp)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return 0;
+
+	spin_lock_irqsave(&xencons_lock, flags);
+	tty->driver_data = NULL;
+	if (xencons_tty == NULL)
+		xencons_tty = tty;
+	__xencons_tx_flush();
+	spin_unlock_irqrestore(&xencons_lock, flags);
+
+	return 0;
+}
+
+static void xencons_close(struct tty_struct *tty, struct file *filp)
+{
+	unsigned long flags;
+
+	if (TTY_INDEX(tty) != 0)
+		return;
+
+	if (tty->count == 1) {
+		tty->closing = 1;
+		tty_wait_until_sent(tty, 0);
+		if (DRV(tty->driver)->flush_buffer != NULL)
+			DRV(tty->driver)->flush_buffer(tty);
+		if (tty->ldisc.flush_buffer != NULL)
+			tty->ldisc.flush_buffer(tty);
+		tty->closing = 0;
+		spin_lock_irqsave(&xencons_lock, flags);
+		xencons_tty = NULL;
+		spin_unlock_irqrestore(&xencons_lock, flags);
+	}
+}
+
+static struct tty_operations xencons_ops = {
+	.open = xencons_open,
+	.close = xencons_close,
+	.write = xencons_write,
+	.write_room = xencons_write_room,
+	.put_char = xencons_put_char,
+	.flush_chars = xencons_flush_chars,
+	.chars_in_buffer = xencons_chars_in_buffer,
+	.send_xchar = xencons_send_xchar,
+	.flush_buffer = xencons_flush_buffer,
+	.throttle = xencons_throttle,
+	.unthrottle = xencons_unthrottle,
+	.wait_until_sent = xencons_wait_until_sent,
+};
+
+static int __init xencons_init(void)
+{
+	int rc;
+
+	if (xen_init() < 0)
+		return -ENODEV;
+
+	if (xc_mode == XC_OFF)
+		return 0;
+
+	xencons_ring_init();
+
+	xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ?
+					  1 : MAX_NR_CONSOLES);
+	if (xencons_driver == NULL)
+		return -ENOMEM;
+
+	DRV(xencons_driver)->name            = "xencons";
+	DRV(xencons_driver)->major           = TTY_MAJOR;
+	DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
+	DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
+	DRV(xencons_driver)->init_termios    = tty_std_termios;
+	DRV(xencons_driver)->flags           =
+		TTY_DRIVER_REAL_RAW |
+		TTY_DRIVER_RESET_TERMIOS |
+		TTY_DRIVER_NO_DEVFS;
+	DRV(xencons_driver)->termios         = xencons_termios;
+	DRV(xencons_driver)->termios_locked  = xencons_termios_locked;
+
+	if (xc_mode == XC_SERIAL) {
+		DRV(xencons_driver)->name        = "ttyS";
+		DRV(xencons_driver)->minor_start = 64 + xc_num;
+		DRV(xencons_driver)->name_base   = 0 + xc_num;
+	} else {
+		DRV(xencons_driver)->name        = "tty";
+		DRV(xencons_driver)->minor_start = xc_num;
+		DRV(xencons_driver)->name_base   = xc_num;
+	}
+
+	tty_set_operations(xencons_driver, &xencons_ops);
+
+	if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
+		printk("WARNING: Failed to register Xen virtual "
+		       "console driver as '%s%d'\n",
+		       DRV(xencons_driver)->name,
+		       DRV(xencons_driver)->name_base);
+		put_tty_driver(xencons_driver);
+		xencons_driver = NULL;
+		return rc;
+	}
+
+	tty_register_device(xencons_driver, 0, NULL);
+
+	if (xen_start_info->flags & SIF_INITDOMAIN) {
+		xencons_priv_irq = bind_virq_to_irqhandler(
+			VIRQ_CONSOLE,
+			0,
+			xencons_priv_interrupt,
+			0,
+			"console",
+			NULL);
+		BUG_ON(xencons_priv_irq < 0);
+	}
+
+	printk("Xen virtual console successfully installed as %s%d\n",
+	       DRV(xencons_driver)->name,
+	       DRV(xencons_driver)->name_base );
+
+	return 0;
+}
+
+module_init(xencons_init);
--- /dev/null
+++ linus-2.6/drivers/xen/console/xencons_ring.c
@@ -0,0 +1,141 @@
+/* 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/xencons.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <xen/interface/io/console.h>
+
+static int xencons_irq;
+
+static inline struct xencons_interface *xencons_interface(void)
+{
+	return mfn_to_virt(xen_start_info->console_mfn);
+}
+
+static inline void notify_daemon(void)
+{
+	/* Use evtchn: this is called early, before irq is set up. */
+	notify_remote_via_evtchn(xen_start_info->console_evtchn);
+}
+
+int xencons_ring_send(const char *data, unsigned len)
+{
+	int sent = 0;
+	struct xencons_interface *intf = xencons_interface();
+	XENCONS_RING_IDX cons, prod;
+
+	cons = intf->out_cons;
+	prod = intf->out_prod;
+	mb();
+	BUG_ON((prod - cons) > sizeof(intf->out));
+
+	while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
+		intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
+
+	wmb();
+	intf->out_prod = prod;
+
+	notify_daemon();
+
+	return sent;
+}
+
+static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
+{
+	struct xencons_interface *intf = xencons_interface();
+	XENCONS_RING_IDX cons, prod;
+
+	cons = intf->in_cons;
+	prod = intf->in_prod;
+	mb();
+	BUG_ON((prod - cons) > sizeof(intf->in));
+
+	while (cons != prod) {
+		xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
+		cons++;
+	}
+
+	mb();
+	intf->in_cons = cons;
+
+	notify_daemon();
+
+	xencons_tx();
+
+	return IRQ_HANDLED;
+}
+
+int xencons_ring_init(void)
+{
+	int err;
+
+	if (xencons_irq)
+		unbind_from_irqhandler(xencons_irq, NULL);
+	xencons_irq = 0;
+
+	if (!xen_start_info->console_evtchn)
+		return 0;
+
+	err = bind_evtchn_to_irqhandler(
+		xen_start_info->console_evtchn,
+		handle_input, 0, "xencons", NULL);
+	if (err <= 0) {
+		printk(KERN_ERR "XEN console request irq failed %i\n", err);
+		return err;
+	}
+
+	xencons_irq = err;
+
+	/* In case we have in-flight data after save/restore... */
+	notify_daemon();
+
+	return 0;
+}
+
+void xencons_resume(void)
+{
+	(void)xencons_ring_init();
+}
--- /dev/null
+++ linus-2.6/include/xen/xencons.h
@@ -0,0 +1,14 @@
+#ifndef __ASM_XENCONS_H__
+#define __ASM_XENCONS_H__
+
+void xencons_force_flush(void);
+void xencons_resume(void);
+
+/* Interrupt work hooks. Receive data, or kick data out. */
+void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
+void xencons_tx(void);
+
+int xencons_ring_init(void);
+int xencons_ring_send(const char *data, unsigned len);
+
+#endif /* __ASM_XENCONS_H__ */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 30/35] Add apply_to_page_range() function
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (28 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 29/35] Add the Xen virtual console driver Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 31/35] Add Xen grant table support Chris Wright
                   ` (5 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: apply-to-page-range --]
[-- Type: text/plain, Size: 4007 bytes --]

Add a new mm function apply_to_page_range() which applies a given
function to every pte in a given virtual address range in a given mm
structure. This is a generic alternative to cut-and-pasting the Linux
idiomatic pagetable walking code in every place that a sequence of
PTEs must be accessed.

Although this interface is intended to be useful in a wide range of
situations, it is currently used specifically by several Xen
subsystems, for example: to ensure that pagetables have been allocated
for a virtual address range, and to construct batched special
pagetable update requests to map I/O memory (in ioremap()).

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 include/linux/mm.h |    5 ++
 mm/memory.c        |   94 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)

--- linus-2.6.orig/include/linux/mm.h
+++ linus-2.6/include/linux/mm.h
@@ -1014,6 +1014,11 @@ struct page *follow_page(struct vm_area_
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
+typedef int (*pte_fn_t)(pte_t *pte, struct page *pmd_page, unsigned long addr,
+			void *data);
+extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
+			       unsigned long size, pte_fn_t fn, void *data);
+
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
--- linus-2.6.orig/mm/memory.c
+++ linus-2.6/mm/memory.c
@@ -1356,6 +1356,100 @@ int remap_pfn_range(struct vm_area_struc
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+static inline int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
+{
+	pte_t *pte, *first_pte;
+	int err;
+	struct page *pmd_page;
+	spinlock_t *ptl;
+
+	/* Allocate the pte page if needed; non-kernel mms map and lock it. */
+	first_pte = pte = (mm == &init_mm) ?
+		pte_alloc_kernel(pmd, addr) :
+		pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+
+	BUG_ON(pmd_huge(*pmd));
+	pmd_page = pmd_page(*pmd);
+
+	do {
+		err = fn(pte, pmd_page, addr, data);
+		if (err)
+			break;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	/* pte-1 can leave the mapped page if the first fn() call fails. */
+	if (mm != &init_mm)
+		pte_unmap_unlock(first_pte, ptl);
+	return err;
+}
+
+static inline int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+		err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+	return err;
+}
+
+static inline int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+				     unsigned long addr, unsigned long end,
+				     pte_fn_t fn, void *data)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err;
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+		err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+	return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table entry (pte).
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+			unsigned long size, pte_fn_t fn, void *data)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry
  * which was read non-atomically.  Before making any commitment, on

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 31/35] Add Xen grant table support
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (29 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 30/35] Add apply_to_page_range() function Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 32/35] Add Xen driver utility functions Chris Wright
                   ` (4 subsequent siblings)
  35 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: grant-table --]
[-- Type: text/plain, Size: 16337 bytes --]

Add Xen 'grant table' driver which allows granting of access to
selected local memory pages by other virtual machines and,
symmetrically, the mapping of remote memory pages which other virtual
machines have granted access to.

This driver is a prerequisite for many of the Xen virtual device
drivers, which grant the 'device driver domain' restricted and
temporary access to only those memory pages that are currently
involved in I/O operations.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/xen/core/Makefile |    2 
 drivers/xen/core/gnttab.c |  422 ++++++++++++++++++++++++++++++++++++++++++++++
 include/xen/gnttab.h      |  105 +++++++++++
 3 files changed, 528 insertions(+), 1 deletion(-)

--- linus-2.6.orig/drivers/xen/core/Makefile
+++ linus-2.6/drivers/xen/core/Makefile
@@ -1,3 +1,3 @@
 
-obj-y	:= features.o
+obj-y	:= features.o gnttab.o
 
--- /dev/null
+++ linus-2.6/include/xen/gnttab.h
@@ -0,0 +1,105 @@
+/******************************************************************************
+ * gnttab.h
+ * 
+ * Two sets of functionality:
+ * 1. Granting foreign access to our memory reservation.
+ * 2. Accessing others' memory reservations via grant references.
+ * (i.e., mechanisms for both sender and recipient of grant references)
+ * 
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2005, Christopher Clark
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_GNTTAB_H__
+#define __ASM_GNTTAB_H__
+
+#include <linux/config.h>
+#include <asm/hypervisor.h>
+#include <xen/interface/grant_table.h>
+
+/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
+#define NR_GRANT_FRAMES 4
+
+struct gnttab_free_callback {
+	struct gnttab_free_callback *next;
+	void (*fn)(void *);
+	void *arg;
+	u16 count;
+};
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+				int readonly);
+
+/*
+ * End access through the given grant reference, iff the grant entry is no
+ * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
+ * use.
+ */
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
+
+/*
+ * Eventually end access through the given grant reference, and once that
+ * access has been ended, free the given page too.  Access will be ended
+ * immediately iff the grant entry is not in use, otherwise it will happen
+ * some time later.  page may be 0, in which case no freeing will occur.
+ */
+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
+			       unsigned long page);
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
+
+int gnttab_query_foreign_access(grant_ref_t ref);
+
+/*
+ * operations on reserved batches of grant references
+ */
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
+
+void gnttab_free_grant_reference(grant_ref_t ref);
+
+void gnttab_free_grant_references(grant_ref_t head);
+
+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+				    grant_ref_t release);
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+				  void (*fn)(void *), void *arg, u16 count);
+
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+				     unsigned long frame, int readonly);
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+				       unsigned long pfn);
+
+#define gnttab_map_vaddr(map) ((void *)((map).host_virt_addr))
+
+#endif /* __ASM_GNTTAB_H__ */
--- /dev/null
+++ linus-2.6/drivers/xen/core/gnttab.c
@@ -0,0 +1,422 @@
+/******************************************************************************
+ * gnttab.c
+ * 
+ * Granting foreign access to our memory reservation.
+ * 
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable.h>
+#include <xen/interface/xen.h>
+#include <asm/fixmap.h>
+#include <asm/uaccess.h>
+#include <xen/gnttab.h>
+#include <asm/bitops.h>
+
+#define WPRINTK(fmt, args...)				\
+	printk(KERN_WARNING "xen_grant: " fmt, ##args)
+
+
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
+
+/* External tools reserve first few grant table entries. */
+#define NR_RESERVED_ENTRIES 8
+
+#define NR_GRANT_ENTRIES \
+	(NR_GRANT_FRAMES * PAGE_SIZE / sizeof(struct grant_entry))
+#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1)
+
+static grant_ref_t gnttab_list[NR_GRANT_ENTRIES];
+static int gnttab_free_count;
+static grant_ref_t gnttab_free_head;
+static DEFINE_SPINLOCK(gnttab_list_lock);
+/* Grant table mapping, set up by gnttab_resume(). */
+static struct grant_entry *shared;
+/* Pending requests queued by gnttab_request_free_callback(). */
+static struct gnttab_free_callback *gnttab_free_callback_list;
+
+/* Unlink count (>= 1) free refs as a private list; -1 if not possible. */
+static int get_free_entries(int count)
+{
+	unsigned long flags;
+	int ref = -1;
+	grant_ref_t head;
+	spin_lock_irqsave(&gnttab_list_lock, flags);
+	if (count < 1 || gnttab_free_count < count)
+		goto out;
+	ref = head = gnttab_free_head;
+	gnttab_free_count -= count;
+	while (count-- > 1)
+		head = gnttab_list[head];
+	gnttab_free_head = gnttab_list[head];
+	gnttab_list[head] = GNTTAB_LIST_END;
+ out:
+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+	return ref;
+}
+
+#define get_free_entry() get_free_entries(1)
+
+static void do_free_callbacks(void)
+{
+	struct gnttab_free_callback *callback, *next;
+
+	callback = gnttab_free_callback_list;
+	gnttab_free_callback_list = NULL;
+
+	while (callback != NULL) {
+		next = callback->next;
+		if (gnttab_free_count >= callback->count) {
+			callback->next = NULL;
+			callback->fn(callback->arg);
+		} else {
+			callback->next = gnttab_free_callback_list;
+			gnttab_free_callback_list = callback;
+		}
+		callback = next;
+	}
+}
+
+static inline void check_free_callbacks(void)
+{
+	if (unlikely(gnttab_free_callback_list))
+		do_free_callbacks();
+}
+
+static void put_free_entry(grant_ref_t ref)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&gnttab_list_lock, flags);
+	gnttab_list[ref] = gnttab_free_head;
+	gnttab_free_head = ref;
+	gnttab_free_count++;
+	check_free_callbacks();
+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+				int readonly)
+{
+	int ref;
+
+	if (unlikely((ref = get_free_entry()) == -1))
+		return -ENOSPC;
+
+	shared[ref].frame = frame;
+	shared[ref].domid = domid;
+	wmb();
+	shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
+
+	return ref;
+}
+
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+				     unsigned long frame, int readonly)
+{
+	shared[ref].frame = frame;
+	shared[ref].domid = domid;
+	wmb();
+	shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
+}
+
+
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+	u16 nflags;
+
+	nflags = shared[ref].flags;
+
+	return (nflags & (GTF_reading|GTF_writing));
+}
+
+int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+{
+	u16 flags, nflags;
+
+	nflags = shared[ref].flags;
+	do {
+		if ((flags = nflags) & (GTF_reading|GTF_writing)) {
+			printk(KERN_ALERT "WARNING: g.e. still in use!\n");
+			return 0;
+		}
+	} while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) !=
+		 flags);
+
+	return 1;
+}
+
+void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
+			       unsigned long page)
+{
+	if (gnttab_end_foreign_access_ref(ref, readonly)) {
+		put_free_entry(ref);
+		if (page != 0)
+			free_page(page);
+	} else {
+		/* XXX This needs to be fixed so that the ref and page are
+		   placed on a list to be freed up later. */
+		printk(KERN_WARNING
+		       "WARNING: leaking g.e. and page still in use!\n");
+	}
+}
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+{
+	int ref;
+
+	if (unlikely((ref = get_free_entry()) == -1))
+		return -ENOSPC;
+	gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
+
+	return ref;
+}
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+				       unsigned long pfn)
+{
+	shared[ref].frame = pfn;
+	shared[ref].domid = domid;
+	wmb();
+	shared[ref].flags = GTF_accept_transfer;
+}
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+	unsigned long frame;
+	u16           flags;
+
+	/*
+	 * If a transfer has not even started yet, try to reclaim the grant
+	 * reference and return failure (== 0).
+	 */
+	while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
+		if (synch_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+			return 0;
+		cpu_relax();
+	}
+
+	/* If a transfer is in progress then wait until it is completed. */
+	while (!(flags & GTF_transfer_completed)) {
+		flags = shared[ref].flags;
+		cpu_relax();
+	}
+
+	/* Read the frame number /after/ reading completion status. */
+	rmb();
+	frame = shared[ref].frame;
+	BUG_ON(frame == 0);
+
+	return frame;
+}
+
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
+{
+	unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
+	put_free_entry(ref);
+	return frame;
+}
+
+void gnttab_free_grant_reference(grant_ref_t ref)
+{
+	put_free_entry(ref);
+}
+
+void gnttab_free_grant_references(grant_ref_t head)
+{
+	grant_ref_t ref;
+	unsigned long flags;
+	int count = 1;
+	if (head == GNTTAB_LIST_END)
+		return;
+	spin_lock_irqsave(&gnttab_list_lock, flags);
+	ref = head;
+	while (gnttab_list[ref] != GNTTAB_LIST_END) {
+		ref = gnttab_list[ref];
+		count++;
+	}
+	gnttab_list[ref] = gnttab_free_head;
+	gnttab_free_head = head;
+	gnttab_free_count += count;
+	check_free_callbacks();
+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
+{
+	int h = get_free_entries(count);
+
+	if (h == -1)
+		return -ENOSPC;
+
+	*head = h;
+
+	return 0;
+}
+
+int gnttab_claim_grant_reference(grant_ref_t *private_head)
+{
+	grant_ref_t g = *private_head;
+	if (unlikely(g == GNTTAB_LIST_END))
+		return -ENOSPC;
+	*private_head = gnttab_list[g];
+	return g;
+}
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+				    grant_ref_t release)
+{
+	gnttab_list[release] = *private_head;
+	*private_head = release;
+}
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+				  void (*fn)(void *), void *arg, u16 count)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&gnttab_list_lock, flags);
+	if (callback->next)
+		goto out;
+	callback->fn = fn;
+	callback->arg = arg;
+	callback->count = count;
+	callback->next = gnttab_free_callback_list;
+	gnttab_free_callback_list = callback;
+	check_free_callbacks();
+ out:
+	spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+#ifndef __ia64__
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+		      unsigned long addr, void *data)
+{
+	unsigned long **frames = (unsigned long **)data;
+
+	set_pte_at(&init_mm, addr, pte, pfn_pte((*frames)[0], PAGE_KERNEL));
+	(*frames)++;
+	return 0;
+}
+
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+			unsigned long addr, void *data)
+{
+
+	set_pte_at(&init_mm, addr, pte, __pte(0));
+	return 0;
+}
+#endif
+
+int gnttab_resume(void)
+{
+	struct gnttab_setup_table setup;
+	unsigned long frames[NR_GRANT_FRAMES];
+	int rc;
+#ifndef __ia64__
+	void *pframes = frames;
+	struct vm_struct *area;
+#endif
+
+	setup.dom        = DOMID_SELF;
+	setup.nr_frames  = NR_GRANT_FRAMES;
+	setup.frame_list = frames;
+
+	rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
+	BUG_ON(rc || setup.status);
+
+#ifndef __ia64__
+	if (shared == NULL) {
+		area = get_vm_area(PAGE_SIZE * NR_GRANT_FRAMES, VM_IOREMAP);
+		BUG_ON(area == NULL);
+		shared = area->addr;
+	}
+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+				 PAGE_SIZE * NR_GRANT_FRAMES,
+				 map_pte_fn, &pframes);
+	BUG_ON(rc);
+#else
+	shared = __va(frames[0] << PAGE_SHIFT);
+	printk("grant table at %p\n", shared);
+#endif
+
+	return 0;
+}
+
+int gnttab_suspend(void)
+{
+
+#ifndef __ia64__
+	apply_to_page_range(&init_mm, (unsigned long)shared,
+			    PAGE_SIZE * NR_GRANT_FRAMES,
+			    unmap_pte_fn, NULL);
+#endif
+
+	return 0;
+}
+
+static int __init gnttab_init(void)
+{
+	int i;
+
+	if (xen_init() < 0)
+		return -ENODEV;
+
+	BUG_ON(gnttab_resume());
+
+	for (i = NR_RESERVED_ENTRIES; i < NR_GRANT_ENTRIES; i++)
+		gnttab_list[i] = i + 1;
+	gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES;
+	gnttab_free_head  = NR_RESERVED_ENTRIES;
+
+	printk("Grant table initialized\n");
+	return 0;
+}
+
+core_initcall(gnttab_init);

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 32/35] Add Xen driver utility functions.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (30 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 31/35] Add Xen grant table support Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 19:48   ` Greg KH
  2006-05-09 21:50   ` Andi Kleen
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
                   ` (3 subsequent siblings)
  35 siblings, 2 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach, Jan Beulich

[-- Attachment #1: driver-util --]
[-- Type: text/plain, Size: 3481 bytes --]

Allocate/destroy a 'vmalloc' VM area: alloc_vm_area and free_vm_area
The alloc function ensures that page tables are constructed for the
region of kernel virtual address space and mapped into init_mm.

Lock an area so that PTEs are accessible in the current address space:
lock_vm_area and unlock_vm_area
The lock function prevents context switches to a lazy mm that doesn't
have the area mapped into its page tables.  It also ensures that the
page tables are mapped into the current mm by causing the page fault
handler to copy the page directory pointers from init_mm into the
current mm.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: "Jan Beulich" <JBeulich@novell.com>
---
TODO:
- possible vmalloc_sync use instead

 drivers/xen/Makefile      |    2 +
 drivers/xen/util.c        |   70 ++++++++++++++++++++++++++++++++++++++++++++++
 include/xen/driver_util.h |   16 ++++++++++
 3 files changed, 88 insertions(+)

--- linus-2.6.orig/drivers/xen/Makefile
+++ linus-2.6/drivers/xen/Makefile
@@ -1,4 +1,6 @@
 
+obj-y	+= util.o
+
 obj-y	+= core/
 obj-y	+= console/
 
--- /dev/null
+++ linus-2.6/drivers/xen/util.c
@@ -0,0 +1,70 @@
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <xen/driver_util.h>
+
+static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+	/* apply_to_page_range() does all the hard work. */
+	return 0;
+}
+
+struct vm_struct *alloc_vm_area(unsigned long size)
+{
+	struct vm_struct *area;
+
+	area = get_vm_area(size, VM_IOREMAP);
+	if (area == NULL)
+		return NULL;
+
+	/*
+	 * This ensures that page tables are constructed for this region
+	 * of kernel virtual address space and mapped into init_mm.
+	 */
+	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+				area->size, f, NULL)) {
+		free_vm_area(area);
+		return NULL;
+	}
+
+	return area;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+	struct vm_struct *ret;
+	ret = remove_vm_area(area->addr);
+	BUG_ON(ret != area);
+	kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+void lock_vm_area(struct vm_struct *area)
+{
+	unsigned long i;
+	char c;
+
+	/*
+	 * Prevent context switch to a lazy mm that doesn't have this area
+	 * mapped into its page tables.
+	 */
+	preempt_disable();
+
+	/*
+	 * Ensure that the page tables are mapped into the current mm. The
+	 * page-fault path will copy the page directory pointers from init_mm.
+	 */
+	for (i = 0; i < area->size; i += PAGE_SIZE)
+		(void)__get_user(c, (char __user *)area->addr + i);
+}
+EXPORT_SYMBOL_GPL(lock_vm_area);
+
+void unlock_vm_area(struct vm_struct *area)
+{
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(unlock_vm_area);
--- /dev/null
+++ linus-2.6/include/xen/driver_util.h
@@ -0,0 +1,16 @@
+
+#ifndef __ASM_XEN_DRIVER_UTIL_H__
+#define __ASM_XEN_DRIVER_UTIL_H__
+
+#include <linux/config.h>
+#include <linux/vmalloc.h>
+
+/* Allocate/destroy a 'vmalloc' VM area. */
+extern struct vm_struct *alloc_vm_area(unsigned long size);
+extern void free_vm_area(struct vm_struct *area);
+
+/* Lock an area so that PTEs are accessible in the current address space. */
+extern void lock_vm_area(struct vm_struct *area);
+extern void unlock_vm_area(struct vm_struct *area);
+
+#endif /* __ASM_XEN_DRIVER_UTIL_H__ */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (31 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 32/35] Add Xen driver utility functions Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 16:06   ` Alexey Dobriyan
                     ` (3 more replies)
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
                   ` (2 subsequent siblings)
  35 siblings, 4 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: xenbus --]
[-- Type: text/plain, Size: 76782 bytes --]

This communicates with the machine control software via a registry
residing in a controlling virtual machine. This allows dynamic
creation, destruction and modification of virtual device
configurations (network devices, block devices and CPUs, to name some
examples).

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/xen/Makefile               |    1 
 drivers/xen/xenbus/Makefile        |    7 
 drivers/xen/xenbus/xenbus_client.c |  395 +++++++++++++
 drivers/xen/xenbus/xenbus_comms.c  |  208 +++++++
 drivers/xen/xenbus/xenbus_comms.h  |   43 +
 drivers/xen/xenbus/xenbus_probe.c  | 1065 +++++++++++++++++++++++++++++++++++++
 drivers/xen/xenbus/xenbus_xs.c     |  829 ++++++++++++++++++++++++++++
 include/xen/xenbus.h               |  291 ++++++++++
 8 files changed, 2839 insertions(+)

--- linus-2.6.orig/drivers/xen/Makefile
+++ linus-2.6/drivers/xen/Makefile
@@ -3,4 +3,5 @@ obj-y	+= util.o
 
 obj-y	+= core/
 obj-y	+= console/
+obj-y	+= xenbus/
 
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/Makefile
@@ -0,0 +1,7 @@
+obj-y	+= xenbus.o
+
+xenbus-objs =
+xenbus-objs += xenbus_client.o
+xenbus-objs += xenbus_comms.o
+xenbus-objs += xenbus_xs.o
+xenbus-objs += xenbus_probe.o
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/xenbus_client.c
@@ -0,0 +1,395 @@
+/******************************************************************************
+ * Client-facing interface for the Xenbus driver.  In other words, the
+ * interface between the Xenbus and the device-specific code, be it the
+ * frontend or the backend of that driver.
+ *
+ * Copyright (C) 2005 XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+#include <xen/driver_util.h>
+
+/* xenbus_probe.c */
+extern char *kasprintf(const char *fmt, ...);
+
+#define DPRINTK(fmt, args...) \
+    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+
+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
+		      struct xenbus_watch *watch,
+		      void (*callback)(struct xenbus_watch *,
+				       const char **, unsigned int))
+{
+	int err;
+
+	watch->node = path;
+	watch->callback = callback;
+
+	err = register_xenbus_watch(watch);
+
+	if (err) {
+		watch->node = NULL;
+		watch->callback = NULL;
+		xenbus_dev_fatal(dev, err, "adding watch on %s", path);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path);
+
+
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+		       const char *path2, struct xenbus_watch *watch,
+		       void (*callback)(struct xenbus_watch *,
+					const char **, unsigned int))
+{
+	int err;
+	char *state = kasprintf("%s/%s", path, path2);
+	if (!state) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
+		return -ENOMEM;
+	}
+	err = xenbus_watch_path(dev, state, watch, callback);
+
+	if (err)
+		kfree(state);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path2);
+
+
+int xenbus_switch_state(struct xenbus_device *dev,
+			xenbus_transaction_t xbt,
+			XenbusState state)
+{
+	/* We check whether the state is currently set to the given value, and
+	   if not, then the state is set.  We don't want to unconditionally
+	   write the given state, because we don't want to fire watches
+	   unnecessarily.  Furthermore, if the node has gone, we don't write
+	   to it, as the device will be tearing down, and we don't want to
+	   resurrect that directory.
+	 */
+
+	int current_state;
+	int err;
+
+	if (state == dev->state)
+		return 0;
+
+	err = xenbus_scanf(xbt, dev->nodename, "state", "%d",
+			       &current_state);
+	if (err != 1)
+		return 0;
+
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
+	if (err) {
+		if (state != XenbusStateClosing) /* Avoid looping */
+			xenbus_dev_fatal(dev, err, "writing new state");
+		return err;
+	}
+
+	dev->state = state;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_switch_state);
+
+
+/**
+ * Return the path to the error node for the given device, or NULL on failure.
+ * A non-NULL return value must be freed by the caller with kfree().
+ */
+static char *error_path(struct xenbus_device *dev)
+{
+	return kasprintf("error/%s", dev->nodename);
+}
+
+
+void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
+		va_list ap)
+{
+	int ret;
+	unsigned int len;
+	char *printf_buffer = NULL, *path_buffer = NULL;
+
+#define PRINTF_BUFFER_SIZE 4096
+	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+	if (printf_buffer == NULL)
+		goto fail;
+
+	len = sprintf(printf_buffer, "%i ", -err);
+	ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
+
+	BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
+
+	dev_err(&dev->dev, "%s\n", printf_buffer);
+
+	path_buffer = error_path(dev);
+
+	if (path_buffer == NULL) {
+		printk("xenbus: failed to write error node for %s (%s)\n",
+		       dev->nodename, printf_buffer);
+		goto fail;
+	}
+
+	if (xenbus_write(XBT_NULL, path_buffer, "error", printf_buffer) != 0) {
+		printk("xenbus: failed to write error node for %s (%s)\n",
+		       dev->nodename, printf_buffer);
+		goto fail;
+	}
+
+fail:
+	kfree(printf_buffer);
+	kfree(path_buffer);
+}
+
+
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+		      ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_error);
+
+
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
+		      ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	_dev_error(dev, err, fmt, ap);
+	va_end(ap);
+
+	xenbus_switch_state(dev, XBT_NULL, XenbusStateClosing);
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
+
+
+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
+{
+	int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
+	if (err < 0)
+		xenbus_dev_fatal(dev, err, "granting access to ring page");
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_grant_ring);
+
+
+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
+{
+	struct evtchn_op op = {
+		.cmd = EVTCHNOP_alloc_unbound,
+		.u.alloc_unbound.dom = DOMID_SELF,
+		.u.alloc_unbound.remote_dom = dev->otherend_id
+	};
+	int err = HYPERVISOR_event_channel_op(&op);
+	if (err)
+		xenbus_dev_fatal(dev, err, "allocating event channel");
+	else
+		*port = op.u.alloc_unbound.port;
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
+
+
+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
+{
+	struct evtchn_op op = {
+		.cmd = EVTCHNOP_bind_interdomain,
+		.u.bind_interdomain.remote_dom = dev->otherend_id,
+		.u.bind_interdomain.remote_port = remote_port,
+	};
+	int err = HYPERVISOR_event_channel_op(&op);
+	if (err)
+		xenbus_dev_fatal(dev, err,
+				 "binding to event channel %d from domain %d",
+				 remote_port, dev->otherend_id);
+	else
+		*port = op.u.bind_interdomain.local_port;
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
+
+
+int xenbus_free_evtchn(struct xenbus_device *dev, int port)
+{
+	struct evtchn_op op = {
+		.cmd = EVTCHNOP_close,
+		.u.close.port = port,
+	};
+	int err = HYPERVISOR_event_channel_op(&op);
+	if (err)
+		xenbus_dev_error(dev, err, "freeing event channel %d", port);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
+
+/* Based on Rusty Russell's skeleton driver's map_page */
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
+{
+	struct gnttab_map_grant_ref op = {
+		.flags = GNTMAP_host_map,
+		.ref   = gnt_ref,
+		.dom   = dev->otherend_id,
+	};
+	struct vm_struct *area;
+
+	*vaddr = NULL;
+
+	area = alloc_vm_area(PAGE_SIZE);
+	if (!area)
+		return -ENOMEM;
+
+	op.host_addr = (unsigned long)area->addr;
+
+	lock_vm_area(area);
+	BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
+	unlock_vm_area(area);
+
+	if (op.status != GNTST_okay) {
+		free_vm_area(area);
+		xenbus_dev_fatal(dev, op.status,
+				 "mapping in shared page %d from domain %d",
+				 gnt_ref, dev->otherend_id);
+		return op.status;
+	}
+
+	/* Stuff the handle in an unused field */
+	area->phys_addr = (unsigned long)op.handle;
+
+	*vaddr = area->addr;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+		    grant_handle_t *handle, void *vaddr)
+{
+	struct gnttab_map_grant_ref op = {
+		.host_addr = (unsigned long)vaddr,
+		.flags     = GNTMAP_host_map,
+		.ref       = gnt_ref,
+		.dom       = dev->otherend_id,
+	};
+
+	BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1));
+
+	if (op.status != GNTST_okay) {
+		xenbus_dev_fatal(dev, op.status,
+				 "mapping in shared page %d from domain %d",
+				 gnt_ref, dev->otherend_id);
+	} else
+		*handle = op.handle;
+
+	return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring);
+
+
+/* Based on Rusty Russell's skeleton driver's unmap_page */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
+{
+	struct vm_struct *area;
+	struct gnttab_unmap_grant_ref op = {
+		.host_addr = (unsigned long)vaddr,
+	};
+
+	/* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
+	 * method so that we don't have to muck with vmalloc internals here.
+	 * We could force the user to hang on to their struct vm_struct from
+	 * xenbus_map_ring_valloc, but these 6 lines considerably simplify
+	 * this API.
+	 */
+	read_lock(&vmlist_lock);
+	for (area = vmlist; area != NULL; area = area->next) {
+		if (area->addr == vaddr)
+			break;
+	}
+	read_unlock(&vmlist_lock);
+
+	if (!area) {
+		xenbus_dev_error(dev, -ENOENT,
+				 "can't find mapped virtual address %p", vaddr);
+		return GNTST_bad_virt_addr;
+	}
+
+	op.handle = (grant_handle_t)area->phys_addr;
+
+	lock_vm_area(area);
+	BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+	unlock_vm_area(area);
+
+	if (op.status == GNTST_okay)
+		free_vm_area(area);
+	else
+		xenbus_dev_error(dev, op.status,
+				 "unmapping page at handle %d error %d",
+				 (int16_t)area->phys_addr, op.status);
+
+	return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+
+int xenbus_unmap_ring(struct xenbus_device *dev,
+		      grant_handle_t handle, void *vaddr)
+{
+	struct gnttab_unmap_grant_ref op = {
+		.host_addr = (unsigned long)vaddr,
+		.handle    = handle,
+	};
+
+	BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+
+	if (op.status != GNTST_okay)
+		xenbus_dev_error(dev, op.status,
+				 "unmapping page at handle %d error %d",
+				 handle, op.status);
+
+	return op.status;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+
+
+XenbusState xenbus_read_driver_state(const char *path)
+{
+	XenbusState result;
+	int err = xenbus_gather(XBT_NULL, path, "state", "%d", &result, NULL);
+	if (err)
+		result = XenbusStateClosed;
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/xenbus_comms.c
@@ -0,0 +1,208 @@
+/******************************************************************************
+ * xenbus_comms.c
+ *
+ * Low level code to talk to Xen Store: ringbuffer and event channel.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <xen/xenbus.h>
+#include "xenbus_comms.h"
+
+/* IRQ bound to the store event channel by xb_init_comms(); starts at 0. */
+static int xenbus_irq;
+
+/* Defined in xenbus_probe.c; the probe is deferred to a work item that
+ * wake_waiting() schedules on the first store event. */
+extern void xenbus_probe(void *);
+extern int xenstored_ready;
+static DECLARE_WORK(probe_work, xenbus_probe, NULL);
+
+/* Woken on every store event; xb_read()/xb_write() sleep on it. */
+DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
+
+/* Return the kernel-virtual address of the page named by
+ * xen_start_info->store_mfn, viewed as the xenstore ring interface. */
+static inline struct xenstore_domain_interface *xenstore_domain_interface(void)
+{
+	return mfn_to_virt(xen_start_info->store_mfn);
+}
+
+/*
+ * Interrupt handler for the store event channel.
+ *
+ * The first event seen while xenstored_ready is still 0 marks the store as
+ * ready and schedules xenbus_probe() as a work item.  Every event wakes the
+ * sleepers on xb_waitq.
+ */
+static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
+{
+	if (unlikely(xenstored_ready == 0)) {
+		xenstored_ready = 1;
+		schedule_work(&probe_work);
+	}
+
+	wake_up(&xb_waitq);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Sanity-check a consumer/producer index pair: returns non-zero when the
+ * producer is at most XENSTORE_RING_SIZE ahead of the consumer, 0 when the
+ * pair is inconsistent.
+ */
+static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
+{
+	return !((prod - cons) > XENSTORE_RING_SIZE);
+}
+
+/*
+ * Locate the next contiguous writable region of a ring.
+ *
+ * Returns the address inside buf at the masked producer index and stores in
+ * *len the smaller of (a) the bytes left before the buffer wraps and (b) the
+ * free space in the ring.
+ */
+static void *get_output_chunk(XENSTORE_RING_IDX cons,
+			      XENSTORE_RING_IDX prod,
+			      char *buf, uint32_t *len)
+{
+	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
+
+	/* Never hand out more than the ring has free. */
+	if ((XENSTORE_RING_SIZE - (prod - cons)) < chunk)
+		chunk = XENSTORE_RING_SIZE - (prod - cons);
+
+	*len = chunk;
+	return buf + MASK_XENSTORE_IDX(prod);
+}
+
+/*
+ * Locate the next contiguous readable region of a ring.
+ *
+ * Returns the address inside buf at the masked consumer index and stores in
+ * *len the smaller of (a) the bytes left before the buffer wraps and (b) the
+ * number of bytes produced but not yet consumed.
+ */
+static const void *get_input_chunk(XENSTORE_RING_IDX cons,
+				   XENSTORE_RING_IDX prod,
+				   const char *buf, uint32_t *len)
+{
+	uint32_t chunk = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons);
+
+	/* Never hand out more than has actually been produced. */
+	if ((prod - cons) < chunk)
+		chunk = prod - cons;
+
+	*len = chunk;
+	return buf + MASK_XENSTORE_IDX(cons);
+}
+
+/*
+ * Copy len bytes from data into the request ring, sleeping interruptibly
+ * whenever the ring is full.
+ *
+ * Returns 0 once everything has been written, the negative value from
+ * wait_event_interruptible() if the wait is interrupted, or -EIO (after
+ * resetting both request indexes to 0) if the ring indexes are inconsistent.
+ * Bytes already copied before an error stay in the ring.
+ */
+int xb_write(const void *data, unsigned len)
+{
+	struct xenstore_domain_interface *intf = xenstore_domain_interface();
+	XENSTORE_RING_IDX cons, prod;
+	int rc;
+
+	while (len != 0) {
+		void *dst;
+		unsigned int avail;
+
+		/* Sleep until the request ring is no longer completely full. */
+		rc = wait_event_interruptible(
+			xb_waitq,
+			(intf->req_prod - intf->req_cons) !=
+			XENSTORE_RING_SIZE);
+		if (rc < 0)
+			return rc;
+
+		/* Read indexes, then verify. */
+		cons = intf->req_cons;
+		prod = intf->req_prod;
+		mb();
+		if (!check_indexes(cons, prod)) {
+			intf->req_cons = intf->req_prod = 0;
+			return -EIO;
+		}
+
+		/* avail is limited to the contiguous space before the wrap,
+		 * so a large write takes several trips round this loop. */
+		dst = get_output_chunk(cons, prod, intf->req, &avail);
+		if (avail == 0)
+			continue;
+		if (avail > len)
+			avail = len;
+
+		memcpy(dst, data, avail);
+		data += avail;
+		len -= avail;
+
+		/* Other side must not see new header until data is there. */
+		wmb();
+		intf->req_prod += avail;
+
+		/* This implies mb() before other side sees interrupt. */
+		notify_remote_via_evtchn(xen_start_info->store_evtchn);
+	}
+
+	return 0;
+}
+
+/*
+ * Copy exactly len bytes from the response ring into data, sleeping
+ * interruptibly whenever the ring is empty.
+ *
+ * Returns 0 once len bytes have been read, the negative value from
+ * wait_event_interruptible() if the wait is interrupted, or -EIO (after
+ * resetting both response indexes to 0) if the ring indexes are inconsistent.
+ * Bytes consumed before an error are not put back.
+ */
+int xb_read(void *data, unsigned len)
+{
+	struct xenstore_domain_interface *intf = xenstore_domain_interface();
+	XENSTORE_RING_IDX cons, prod;
+	int rc;
+
+	while (len != 0) {
+		unsigned int avail;
+		const char *src;
+
+		/* Sleep until the response ring holds at least one byte. */
+		rc = wait_event_interruptible(
+			xb_waitq,
+			intf->rsp_cons != intf->rsp_prod);
+		if (rc < 0)
+			return rc;
+
+		/* Read indexes, then verify. */
+		cons = intf->rsp_cons;
+		prod = intf->rsp_prod;
+		mb();
+		if (!check_indexes(cons, prod)) {
+			intf->rsp_cons = intf->rsp_prod = 0;
+			return -EIO;
+		}
+
+		/* avail is limited to the contiguous data before the wrap,
+		 * so a large read takes several trips round this loop. */
+		src = get_input_chunk(cons, prod, intf->rsp, &avail);
+		if (avail == 0)
+			continue;
+		if (avail > len)
+			avail = len;
+
+		/* We must read header before we read data. */
+		rmb();
+
+		memcpy(data, src, avail);
+		data += avail;
+		len -= avail;
+
+		/* Other side must not see free space until we've copied out */
+		mb();
+		intf->rsp_cons += avail;
+
+		/* NOTE(review): avail and len are unsigned but printed with %i. */
+		pr_debug("Finished read of %i bytes (%i to go)\n", avail, len);
+
+		/* Implies mb(): they will see new header. */
+		notify_remote_via_evtchn(xen_start_info->store_evtchn);
+	}
+
+	return 0;
+}
+
+/*
+ * Set up interrupt handler off store event channel.
+ *
+ * May be called more than once (xenbus_resume() calls it again): an IRQ
+ * bound by an earlier call is released first.
+ *
+ * Returns 0 on success, otherwise the non-positive value returned by
+ * bind_evtchn_to_irqhandler().
+ */
+int xb_init_comms(void)
+{
+	int err;
+
+	if (xenbus_irq) {
+		unbind_from_irqhandler(xenbus_irq, &xb_waitq);
+		/* Forget the released IRQ so that a failed rebind below
+		 * cannot make the next call unbind it a second time. */
+		xenbus_irq = 0;
+	}
+
+	err = bind_evtchn_to_irqhandler(
+		xen_start_info->store_evtchn, wake_waiting,
+		0, "xenbus", &xb_waitq);
+	if (err <= 0) {
+		printk(KERN_ERR "XENBUS request irq failed %i\n", err);
+		return err;
+	}
+
+	/* On success the return value is the IRQ number. */
+	xenbus_irq = err;
+
+	return 0;
+}
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/xenbus_comms.h
@@ -0,0 +1,43 @@
+/*
+ * Private include for xenbus communications.
+ * 
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XENBUS_COMMS_H
+#define _XENBUS_COMMS_H
+
+/* Initialisation entry points; xb_init_comms() binds the store event
+ * channel to its interrupt handler. */
+int xs_init(void);
+int xb_init_comms(void);
+
+/* Low level routines. */
+/* xb_write()/xb_read() move raw bytes through the shared rings and return
+ * 0 or a negative error code. */
+int xb_write(const void *data, unsigned len);
+int xb_read(void *data, unsigned len);
+int xs_input_avail(void);
+/* Wait queue woken on every store event. */
+extern wait_queue_head_t xb_waitq;
+
+#endif /* _XENBUS_COMMS_H */
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/xenbus_probe.c
@@ -0,0 +1,1065 @@
+/******************************************************************************
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
+ * Copyright (C) 2005 XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+ * Debug print helper: prefixes the message with the calling function and
+ * line number and appends ".\n" itself, so callers should pass a format
+ * without a trailing newline.
+ */
+#define DPRINTK(fmt, args...) \
+    pr_debug("xenbus_probe (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+
+#include <asm/io.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#ifdef XEN_XENBUS_PROC_INTERFACE
+#include <xen/xen_proc.h>
+#endif
+#include <xen/evtchn.h>
+
+#include "xenbus_comms.h"
+
+/* Defined outside this file; held around driver_register() in
+ * xenbus_register_driver_common(). */
+extern struct mutex xenwatch_mutex;
+
+/* Notifier chain run by xenbus_probe() once xenstore is up. */
+static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+/*
+ * Look dev's device type up in a driver's id table.  The table is terminated
+ * by an entry whose devicetype is the empty string.  Returns the matching
+ * entry, or NULL if there is none.
+ */
+static const struct xenbus_device_id *
+match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
+{
+	const struct xenbus_device_id *id = arr;
+
+	while (id->devicetype[0] != '\0') {
+		if (strcmp(id->devicetype, dev->devicetype) == 0)
+			return id;
+		id++;
+	}
+	return NULL;
+}
+
+/*
+ * Bus match callback: returns 1 if the driver's id table lists the device's
+ * type, 0 otherwise (including when the driver has no id table at all).
+ */
+static int xenbus_match(struct device *_dev, struct device_driver *_drv)
+{
+	struct xenbus_driver *drv = to_xenbus_driver(_drv);
+
+	if (drv->ids == NULL)
+		return 0;
+
+	if (match_device(drv->ids, to_xenbus_device(_dev)) == NULL)
+		return 0;
+	return 1;
+}
+
+/* Describes one of the two xenbus buses (frontend and backend). */
+struct xen_bus_type
+{
+	/* Top-level xenstore directory for this bus ("device" or "backend"). */
+	char *root;
+	/* Number of '/' separated components below root that name a device. */
+	unsigned int levels;
+	/* Derive a driver-core bus_id from a xenstore node name. */
+	int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
+	/* Probe one entry found under root/<type>. */
+	int (*probe)(const char *type, const char *dir);
+	struct bus_type bus;
+	/* Parent device for every xenbus_device on this bus. */
+	struct device dev;
+};
+
+
+/*
+ * device/<type>/<id> => <type>-<id>
+ *
+ * Writes the result into bus_id.  Returns 0 on success or -EINVAL if
+ * nodename has no '/', the part after the first '/' does not fit in
+ * BUS_ID_SIZE, or that part contains no further '/'.
+ */
+static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
+{
+	/* Keep nodename intact so the warning below shows the real input
+	 * rather than NULL or a truncated tail. */
+	const char *rest = strchr(nodename, '/');
+	char *sep;
+
+	if (!rest || strlen(rest + 1) >= BUS_ID_SIZE) {
+		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+		return -EINVAL;
+	}
+
+	strlcpy(bus_id, rest + 1, BUS_ID_SIZE);
+	sep = strchr(bus_id, '/');
+	if (!sep) {
+		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+		return -EINVAL;
+	}
+	*sep = '-';
+	return 0;
+}
+
+
+/* Free the stored other-end path and clear the pointer so that a repeated
+ * call is harmless. */
+static void free_otherend_details(struct xenbus_device *dev)
+{
+	kfree(dev->otherend);
+	dev->otherend = NULL;
+}
+
+
+/*
+ * Tear down the watch on the other end's state node, if one is registered.
+ * A NULL otherend_watch.node means there is nothing to do.
+ */
+static void free_otherend_watch(struct xenbus_device *dev)
+{
+	struct xenbus_watch *watch = &dev->otherend_watch;
+
+	if (!watch->node)
+		return;
+
+	unregister_xenbus_watch(watch);
+	kfree(watch->node);
+	watch->node = NULL;
+}
+
+
+/*
+ * Read the other end's domain id (node id_node) and xenstore path (node
+ * path_node) from below xendev->nodename into xendev->otherend_id and
+ * xendev->otherend.
+ *
+ * Returns 0 on success.  On failure the device is flagged with
+ * xenbus_dev_fatal() and the xenbus_gather() error, or -ENOENT when the
+ * path is empty or does not exist in the store, is returned.
+ */
+static int read_otherend_details(struct xenbus_device *xendev,
+				 char *id_node, char *path_node)
+{
+	int err = xenbus_gather(XBT_NULL, xendev->nodename,
+				id_node, "%i", &xendev->otherend_id,
+				path_node, NULL, &xendev->otherend,
+				NULL);
+	if (err) {
+		xenbus_dev_fatal(xendev, err,
+				 "reading other end details from %s",
+				 xendev->nodename);
+		return err;
+	}
+	if (strlen(xendev->otherend) == 0 ||
+	    !xenbus_exists(XBT_NULL, xendev->otherend, "")) {
+		xenbus_dev_fatal(xendev, -ENOENT, "missing other end from %s",
+				 xendev->nodename);
+		/* Do not leave a path behind that points nowhere. */
+		free_otherend_details(xendev);
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+
+/* Frontend flavour of read_otherend_details(): the other end is described
+ * by the "backend-id" and "backend" nodes. */
+static int read_backend_details(struct xenbus_device *xendev)
+{
+	return read_otherend_details(xendev, "backend-id", "backend");
+}
+
+
+/* Backend flavour of read_otherend_details(): the other end is described
+ * by the "frontend-id" and "frontend" nodes. */
+static int read_frontend_details(struct xenbus_device *xendev)
+{
+	return read_otherend_details(xendev, "frontend-id", "frontend");
+}
+
+
+/* Bus type for frontend drivers. */
+/* Devices live at device/<type>/<id> in the store and appear on the "xen"
+ * bus, parented to the "xen" device. */
+static int xenbus_probe_frontend(const char *type, const char *name);
+static struct xen_bus_type xenbus_frontend = {
+	.root = "device",
+	.levels = 2, 		/* device/type/<id> */
+	.get_bus_id = frontend_bus_id,
+	.probe = xenbus_probe_frontend,
+	.bus = {
+		.name  = "xen",
+		.match = xenbus_match,
+	},
+	.dev = {
+		.bus_id = "xen",
+	},
+};
+
+/*
+ * backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id>
+ *
+ * <type> is taken from the second path component, <id> from the last one,
+ * and <fe-domid> is read from the node's "frontend-id" entry.
+ *
+ * Returns 0 on success, -EINVAL for a malformed nodename, the
+ * xenbus_gather() error, -ERANGE if the "frontend" entry is empty, -ENOENT
+ * if the frontend path does not exist, or -ENOSPC if the result does not
+ * fit in BUS_ID_SIZE.
+ */
+static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
+{
+	int domid, err;
+	const char *devid, *type, *frontend;
+	unsigned int typelen;
+
+	type = strchr(nodename, '/');
+	if (!type)
+		return -EINVAL;
+	type++;
+	typelen = strcspn(type, "/");
+	if (!typelen || type[typelen] != '/')
+		return -EINVAL;
+
+	/* A '/' is known to exist at this point, so strrchr cannot fail. */
+	devid = strrchr(nodename, '/') + 1;
+
+	err = xenbus_gather(XBT_NULL, nodename, "frontend-id", "%i", &domid,
+			    "frontend", NULL, &frontend,
+			    NULL);
+	if (err)
+		return err;
+	if (strlen(frontend) == 0)
+		err = -ERANGE;
+	if (!err && !xenbus_exists(XBT_NULL, frontend, ""))
+		err = -ENOENT;
+
+	/* The frontend path was only needed for the checks above. */
+	kfree(frontend);
+
+	if (err)
+		return err;
+
+	if (snprintf(bus_id, BUS_ID_SIZE,
+		     "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
+		return -ENOSPC;
+	return 0;
+}
+
+/* Bus type for backend drivers: devices live at
+ * backend/<type>/<frontend>/<id> in the store and appear on the
+ * "xen-backend" bus, which also supplies a uevent hook. */
+static int xenbus_uevent_backend(struct device *dev, char **envp,
+				 int num_envp, char *buffer, int buffer_size);
+static int xenbus_probe_backend(const char *type, const char *domid);
+static struct xen_bus_type xenbus_backend = {
+	.root = "backend",
+	.levels = 3, 		/* backend/type/<frontend>/<id> */
+	.get_bus_id = backend_bus_id,
+	.probe = xenbus_probe_backend,
+	.bus = {
+		.name  = "xen-backend",
+		.match = xenbus_match,
+		.uevent = xenbus_uevent_backend,
+	},
+	.dev = {
+		.bus_id = "xen-backend",
+	},
+};
+
+/*
+ * uevent hook for the backend bus.  Adds XENBUS_TYPE, XENBUS_PATH and
+ * XENBUS_BASE_PATH to the environment, then lets the bound driver's own
+ * uevent hook (if any) append to what is left of envp/buffer.
+ *
+ * Returns -ENODEV when dev is NULL or has no xenbus_device, the driver
+ * hook's return value when one runs, and 0 otherwise.
+ */
+static int xenbus_uevent_backend(struct device *dev, char **envp,
+				 int num_envp, char *buffer, int buffer_size)
+{
+	struct xenbus_device *xdev;
+	struct xenbus_driver *drv;
+	int i = 0;
+	int length = 0;
+
+	DPRINTK("");
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	xdev = to_xenbus_device(dev);
+	if (xdev == NULL)
+		return -ENODEV;
+
+	/* stuff we want to pass to /sbin/hotplug */
+	/* NOTE(review): the return values of add_uevent_var() are ignored. */
+	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+		       "XENBUS_TYPE=%s", xdev->devicetype);
+
+	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+		       "XENBUS_PATH=%s", xdev->nodename);
+
+	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
+		       "XENBUS_BASE_PATH=%s", xenbus_backend.root);
+
+	/* terminate, set to next free slot, shrink available space */
+	envp[i] = NULL;
+	envp = &envp[i];
+	num_envp -= i;
+	buffer = &buffer[length];
+	buffer_size -= length;
+
+	if (dev->driver) {
+		drv = to_xenbus_driver(dev->driver);
+		if (drv && drv->uevent)
+			return drv->uevent(xdev, envp, num_envp, buffer,
+					   buffer_size);
+	}
+
+	return 0;
+}
+
+/*
+ * Watch callback for the other end's "state" node.  Reads the other end's
+ * current state and hands it to the driver's otherend_changed hook, if the
+ * driver has one.  Events whose path does not start with the current
+ * dev->otherend are ignored.
+ */
+static void otherend_changed(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	struct xenbus_device *dev =
+		container_of(watch, struct xenbus_device, otherend_watch);
+	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+	XenbusState state;
+
+	/* Protect us against watches firing on old details when the otherend
+	   details change, say immediately after a resume. */
+	if (!dev->otherend ||
+	    strncmp(dev->otherend, vec[XS_WATCH_PATH],
+		    strlen(dev->otherend))) {
+		DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]);
+		return;
+	}
+
+	state = xenbus_read_driver_state(dev->otherend);
+
+	DPRINTK("state is %d, %s, %s",
+		state, dev->otherend_watch.node, vec[XS_WATCH_PATH]);
+	if (drv->otherend_changed)
+		drv->otherend_changed(dev, state);
+}
+
+
+/*
+ * Drop any existing other-end watch and path, then re-read the other end's
+ * details through the driver's read_otherend_details hook (set up by
+ * xenbus_register_frontend()/xenbus_register_backend()).
+ * Returns that hook's result.
+ */
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+	struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+	free_otherend_watch(dev);
+	free_otherend_details(dev);
+
+	return drv->read_otherend_details(dev);
+}
+
+
+/* Register dev->otherend_watch on <otherend>/state with otherend_changed()
+ * as the callback.  Returns the result of xenbus_watch_path2(). */
+static int watch_otherend(struct xenbus_device *dev)
+{
+	return xenbus_watch_path2(dev, dev->otherend, "state",
+				  &dev->otherend_watch, otherend_changed);
+}
+
+
+/*
+ * Driver-core probe hook shared by all xenbus drivers.
+ *
+ * Sequence: check that the driver has a probe hook and an id matching the
+ * device, read the other end's details, call the driver's probe, then start
+ * watching the other end's state.
+ *
+ * Returns 0 on success.  Failures routed through the fail label are reported
+ * with xenbus_dev_error(), switch the device to Closed and always return
+ * -ENODEV.  NOTE(review): failures of talk_to_otherend() and
+ * watch_otherend() return their own error and skip that cleanup; the latter
+ * happens after the driver's probe already succeeded.
+ */
+static int xenbus_dev_probe(struct device *_dev)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+	const struct xenbus_device_id *id;
+	int err;
+
+	DPRINTK("");
+
+	if (!drv->probe) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	id = match_device(drv->ids, dev);
+	if (!id) {
+		err = -ENODEV;
+		goto fail;
+	}
+
+	err = talk_to_otherend(dev);
+	if (err) {
+		printk(KERN_WARNING
+		       "xenbus_probe: talk_to_otherend on %s failed.\n",
+		       dev->nodename);
+		return err;
+	}
+
+	err = drv->probe(dev, id);
+	if (err)
+		goto fail;
+
+	err = watch_otherend(dev);
+	if (err) {
+		printk(KERN_WARNING
+		       "xenbus_probe: watch_otherend on %s failed.\n",
+		       dev->nodename);
+		return err;
+	}
+
+	return 0;
+fail:
+	xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
+	xenbus_switch_state(dev, XBT_NULL, XenbusStateClosed);
+	return -ENODEV;
+}
+
+/*
+ * Driver-core remove hook shared by all xenbus drivers: stop watching the
+ * other end, forget its details, call the driver's remove hook if present
+ * and switch the device to Closed.  Always returns 0.
+ */
+static int xenbus_dev_remove(struct device *_dev)
+{
+	struct xenbus_device *dev = to_xenbus_device(_dev);
+	struct xenbus_driver *drv = to_xenbus_driver(_dev->driver);
+
+	DPRINTK("");
+
+	free_otherend_watch(dev);
+	free_otherend_details(dev);
+
+	if (drv->remove)
+		drv->remove(dev);
+
+	xenbus_switch_state(dev, XBT_NULL, XenbusStateClosed);
+	return 0;
+}
+
+/*
+ * Fill in the embedded struct device_driver from the xenbus_driver (name,
+ * owner, bus and the common probe/remove hooks) and register it.
+ * Returns the result of driver_register().
+ */
+static int xenbus_register_driver_common(struct xenbus_driver *drv,
+					 struct xen_bus_type *bus)
+{
+	int ret;
+
+	drv->driver.name = drv->name;
+	drv->driver.bus = &bus->bus;
+	drv->driver.owner = drv->owner;
+	drv->driver.probe = xenbus_dev_probe;
+	drv->driver.remove = xenbus_dev_remove;
+
+	/* NOTE(review): presumably this keeps watch callbacks from running
+	 * while devices are being probed -- confirm where xenwatch_mutex
+	 * is otherwise taken. */
+	mutex_lock(&xenwatch_mutex);
+	ret = driver_register(&drv->driver);
+	mutex_unlock(&xenwatch_mutex);
+	return ret;
+}
+
+/* Register drv on the frontend ("xen") bus; its other end is a backend.
+ * Returns the result of driver_register(). */
+int xenbus_register_frontend(struct xenbus_driver *drv)
+{
+	drv->read_otherend_details = read_backend_details;
+
+	return xenbus_register_driver_common(drv, &xenbus_frontend);
+}
+EXPORT_SYMBOL_GPL(xenbus_register_frontend);
+
+/* Register drv on the backend ("xen-backend") bus; its other end is a
+ * frontend.  Returns the result of driver_register(). */
+int xenbus_register_backend(struct xenbus_driver *drv)
+{
+	drv->read_otherend_details = read_frontend_details;
+
+	return xenbus_register_driver_common(drv, &xenbus_backend);
+}
+EXPORT_SYMBOL_GPL(xenbus_register_backend);
+
+/* Unregister a driver registered with xenbus_register_frontend() or
+ * xenbus_register_backend(). */
+void xenbus_unregister_driver(struct xenbus_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(xenbus_unregister_driver);
+
+/* Search argument and result slot shared by the bus_for_each_dev()
+ * callbacks cmp_dev() and cleanup_dev(). */
+struct xb_find_info
+{
+	/* Out: the device found, or NULL. */
+	struct xenbus_device *dev;
+	/* In: the xenstore node name to look for. */
+	const char *nodename;
+};
+
+/*
+ * bus_for_each_dev() callback: stop at the device whose nodename equals
+ * info->nodename exactly.  On a hit the device is stored in info->dev with
+ * an extra reference taken, and 1 is returned to end the iteration.
+ */
+static int cmp_dev(struct device *dev, void *data)
+{
+	struct xb_find_info *info = data;
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+
+	if (strcmp(xendev->nodename, info->nodename) != 0)
+		return 0;
+
+	info->dev = xendev;
+	get_device(dev);
+	return 1;
+}
+
+/*
+ * Find the device on bus whose nodename equals nodename exactly.
+ * Returns it with a reference held (taken in cmp_dev(); the caller must
+ * put_device() it), or NULL if there is no such device.
+ */
+struct xenbus_device *xenbus_device_find(const char *nodename,
+					 struct bus_type *bus)
+{
+	struct xb_find_info info = { .dev = NULL, .nodename = nodename };
+
+	bus_for_each_dev(bus, NULL, &info, cmp_dev);
+	return info.dev;
+}
+
+/*
+ * bus_for_each_dev() callback: stop at the first device whose nodename is
+ * info->nodename itself or lies in a subdirectory of it.  On a hit the
+ * device is stored in info->dev with an extra reference taken, and 1 is
+ * returned to end the iteration.
+ */
+static int cleanup_dev(struct device *dev, void *data)
+{
+	struct xb_find_info *info = data;
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	int len = strlen(info->nodename);
+	char next;
+
+	DPRINTK("%s", info->nodename);
+
+	/* The device's path has to start with the path being cleaned up. */
+	if (strncmp(xendev->nodename, info->nodename, len) != 0)
+		return 0;
+
+	/* After the common prefix the device path must either end or go on
+	 * with a '/'; otherwise it merely shares a name prefix. */
+	next = xendev->nodename[len];
+	if (next != '\0' && next != '/')
+		return 0;
+
+	info->dev = xendev;
+	get_device(dev);
+	return 1;
+}
+
+/*
+ * Unregister every device on bus whose nodename is path or lies below it.
+ * The bus is rescanned from the start after each removal, until
+ * cleanup_dev() finds nothing more.
+ */
+static void xenbus_cleanup_devices(const char *path, struct bus_type *bus)
+{
+	struct xb_find_info info = { .nodename = path };
+
+	do {
+		info.dev = NULL;
+		bus_for_each_dev(bus, NULL, &info, cleanup_dev);
+		if (info.dev) {
+			device_unregister(&info.dev->dev);
+			/* Drop the reference taken in cleanup_dev(). */
+			put_device(&info.dev->dev);
+		}
+	} while (info.dev);
+}
+
+/* Device release hook: frees the xenbus_device that embeds dev.  The
+ * nodename and devicetype strings share that allocation (see
+ * xenbus_probe_node()). */
+static void xenbus_dev_release(struct device *dev)
+{
+	if (dev)
+		kfree(to_xenbus_device(dev));
+}
+
+/* Simplified asprintf. */
+/*
+ * Formats into a freshly kmalloc'ed (GFP_KERNEL) buffer of exactly the
+ * required size.  Returns the string, to be released with kfree(), or NULL
+ * if the allocation fails.
+ */
+char *kasprintf(const char *fmt, ...)
+{
+	va_list ap;
+	unsigned int len;
+	char *p, dummy[1];
+
+	/* First pass: size 0 only measures the formatted length. */
+	va_start(ap, fmt);
+	/* FIXME: vsnprintf has a bug, NULL should work */
+	len = vsnprintf(dummy, 0, fmt, ap);
+	va_end(ap);
+
+	p = kmalloc(len + 1, GFP_KERNEL);
+	if (!p)
+		return NULL;
+	/* Second pass: the va_list has to be restarted before reuse. */
+	va_start(ap, fmt);
+	vsprintf(p, fmt, ap);
+	va_end(ap);
+	return p;
+}
+
+/* sysfs "nodename" attribute (read-only): the device's xenstore node name
+ * followed by a newline. */
+static ssize_t xendev_show_nodename(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
+}
+DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+
+/* sysfs "devtype" attribute (read-only): the device's type string followed
+ * by a newline. */
+static ssize_t xendev_show_devtype(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
+}
+DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+
+
+/*
+ * Create and register a xenbus_device for the store node nodename of the
+ * given type on bus.
+ *
+ * Nodes whose state is not XenbusStateInitialising are skipped and 0 is
+ * returned.  Otherwise returns 0 on success, -ENOMEM if the allocation
+ * fails, or the error from the bus's get_bus_id hook or device_register().
+ */
+static int xenbus_probe_node(struct xen_bus_type *bus,
+			     const char *type,
+			     const char *nodename)
+{
+	int err;
+	struct xenbus_device *xendev;
+	size_t stringlen;
+	char *tmpstring;
+
+	XenbusState state = xenbus_read_driver_state(nodename);
+
+	if (state != XenbusStateInitialising) {
+		/* Device is not new, so ignore it.  This can happen if a
+		   device is going away after switching to Closed.  */
+		return 0;
+	}
+
+	/* One allocation holds the struct followed by both strings, each
+	 * with its terminating NUL. */
+	stringlen = strlen(nodename) + 1 + strlen(type) + 1;
+	xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
+	if (!xendev)
+		return -ENOMEM;
+
+	/* Copy the strings into the extra space. */
+
+	tmpstring = (char *)(xendev + 1);
+	strcpy(tmpstring, nodename);
+	xendev->nodename = tmpstring;
+
+	tmpstring += strlen(tmpstring) + 1;
+	strcpy(tmpstring, type);
+	xendev->devicetype = tmpstring;
+
+	xendev->dev.parent = &bus->dev;
+	xendev->dev.bus = &bus->bus;
+	xendev->dev.release = xenbus_dev_release;
+
+	err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
+	if (err)
+		goto fail;
+
+	/* Register with generic device framework. */
+	err = device_register(&xendev->dev);
+	if (err)
+		goto fail;
+
+	/* NOTE(review): failures to create the sysfs files are ignored. */
+	device_create_file(&xendev->dev, &dev_attr_nodename);
+	device_create_file(&xendev->dev, &dev_attr_devtype);
+
+	return 0;
+fail:
+	kfree(xendev);
+	return err;
+}
+
+/* device/<typename>/<name> */
+/* Build the node path for one frontend device and probe it.  Returns
+ * -ENOMEM if the path cannot be allocated, otherwise the result of
+ * xenbus_probe_node(). */
+static int xenbus_probe_frontend(const char *type, const char *name)
+{
+	char *nodename;
+	int err;
+
+	nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name);
+	if (!nodename)
+		return -ENOMEM;
+
+	DPRINTK("%s", nodename);
+
+	err = xenbus_probe_node(&xenbus_frontend, type, nodename);
+	kfree(nodename);
+	return err;
+}
+
+/*
+ * backend/<typename>/<frontend-uuid>/<name>
+ *
+ * Build the node path <dir>/<name> for one backend device and probe it.
+ * Returns -ENOMEM if the path cannot be allocated, otherwise the result of
+ * xenbus_probe_node().
+ */
+static int xenbus_probe_backend_unit(const char *dir,
+				     const char *type,
+				     const char *name)
+{
+	char *nodename;
+	int err;
+
+	nodename = kasprintf("%s/%s", dir, name);
+	if (!nodename)
+		return -ENOMEM;
+
+	/* DPRINTK appends ".\n" itself, so no newline here. */
+	DPRINTK("%s", nodename);
+
+	err = xenbus_probe_node(&xenbus_backend, type, nodename);
+	kfree(nodename);
+	return err;
+}
+
+/* backend/<typename>/<frontend-domid> */
+/*
+ * Probe every device listed under backend/<type>/<domid>.  Stops at the
+ * first failing device and returns its error; returns -ENOMEM if the path
+ * cannot be allocated, the xenbus_directory() error if the listing fails,
+ * and 0 otherwise.
+ */
+static int xenbus_probe_backend(const char *type, const char *domid)
+{
+	char *nodename;
+	int err = 0;
+	char **dir;
+	unsigned int i, dir_n = 0;
+
+	DPRINTK("");
+
+	nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, domid);
+	if (!nodename)
+		return -ENOMEM;
+
+	dir = xenbus_directory(XBT_NULL, nodename, "", &dir_n);
+	if (IS_ERR(dir)) {
+		kfree(nodename);
+		return PTR_ERR(dir);
+	}
+
+	for (i = 0; i < dir_n; i++) {
+		err = xenbus_probe_backend_unit(nodename, type, dir[i]);
+		if (err)
+			break;
+	}
+	kfree(dir);
+	kfree(nodename);
+	return err;
+}
+
+/*
+ * Probe every entry listed under <bus->root>/<type> through the bus's
+ * probe hook.  Stops at the first failing entry and returns its error;
+ * returns the xenbus_directory() error if the listing fails, and 0
+ * otherwise.
+ */
+static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+{
+	int err = 0;
+	char **dir;
+	/* Unsigned index, matching dir_n and the sibling probe loops. */
+	unsigned int i, dir_n = 0;
+
+	dir = xenbus_directory(XBT_NULL, bus->root, type, &dir_n);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	for (i = 0; i < dir_n; i++) {
+		err = bus->probe(type, dir[i]);
+		if (err)
+			break;
+	}
+	kfree(dir);
+	return err;
+}
+
+/*
+ * Enumerate a whole bus: every entry under bus->root is treated as a device
+ * type and probed with xenbus_probe_device_type().  Stops at the first
+ * failing type and returns its error; returns the xenbus_directory() error
+ * if the listing fails, and 0 otherwise.
+ */
+static int xenbus_probe_devices(struct xen_bus_type *bus)
+{
+	int err = 0;
+	char **dir;
+	unsigned int i, dir_n;
+
+	dir = xenbus_directory(XBT_NULL, bus->root, "", &dir_n);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	for (i = 0; i < dir_n; i++) {
+		err = xenbus_probe_device_type(bus, dir[i]);
+		if (err)
+			break;
+	}
+	kfree(dir);
+	return err;
+}
+
+/* Count how many times the character c occurs in the NUL-terminated
+ * string str. */
+static unsigned int char_count(const char *str, char c)
+{
+	unsigned int hits = 0;
+	const char *p;
+
+	for (p = str; *p != '\0'; p++) {
+		if (*p == c)
+			hits++;
+	}
+	return hits;
+}
+
+/*
+ * Return the length of the prefix of str that holds len occurrences of the
+ * separator c followed by one more component: the offset of occurrence
+ * number len + 1 of c, or the full string length if the string ends after
+ * exactly len separators.  Returns -ERANGE if str has fewer than len
+ * separators.
+ */
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+	const char *p;
+
+	for (p = str; *p != '\0'; p++) {
+		if (*p != c)
+			continue;
+		if (len == 0)
+			return p - str;
+		len--;
+	}
+
+	if (len != 0)
+		return -ERANGE;
+	return p - str;
+}
+
+/*
+ * React to a change of the store node `node` on `bus`.
+ *
+ * Paths with fewer than two '/' are ignored.  If the node no longer exists,
+ * every device at or below it is unregistered.  Otherwise the device path
+ * (the first bus->levels + 1 components) is extracted and, unless a device
+ * with that nodename is already registered, probed as a new device.
+ */
+static void dev_changed(const char *node, struct xen_bus_type *bus)
+{
+	char type[BUS_ID_SIZE];
+	struct xenbus_device *found;
+	const char *typestart, *devpath;
+	int devpathlen;
+
+	if (char_count(node, '/') < 2)
+		return;
+
+	if (!xenbus_exists(XBT_NULL, node, "")) {
+		xenbus_cleanup_devices(node, &bus->bus);
+		return;
+	}
+
+	/* backend/<type>/... or device/<type>/... */
+	typestart = strchr(node, '/') + 1;
+	snprintf(type, BUS_ID_SIZE, "%.*s",
+		 (int)strcspn(typestart, "/"), typestart);
+	type[BUS_ID_SIZE-1] = '\0';
+
+	devpathlen = strsep_len(node, '/', bus->levels);
+	if (devpathlen < 0)
+		return;
+
+	devpath = kasprintf("%.*s", devpathlen, node);
+	if (!devpath)
+		return;
+
+	found = xenbus_device_find(devpath, &bus->bus);
+	if (found)
+		/* Already known: just drop the reference from the lookup. */
+		put_device(&found->dev);
+	else
+		xenbus_probe_node(bus, type, devpath);
+
+	kfree(devpath);
+}
+
+/* Watch callback for the "device" subtree: forwards the changed path to
+ * dev_changed() for the frontend bus. */
+static void frontend_changed(struct xenbus_watch *watch,
+			     const char **vec, unsigned int len)
+{
+	DPRINTK("");
+
+	dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+/* Watch callback for the "backend" subtree: forwards the changed path to
+ * dev_changed() for the backend bus. */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char **vec, unsigned int len)
+{
+	DPRINTK("");
+
+	dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
+}
+
+/* We watch for devices appearing and vanishing. */
+/* Both watches are registered by xenbus_probe(). */
+static struct xenbus_watch fe_watch = {
+	.node = "device",
+	.callback = frontend_changed,
+};
+
+static struct xenbus_watch be_watch = {
+	.node = "backend",
+	.callback = backend_changed,
+};
+
+/*
+ * bus_for_each_dev() callback: call the bound driver's suspend hook, if
+ * any.  A failure is only logged; 0 is always returned so the iteration
+ * carries on with the remaining devices.
+ */
+static int suspend_dev(struct device *dev, void *data)
+{
+	int err = 0;
+	struct xenbus_driver *drv;
+	struct xenbus_device *xdev;
+
+	DPRINTK("");
+
+	/* Devices without a driver have nothing to suspend. */
+	if (dev->driver == NULL)
+		return 0;
+	drv = to_xenbus_driver(dev->driver);
+	xdev = container_of(dev, struct xenbus_device, dev);
+	if (drv->suspend)
+		err = drv->suspend(xdev);
+	if (err)
+		printk(KERN_WARNING
+		       "xenbus: suspend %s failed: %i\n", dev->bus_id, err);
+	return 0;
+}
+
+/*
+ * bus_for_each_dev() callback: re-read the other end's details, re-register
+ * the watch on its state and then call the bound driver's resume hook, if
+ * any.  Devices without a driver are skipped.
+ *
+ * Unlike suspend_dev(), the first failing step's error is returned, which
+ * is non-zero and so is handed back to bus_for_each_dev().
+ */
+static int resume_dev(struct device *dev, void *data)
+{
+	int err;
+	struct xenbus_driver *drv;
+	struct xenbus_device *xdev;
+
+	DPRINTK("");
+
+	if (dev->driver == NULL)
+		return 0;
+	drv = to_xenbus_driver(dev->driver);
+	xdev = container_of(dev, struct xenbus_device, dev);
+
+	err = talk_to_otherend(xdev);
+	if (err) {
+		printk(KERN_WARNING
+		       "xenbus: resume (talk_to_otherend) %s failed: %i\n",
+		       dev->bus_id, err);
+		return err;
+	}
+
+	err = watch_otherend(xdev);
+	if (err) {
+		printk(KERN_WARNING
+		       "xenbus_probe: resume (watch_otherend) %s failed: "
+		       "%d.\n", dev->bus_id, err);
+		return err;
+	}
+
+	/* err is 0 here, so a driver without a resume hook returns success. */
+	if (drv->resume)
+		err = drv->resume(xdev);
+	if (err)
+		printk(KERN_WARNING
+		       "xenbus: resume %s failed: %i\n", dev->bus_id, err);
+	return err;
+}
+
+/* Suspend every device on the frontend bus, then on the backend bus, and
+ * finally call xs_suspend(). */
+void xenbus_suspend(void)
+{
+	DPRINTK("");
+
+	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
+	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
+	xs_suspend();
+}
+EXPORT_SYMBOL_GPL(xenbus_suspend);
+
+/*
+ * Counterpart of xenbus_suspend(): rebind the store event channel, call
+ * xs_resume(), then resume the devices on both buses.
+ * NOTE(review): the return value of xb_init_comms() is ignored.
+ */
+void xenbus_resume(void)
+{
+	xb_init_comms();
+	xs_resume();
+	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
+	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
+}
+EXPORT_SYMBOL_GPL(xenbus_resume);
+
+
+/* A flag to determine if xenstored is 'ready' (i.e. has started) */
+/* Set to 1 by xenbus_probe_init() outside its dom0 proc-interface branch,
+ * or by the store event interrupt handler on the first event. */
+int xenstored_ready = 0;
+
+
+/*
+ * Ask to be told when xenstore becomes available.  If it is already up the
+ * notifier is called immediately (and not added to the chain) and its
+ * return value is returned; otherwise it is queued on xenstore_chain and 0
+ * is returned.
+ */
+int register_xenstore_notifier(struct notifier_block *nb)
+{
+	int ret = 0;
+
+	if (xenstored_ready > 0)
+		ret = nb->notifier_call(nb, 0, NULL);
+	else
+		blocking_notifier_chain_register(&xenstore_chain, nb);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_xenstore_notifier);
+
+/* Remove nb from the xenstore notifier chain. */
+void unregister_xenstore_notifier(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&xenstore_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
+
+
+/*
+ * bus_for_each_dev() callback for all_devices_ready().  data points at the
+ * caller's "ready" flag: on the first device that is not in
+ * XenbusStateConnected the flag is cleared and 1 is returned to stop the
+ * iteration.
+ */
+static int all_devices_ready_(struct device *dev, void *data)
+{
+	struct xenbus_device *xendev = to_xenbus_device(dev);
+	int *result = data;
+
+	if (xendev->state != XenbusStateConnected) {
+		/* Write through the pointer; assigning to `result` itself
+		 * would leave the caller's flag untouched. */
+		*result = 0;
+		return 1;
+	}
+
+	return 0;
+}
+
+
+/* Returns the value of `ready` after all_devices_ready_() has been run
+ * over the frontend bus with a pointer to it; it starts out as 1. */
+static int all_devices_ready(void)
+{
+	int ready = 1;
+	bus_for_each_dev(&xenbus_frontend.bus, NULL, &ready,
+			 all_devices_ready_);
+	return ready;
+}
+
+
+/*
+ * Bring the xenbus buses to life once xenstore is reachable: enumerate the
+ * devices already in the store, register the watches that track later
+ * changes, run the xenstore notifier chain, then poll all_devices_ready()
+ * for up to 10 seconds.
+ *
+ * Called directly from xenbus_probe_init(), or as a work item scheduled on
+ * the first store event.  xenstored_ready must already be set.
+ */
+void xenbus_probe(void *unused)
+{
+	int i;
+
+	BUG_ON((xenstored_ready <= 0));
+
+	/* Enumerate devices in xenstore. */
+	xenbus_probe_devices(&xenbus_frontend);
+	xenbus_probe_devices(&xenbus_backend);
+
+	/* Watch for changes. */
+	register_xenbus_watch(&fe_watch);
+	register_xenbus_watch(&be_watch);
+
+	/* Notify others that xenstore is up */
+	blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
+
+	/* On a 10 second timeout, waiting for all devices currently
+	   configured.  We need to do this to guarantee that the filesystems
+	   and / or network devices needed for boot are available, before we
+	   can allow the boot to proceed.
+
+	   A possible improvement here would be to have the tools add a
+	   per-device flag to the store entry, indicating whether it is needed
+	   at boot time.  This would allow people who knew what they were
+	   doing to accelerate their boot slightly, but of course needs tools
+	   or manual intervention to set up those flags correctly.
+	 */
+	/* Poll once per jiffy: 10 * HZ iterations make up the 10 seconds. */
+	for (i = 0; i < 10 * HZ; i++) {
+		if (all_devices_ready())
+			return;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(1);
+	}
+
+	printk(KERN_WARNING
+	       "XENBUS: Timeout connecting to devices!\n");
+}
+
+
+#ifdef XEN_XENBUS_PROC_INTERFACE
+/* Private copy of the proc entry's file operations, with mmap overridden
+ * in xenbus_probe_init(). */
+static struct file_operations xsd_kva_fops;
+/* The "xsd_kva" and "xsd_port" proc entries created in
+ * xenbus_probe_init(). */
+static struct proc_dir_entry *xsd_kva_intf;
+static struct proc_dir_entry *xsd_port_intf;
+
+/*
+ * mmap handler for the xsd_kva proc entry: maps the xenstore page into the
+ * caller's address space.  Only a mapping of at most one page at offset 0
+ * is accepted.
+ *
+ * Returns 0 on success, -EINVAL for any other size or offset, or -EAGAIN
+ * if remap_pfn_range() fails.
+ */
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+
+	if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+		return -EINVAL;
+
+	if (remap_pfn_range(vma, vma->vm_start,
+			    mfn_to_pfn(xen_start_info->store_mfn),
+			    size, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+/* read_proc handler for xsd_kva: prints the kernel-virtual address of the
+ * xenstore page as "0x%p" and signals EOF.  Returns the number of
+ * characters written. */
+static int xsd_kva_read(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+	int len;
+
+	len  = sprintf(page, "0x%p", mfn_to_virt(xen_start_info->store_mfn));
+	*eof = 1;
+	return len;
+}
+
+/* read_proc handler for xsd_port: prints the store event channel number in
+ * decimal and signals EOF.  Returns the number of characters written. */
+static int xsd_port_read(char *page, char **start, off_t off,
+			 int count, int *eof, void *data)
+{
+	int len;
+
+	len  = sprintf(page, "%d", xen_start_info->store_evtchn);
+	*eof = 1;
+	return len;
+}
+#endif
+
+
+/*
+ * Initcall: register both xenbus buses and their parent devices, set up the
+ * xenstore interface and, outside domain 0, probe for devices right away.
+ *
+ * A domain whose store_evtchn is 0 is treated as domain 0.  With
+ * XEN_XENBUS_PROC_INTERFACE defined, domain 0 allocates the store page and
+ * an unbound event channel itself and publishes both through proc entries;
+ * probing is then left to the first store event.
+ *
+ * Returns 0 on success, -ENODEV if xen_init() fails, -ENOMEM if the store
+ * page cannot be allocated, or the error from xs_init().
+ * NOTE(review): the bus/device registrations are unchecked and are not
+ * undone on the later error paths.
+ */
+static int __init xenbus_probe_init(void)
+{
+	int err = 0, dom0;
+
+	DPRINTK("");
+
+	if (xen_init() < 0) {
+		DPRINTK("failed");
+		return -ENODEV;
+	}
+
+	/* Register ourselves with the kernel bus & device subsystems */
+	bus_register(&xenbus_frontend.bus);
+	bus_register(&xenbus_backend.bus);
+	device_register(&xenbus_frontend.dev);
+	device_register(&xenbus_backend.dev);
+
+	/*
+	 * Domain0 doesn't have a store_evtchn or store_mfn yet.
+	 */
+	dom0 = (xen_start_info->store_evtchn == 0);
+
+#ifdef XEN_XENBUS_PROC_INTERFACE
+	if (dom0) {
+
+		unsigned long page;
+		evtchn_op_t op = { 0 };
+		int ret;
+
+
+		/* Allocate page. */
+		page = get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		xen_start_info->store_mfn =
+			pfn_to_mfn(virt_to_phys((void *)page) >>
+				   PAGE_SHIFT);
+
+		/* Next allocate a local port which xenstored can bind to */
+		op.cmd = EVTCHNOP_alloc_unbound;
+		op.u.alloc_unbound.dom        = DOMID_SELF;
+		op.u.alloc_unbound.remote_dom = 0;
+
+		ret = HYPERVISOR_event_channel_op(&op);
+		BUG_ON(ret);
+		xen_start_info->store_evtchn = op.u.alloc_unbound.port;
+
+		/* And finally publish the above info in /proc/xen */
+		xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
+		if (xsd_kva_intf) {
+			/* Start from the entry's default operations and
+			 * override only mmap. */
+			memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
+			       sizeof(xsd_kva_fops));
+			xsd_kva_fops.mmap = xsd_kva_mmap;
+			xsd_kva_intf->proc_fops = &xsd_kva_fops;
+			xsd_kva_intf->read_proc = xsd_kva_read;
+		}
+		xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
+		if (xsd_port_intf)
+			xsd_port_intf->read_proc = xsd_port_read;
+	} else
+#endif
+		/* Without the proc interface this statement is unconditional;
+		 * with it, it is the else branch of the dom0 test above. */
+		xenstored_ready = 1;
+
+	/* Initialize the interface to xenstore. */
+	err = xs_init();
+	if (err) {
+		printk(KERN_WARNING
+		       "XENBUS: Error initializing xenstore comms: %i\n", err);
+		return err;
+	}
+
+	if (!dom0)
+		xenbus_probe(NULL);
+
+	return 0;
+}
+
+postcore_initcall(xenbus_probe_init);
--- /dev/null
+++ linus-2.6/drivers/xen/xenbus/xenbus_xs.c
@@ -0,0 +1,829 @@
+/******************************************************************************
+ * xenbus_xs.c
+ *
+ * This is the kernel equivalent of the "xs" library.  We don't need everything
+ * and we use xenbus_comms for communication.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/unistd.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/fcntl.h>
+#include <linux/kthread.h>
+#include <linux/rwsem.h>
+#include <xen/xenbus.h>
+#include "xenbus_comms.h"
+
+/* xenbus_probe.c */
+extern char *kasprintf(const char *fmt, ...);
+
+/*
+ * One message received from xenstored, as queued by process_msg().
+ * Depending on hdr.type it sits either on xs_state.reply_list (u.reply) or
+ * on the watch_events list (u.watch).
+ */
+struct xs_stored_msg {
+	/* Linkage on reply_list or watch_events. */
+	struct list_head list;
+
+	/* Wire header exactly as read from the store connection. */
+	struct xsd_sockmsg hdr;
+
+	union {
+		/* Queued replies. */
+		struct {
+			/* kmalloc'ed payload, hdr.len bytes plus a NUL. */
+			char *body;
+		} reply;
+
+		/* Queued watch events. */
+		struct {
+			/* Registered watch matching the event's token. */
+			struct xenbus_watch *handle;
+			/* Result of split(): one allocation, kfree() once. */
+			char **vec;
+			/* Number of strings in vec. */
+			unsigned int vec_size;
+		} watch;
+	} u;
+};
+
+/* Bookkeeping for the kernel's connection to xenstored (see xs_state). */
+struct xs_handle {
+	/* A list of replies. Currently only one will ever be outstanding. */
+	struct list_head reply_list;
+	/* Protects reply_list. */
+	spinlock_t reply_lock;
+	/* read_reply() sleeps here until process_msg() queues a reply. */
+	wait_queue_head_t reply_waitq;
+
+	/* One request at a time. */
+	struct mutex request_mutex;
+
+	/*
+	 * Protect transactions against save/restore: taken for reading around
+	 * transactions and watch (un)registration, for writing by
+	 * xs_suspend()/xs_resume().
+	 */
+	struct rw_semaphore suspend_mutex;
+};
+
+/* The single xenstore connection state used by every function in this file. */
+static struct xs_handle xs_state;
+
+/* List of registered watches, and a lock to protect it. */
+static LIST_HEAD(watches);
+static DEFINE_SPINLOCK(watches_lock);
+
+/* List of pending watch callback events, and a lock to protect it. */
+static LIST_HEAD(watch_events);
+static DEFINE_SPINLOCK(watch_events_lock);
+
+/*
+ * Details of the xenwatch callback kernel thread. The thread waits on the
+ * watch_events_waitq for work to do (queued on watch_events list). When it
+ * wakes up it acquires the xenwatch_mutex before reading the list and
+ * carrying out work.
+ *
+ * xenwatch_mutex is left non-static (note the commented-out keyword);
+ * presumably it is also taken from another file -- confirm.
+ */
+static pid_t xenwatch_pid;
+/* static */ DEFINE_MUTEX(xenwatch_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
+
+/*
+ * Translate an error name sent by xenstored into an errno value by looking
+ * it up in xsd_errors[].
+ *
+ * Returns the matching errnum from the table, or EINVAL (after logging a
+ * warning) when the name is not listed.  The value is returned as stored in
+ * the table; xs_talkv() negates it before building an ERR_PTR().
+ */
+static int get_error(const char *errorstring)
+{
+	unsigned int i;
+
+	/* Bound the walk by the table size rather than by the last index. */
+	for (i = 0; i < ARRAY_SIZE(xsd_errors); i++) {
+		if (strcmp(errorstring, xsd_errors[i].errstring) == 0)
+			return xsd_errors[i].errnum;
+	}
+
+	/* Terminate the line so the warning does not run into the next one. */
+	printk(KERN_WARNING
+	       "XENBUS xen store gave: unknown error %s\n",
+	       errorstring);
+	return EINVAL;
+}
+
+/*
+ * Block until a reply is available on xs_state.reply_list, dequeue it and
+ * hand its payload to the caller.
+ *
+ * @type: receives the message type from the reply header.
+ * @len:  if non-NULL, receives the payload length from the reply header.
+ *
+ * Returns the reply body; the message wrapper itself is freed here.
+ */
+static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
+{
+	struct xs_stored_msg *reply;
+	void *payload;
+
+	spin_lock(&xs_state.reply_lock);
+
+	for (;;) {
+		if (!list_empty(&xs_state.reply_list))
+			break;
+
+		/* Drop the lock while sleeping, retake it to re-check. */
+		spin_unlock(&xs_state.reply_lock);
+		/* XXX FIXME: Avoid synchronous wait for response here. */
+		wait_event(xs_state.reply_waitq,
+			   !list_empty(&xs_state.reply_list));
+		spin_lock(&xs_state.reply_lock);
+	}
+
+	reply = list_entry(xs_state.reply_list.next,
+			   struct xs_stored_msg, list);
+	list_del(&reply->list);
+
+	spin_unlock(&xs_state.reply_lock);
+
+	payload = reply->u.reply.body;
+	*type = reply->hdr.type;
+	if (len != NULL)
+		*len = reply->hdr.len;
+
+	kfree(reply);
+
+	return payload;
+}
+
+/*
+ * Emergency write: send an XS_DEBUG "print" request carrying @count bytes
+ * of @str, followed by a terminating NUL.  No reply is read and the results
+ * of the individual writes are not checked.
+ */
+void xenbus_debug_write(const char *str, unsigned int count)
+{
+	static const char cmd[] = "print";
+	struct xsd_sockmsg hdr = { 0 };
+
+	hdr.type = XS_DEBUG;
+	/* Payload is "print\0" + str + "\0". */
+	hdr.len = sizeof(cmd) + count + 1;
+
+	mutex_lock(&xs_state.request_mutex);
+	xb_write(&hdr, sizeof(hdr));
+	xb_write(cmd, sizeof(cmd));
+	xb_write(str, count);
+	xb_write("", 1);
+	mutex_unlock(&xs_state.request_mutex);
+}
+
+/*
+ * Send a fully-formed request over the kernel's store connection and wait
+ * for the reply.
+ *
+ * @msg: request header; sizeof(*msg) + msg->len bytes are written starting
+ *       at @msg, so the payload is assumed to follow the header directly in
+ *       memory -- TODO confirm against callers.  On return msg->type and
+ *       msg->len describe the reply (msg->type is XS_ERROR if the write
+ *       failed).
+ *
+ * Returns the reply body from read_reply(), or ERR_PTR() if the write
+ * failed.
+ *
+ * suspend_mutex is taken for reading when a transaction is started and is
+ * released when the reply type is XS_TRANSACTION_END or when starting the
+ * transaction failed.
+ */
+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+{
+	void *ret;
+	/* Keep the request header: *msg is overwritten with the reply's. */
+	struct xsd_sockmsg req_msg = *msg;
+	int err;
+
+	if (req_msg.type == XS_TRANSACTION_START)
+		down_read(&xs_state.suspend_mutex);
+
+	mutex_lock(&xs_state.request_mutex);
+
+	err = xb_write(msg, sizeof(*msg) + msg->len);
+	if (err) {
+		msg->type = XS_ERROR;
+		ret = ERR_PTR(err);
+	} else
+		ret = read_reply(&msg->type, &msg->len);
+
+	mutex_unlock(&xs_state.request_mutex);
+
+	/* Balance the down_read() of this or of an earlier START request. */
+	if ((msg->type == XS_TRANSACTION_END) ||
+	    ((req_msg.type == XS_TRANSACTION_START) &&
+	     (msg->type == XS_ERROR)))
+		up_read(&xs_state.suspend_mutex);
+
+	return ret;
+}
+
+/*
+ * Send one request to xenstored and wait for the matching reply.
+ *
+ * @t:        transaction id placed in the request header.
+ * @type:     request type; the reply must carry the same type.
+ * @iovec:    payload fragments, written back to back after the header.
+ * @num_vecs: number of entries in @iovec.
+ * @len:      if non-NULL, receives the reply payload length.
+ *
+ * Returns the kmalloc'ed reply body, or ERR_PTR() on error: the write error,
+ * the negated errno named by an XS_ERROR reply, or -EINVAL when the reply
+ * type does not match the request.
+ */
+static void *xs_talkv(xenbus_transaction_t t,
+		      enum xsd_sockmsg_type type,
+		      const struct kvec *iovec,
+		      unsigned int num_vecs,
+		      unsigned int *len)
+{
+	struct xsd_sockmsg msg;
+	void *reply = NULL;
+	unsigned int n;
+	int err;
+
+	msg.tx_id = t;
+	msg.req_id = 0;
+	msg.type = type;
+	msg.len = 0;
+	for (n = 0; n < num_vecs; n++)
+		msg.len += iovec[n].iov_len;
+
+	mutex_lock(&xs_state.request_mutex);
+
+	/* Header first, then each fragment; stop at the first failure. */
+	err = xb_write(&msg, sizeof(msg));
+	for (n = 0; err == 0 && n < num_vecs; n++)
+		err = xb_write(iovec[n].iov_base, iovec[n].iov_len);
+
+	if (err != 0) {
+		mutex_unlock(&xs_state.request_mutex);
+		return ERR_PTR(err);
+	}
+
+	reply = read_reply(&msg.type, len);
+
+	mutex_unlock(&xs_state.request_mutex);
+
+	if (IS_ERR(reply))
+		return reply;
+
+	if (msg.type == XS_ERROR) {
+		/* The body of an error reply is the error's name. */
+		err = get_error(reply);
+		kfree(reply);
+		return ERR_PTR(-err);
+	}
+
+	if (msg.type == type)
+		return reply;
+
+	if (printk_ratelimit())
+		printk(KERN_WARNING
+		       "XENBUS unexpected type [%d], expected [%d]\n",
+		       msg.type, type);
+	kfree(reply);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convenience wrapper around xs_talkv() for requests whose payload is one
+ * NUL-terminated string (the terminator is sent as well).
+ */
+static void *xs_single(xenbus_transaction_t t,
+		       enum xsd_sockmsg_type type,
+		       const char *string,
+		       unsigned int *len)
+{
+	struct kvec vec = {
+		.iov_base = (void *)string,
+		.iov_len  = strlen(string) + 1,
+	};
+
+	return xs_talkv(t, type, &vec, 1, len);
+}
+
+/*
+ * Reduce a reply to a status code for commands that only need an ack:
+ * an ERR_PTR() reply yields its errno, any other reply is freed and
+ * reported as 0.
+ */
+static int xs_error(char *reply)
+{
+	int rc = 0;
+
+	if (IS_ERR(reply))
+		rc = PTR_ERR(reply);
+	else
+		kfree(reply);
+
+	return rc;
+}
+
+/*
+ * Count the NUL-terminated strings that start within the first @len bytes
+ * of @strings.
+ */
+static unsigned int count_strings(const char *strings, unsigned int len)
+{
+	const char *end = strings + len;
+	const char *cur = strings;
+	unsigned int total = 0;
+
+	while (cur < end) {
+		total++;
+		/* Step over this string and its terminator. */
+		cur += strlen(cur) + 1;
+	}
+
+	return total;
+}
+
+/*
+ * Build the store path "dir/name", or just "dir" when name is empty.
+ * Returns a buffer the caller must kfree(), or ERR_PTR(-ENOMEM).
+ */
+static char *join(const char *dir, const char *name)
+{
+	char *path;
+
+	if (name[0] != '\0')
+		path = kasprintf("%s/%s", dir, name);
+	else
+		path = kasprintf("%s", dir);
+
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+	return path;
+}
+
+/*
+ * Turn a buffer of back-to-back NUL-terminated strings into a vector of
+ * string pointers.
+ *
+ * @strings: kmalloc'ed buffer; always consumed (kfree()'d) by this call.
+ *           It must be readable and NUL-terminated at strings[len], as the
+ *           buffers built by process_msg() are.
+ * @len:     number of payload bytes in @strings.
+ * @num:     receives the number of strings found.
+ *
+ * Returns a single allocation holding the pointer array followed by a copy
+ * of the string data, so one kfree() releases everything, or
+ * ERR_PTR(-ENOMEM).
+ */
+static char **split(char *strings, unsigned int len, unsigned int *num)
+{
+	char *p, **ret;
+
+	/* Count the strings. */
+	*num = count_strings(strings, len);
+
+	/*
+	 * Transfer to one big alloc for easy freeing.  One extra byte holds a
+	 * terminator: without it, a payload whose final string lacks its NUL
+	 * would make the strlen() below run off the end of the copy.
+	 */
+	ret = kmalloc(*num * sizeof(char *) + len + 1, GFP_KERNEL);
+	if (!ret) {
+		kfree(strings);
+		return ERR_PTR(-ENOMEM);
+	}
+	memcpy(&ret[*num], strings, len);
+	kfree(strings);
+
+	strings = (char *)&ret[*num];
+	strings[len] = '\0';
+	for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
+		ret[(*num)++] = p;
+
+	return ret;
+}
+
+/*
+ * List the entries of the store directory dir/node.
+ *
+ * On success returns a vector of *num strings held in one allocation (free
+ * it with a single kfree()); on failure returns an ERR_PTR() value cast to
+ * char **.
+ */
+char **xenbus_directory(xenbus_transaction_t t,
+			const char *dir, const char *node, unsigned int *num)
+{
+	unsigned int reply_len;
+	char *path, *reply;
+
+	path = join(dir, node);
+	if (IS_ERR(path))
+		return (char **)path;
+
+	reply = xs_single(t, XS_DIRECTORY, path, &reply_len);
+	kfree(path);
+
+	return IS_ERR(reply) ? (char **)reply : split(reply, reply_len, num);
+}
+EXPORT_SYMBOL_GPL(xenbus_directory);
+
+/*
+ * Check if a path exists.
+ *
+ * Returns 1 if listing dir/node succeeds, 0 if it fails for any reason
+ * (the specific error is not reported).
+ */
+int xenbus_exists(xenbus_transaction_t t,
+		  const char *dir, const char *node)
+{
+	char **d;
+	/* xenbus_directory() takes an unsigned int * for the entry count. */
+	unsigned int dir_n;
+
+	d = xenbus_directory(t, dir, node, &dir_n);
+	if (IS_ERR(d))
+		return 0;
+	kfree(d);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(xenbus_exists);
+
+/* Get the value of a single file.
+ * Returns a kmalloc'ed value (kfree() it after use) or an ERR_PTR() value.
+ * If len is non-NULL it receives the length of the value in bytes.
+ */
+void *xenbus_read(xenbus_transaction_t t,
+		  const char *dir, const char *node, unsigned int *len)
+{
+	void *value;
+	char *path = join(dir, node);
+
+	if (IS_ERR(path))
+		return (void *)path;
+
+	value = xs_single(t, XS_READ, path, len);
+	kfree(path);
+
+	return value;
+}
+EXPORT_SYMBOL_GPL(xenbus_read);
+
+/* Write the value of a single file.
+ * The request carries the NUL-terminated path followed by the bytes of
+ * string without a terminator.  Returns 0, or -err on failure.
+ */
+int xenbus_write(xenbus_transaction_t t,
+		 const char *dir, const char *node, const char *string)
+{
+	struct kvec parts[2];
+	const char *path;
+	int rc;
+
+	path = join(dir, node);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	parts[0].iov_base = (void *)path;
+	parts[0].iov_len  = strlen(path) + 1;
+	parts[1].iov_base = (void *)string;
+	parts[1].iov_len  = strlen(string);
+
+	rc = xs_error(xs_talkv(t, XS_WRITE, parts, ARRAY_SIZE(parts), NULL));
+
+	kfree(path);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_write);
+
+/* Create the directory dir/node in the store.  Returns 0 or -errno. */
+int xenbus_mkdir(xenbus_transaction_t t,
+		 const char *dir, const char *node)
+{
+	char *path = join(dir, node);
+	int rc;
+
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	rc = xs_error(xs_single(t, XS_MKDIR, path, NULL));
+
+	kfree(path);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_mkdir);
+
+/*
+ * Destroy a file or directory (directories must be empty).
+ * Returns 0 or -errno.
+ */
+int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node)
+{
+	char *path = join(dir, node);
+	int rc;
+
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	rc = xs_error(xs_single(t, XS_RM, path, NULL));
+
+	kfree(path);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_rm);
+
+/* Start a transaction: changes by others will not be seen during this
+ * transaction, and changes will not be visible to others until end.
+ *
+ * On success stores the new transaction id in *t and returns 0 with
+ * suspend_mutex still held for reading; xenbus_transaction_end() drops it.
+ * On failure the lock is released here and -errno is returned.
+ */
+int xenbus_transaction_start(xenbus_transaction_t *t)
+{
+	char *id_str;
+
+	down_read(&xs_state.suspend_mutex);
+
+	id_str = xs_single(XBT_NULL, XS_TRANSACTION_START, "", NULL);
+	if (!IS_ERR(id_str)) {
+		/* The reply body is the transaction id in text form. */
+		*t = simple_strtoul(id_str, NULL, 0);
+		kfree(id_str);
+		return 0;
+	}
+
+	up_read(&xs_state.suspend_mutex);
+	return PTR_ERR(id_str);
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_start);
+
+/* End a transaction.
+ * If abort is true, transaction is discarded instead of committed.
+ * Always releases the read hold on suspend_mutex, whatever the outcome.
+ * Returns 0 or -errno.
+ */
+int xenbus_transaction_end(xenbus_transaction_t t, int abort)
+{
+	/* "F" asks the store to discard the transaction, "T" to commit it. */
+	const char *verdict = abort ? "F" : "T";
+	int err;
+
+	err = xs_error(xs_single(t, XS_TRANSACTION_END, verdict, NULL));
+
+	up_read(&xs_state.suspend_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xenbus_transaction_end);
+
+/*
+ * Single read and scanf: read dir/node and parse it with fmt.
+ * Returns the number of items scanned, -ERANGE if nothing could be
+ * scanned, or the -errno from the read.
+ */
+int xenbus_scanf(xenbus_transaction_t t,
+		 const char *dir, const char *node, const char *fmt, ...)
+{
+	va_list args;
+	char *val;
+	int scanned;
+
+	val = xenbus_read(t, dir, node, NULL);
+	if (IS_ERR(val))
+		return PTR_ERR(val);
+
+	va_start(args, fmt);
+	scanned = vsscanf(val, fmt, args);
+	va_end(args);
+	kfree(val);
+
+	/* Distinctive errno. */
+	return (scanned == 0) ? -ERANGE : scanned;
+}
+EXPORT_SYMBOL_GPL(xenbus_scanf);
+
+/*
+ * Single printf and write: format into a temporary buffer and store the
+ * result at dir/node.  Returns -errno or 0.
+ * Output that does not fit in PRINTF_BUFFER_SIZE bytes triggers BUG().
+ */
+int xenbus_printf(xenbus_transaction_t t,
+		  const char *dir, const char *node, const char *fmt, ...)
+{
+	va_list args;
+	int len, rc;
+#define PRINTF_BUFFER_SIZE 4096
+	char *buf;
+
+	buf = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	va_start(args, fmt);
+	len = vsnprintf(buf, PRINTF_BUFFER_SIZE, fmt, args);
+	va_end(args);
+
+	BUG_ON(len > PRINTF_BUFFER_SIZE-1);
+	rc = xenbus_write(t, dir, node, buf);
+
+	kfree(buf);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_printf);
+
+/*
+ * Read several nodes under dir in one call.  The variable arguments are
+ * (name, fmt, result) triples terminated by a NULL name:
+ *  - with a non-NULL fmt, the node's value is parsed into result with
+ *    sscanf(), and -EINVAL is reported if nothing could be scanned;
+ *  - with a NULL fmt, result is a char ** that receives the kmalloc'ed
+ *    value itself.
+ * Processing stops at the first failure.  Returns 0 or -errno.
+ */
+int xenbus_gather(xenbus_transaction_t t, const char *dir, ...)
+{
+	va_list args;
+	const char *name;
+	int ret = 0;
+
+	va_start(args, dir);
+	for (;;) {
+		const char *fmt;
+		void *result;
+		char *value;
+
+		if (ret != 0)
+			break;
+		name = va_arg(args, char *);
+		if (name == NULL)
+			break;
+
+		fmt = va_arg(args, char *);
+		result = va_arg(args, void *);
+
+		value = xenbus_read(t, dir, name, NULL);
+		if (IS_ERR(value)) {
+			ret = PTR_ERR(value);
+			break;
+		}
+
+		if (!fmt) {
+			/* Hand the raw string to the caller. */
+			*(char **)result = value;
+			continue;
+		}
+
+		if (sscanf(value, fmt, result) == 0)
+			ret = -EINVAL;
+		kfree(value);
+	}
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xenbus_gather);
+
+/*
+ * Ask xenstored to watch path, tagging events with token.  Both strings
+ * are sent with their terminating NUL.  Returns 0 or -errno.
+ */
+static int xs_watch(const char *path, const char *token)
+{
+	struct kvec iov[2] = {
+		{ .iov_base = (void *)path,  .iov_len = strlen(path) + 1 },
+		{ .iov_base = (void *)token, .iov_len = strlen(token) + 1 },
+	};
+
+	return xs_error(xs_talkv(XBT_NULL, XS_WATCH, iov,
+				 ARRAY_SIZE(iov), NULL));
+}
+
+/*
+ * Ask xenstored to drop the watch on path identified by token.  Both
+ * strings are sent with their terminating NUL.  Returns 0 or -errno.
+ */
+static int xs_unwatch(const char *path, const char *token)
+{
+	struct kvec iov[2] = {
+		{ .iov_base = (char *)path,  .iov_len = strlen(path) + 1 },
+		{ .iov_base = (char *)token, .iov_len = strlen(token) + 1 },
+	};
+
+	return xs_error(xs_talkv(XBT_NULL, XS_UNWATCH, iov,
+				 ARRAY_SIZE(iov), NULL));
+}
+
+/*
+ * Map a watch token back to its registered watch.  The token is the watch
+ * structure's address in hex; it is only trusted if that address is found
+ * on the watches list.  Returns the watch, or NULL if it is not registered.
+ */
+static struct xenbus_watch *find_watch(const char *token)
+{
+	struct xenbus_watch *wanted = (void *)simple_strtoul(token, NULL, 16);
+	struct xenbus_watch *w;
+
+	list_for_each_entry(w, &watches, list) {
+		if (w == wanted)
+			return w;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register callback to watch this node.
+ *
+ * Adds @watch to the watches list and asks xenstored to watch watch->node.
+ * Returns the result of xs_watch().  Note that -EEXIST is passed back to
+ * the caller even though the watch stays on the list in that case.
+ * BUGs if the watch is already on the list.
+ */
+int register_xenbus_watch(struct xenbus_watch *watch)
+{
+	/* Pointer in ascii is the token. */
+	char token[sizeof(watch) * 2 + 1];
+	int err;
+
+	sprintf(token, "%lX", (long)watch);
+
+	/* Held for reading across the whole registration. */
+	down_read(&xs_state.suspend_mutex);
+
+	/*
+	 * List the watch before telling xenstored about it; presumably so
+	 * that process_msg() can match events that arrive immediately.
+	 */
+	spin_lock(&watches_lock);
+	BUG_ON(find_watch(token));
+	list_add(&watch->list, &watches);
+	spin_unlock(&watches_lock);
+
+	err = xs_watch(watch->node, token);
+
+	/* Ignore errors due to multiple registration. */
+	if ((err != 0) && (err != -EEXIST)) {
+		spin_lock(&watches_lock);
+		list_del(&watch->list);
+		spin_unlock(&watches_lock);
+	}
+
+	up_read(&xs_state.suspend_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_xenbus_watch);
+
+/*
+ * Remove a watch registered with register_xenbus_watch().
+ *
+ * Takes the watch off the watches list, asks xenstored to drop it (a
+ * failure is only logged), discards events already queued for it, and
+ * waits for a callback that may be running in the xenwatch thread.
+ * BUGs if the watch is not on the list.
+ */
+void unregister_xenbus_watch(struct xenbus_watch *watch)
+{
+	struct xs_stored_msg *msg, *tmp;
+	/* Same token format as used at registration: the pointer in hex. */
+	char token[sizeof(watch) * 2 + 1];
+	int err;
+
+	sprintf(token, "%lX", (long)watch);
+
+	down_read(&xs_state.suspend_mutex);
+
+	spin_lock(&watches_lock);
+	BUG_ON(!find_watch(token));
+	list_del(&watch->list);
+	spin_unlock(&watches_lock);
+
+	err = xs_unwatch(watch->node, token);
+	if (err)
+		printk(KERN_WARNING
+		       "XENBUS Failed to release watch %s: %i\n",
+		       watch->node, err);
+
+	up_read(&xs_state.suspend_mutex);
+
+	/* Cancel pending watch events. */
+	spin_lock(&watch_events_lock);
+	list_for_each_entry_safe(msg, tmp, &watch_events, list) {
+		if (msg->u.watch.handle != watch)
+			continue;
+		list_del(&msg->list);
+		kfree(msg->u.watch.vec);
+		kfree(msg);
+	}
+	spin_unlock(&watch_events_lock);
+
+	/*
+	 * Flush any currently-executing callback, unless we are it. :-)
+	 * xenwatch_thread() holds xenwatch_mutex while it runs a callback,
+	 * so taking and releasing the mutex waits for that callback.
+	 */
+	if (current->pid != xenwatch_pid) {
+		mutex_lock(&xenwatch_mutex);
+		mutex_unlock(&xenwatch_mutex);
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
+
+/*
+ * Quiesce the store connection: take suspend_mutex for writing and then
+ * request_mutex.  Both stay held on return; xs_resume() releases them.
+ */
+void xs_suspend(void)
+{
+	down_write(&xs_state.suspend_mutex);
+	mutex_lock(&xs_state.request_mutex);
+}
+
+/*
+ * Counterpart of xs_suspend(): release request_mutex, re-issue every
+ * registered watch to xenstored, then release suspend_mutex.  Failures of
+ * the individual xs_watch() calls are ignored.
+ */
+void xs_resume(void)
+{
+	struct xenbus_watch *watch;
+	char token[sizeof(watch) * 2 + 1];
+
+	/* Requests must be possible again before xs_watch() is called. */
+	mutex_unlock(&xs_state.request_mutex);
+
+	/* No need for watches_lock: the suspend_mutex is sufficient. */
+	list_for_each_entry(watch, &watches, list) {
+		sprintf(token, "%lX", (long)watch);
+		xs_watch(watch->node, token);
+	}
+
+	up_write(&xs_state.suspend_mutex);
+}
+
+/*
+ * Body of the "xenwatch" kernel thread: dequeue watch events one at a time
+ * and run the owning watch's callback with xenwatch_mutex held.
+ * Returns 0 when asked to stop.
+ */
+static int xenwatch_thread(void *unused)
+{
+	struct list_head *ent;
+	struct xs_stored_msg *msg;
+
+	for (;;) {
+		/* The return value (e.g. on a signal) is not examined. */
+		wait_event_interruptible(watch_events_waitq,
+					 !list_empty(&watch_events));
+
+		if (kthread_should_stop())
+			break;
+
+		/* Held across the callback; see unregister_xenbus_watch(). */
+		mutex_lock(&xenwatch_mutex);
+
+		/*
+		 * Re-check under the lock: the list may have been emptied
+		 * again (unregister_xenbus_watch() cancels pending events).
+		 */
+		spin_lock(&watch_events_lock);
+		ent = watch_events.next;
+		if (ent != &watch_events)
+			list_del(ent);
+		spin_unlock(&watch_events_lock);
+
+		if (ent != &watch_events) {
+			msg = list_entry(ent, struct xs_stored_msg, list);
+			msg->u.watch.handle->callback(
+				msg->u.watch.handle,
+				(const char **)msg->u.watch.vec,
+				msg->u.watch.vec_size);
+			kfree(msg->u.watch.vec);
+			kfree(msg);
+		}
+
+		mutex_unlock(&xenwatch_mutex);
+	}
+
+	return 0;
+}
+
+/*
+ * Read one message (header, then body) from the store connection and queue
+ * it for its consumer:
+ *  - XS_WATCH_EVENT messages are split into a string vector and, if their
+ *    token matches a registered watch, queued on watch_events for the
+ *    xenwatch thread; events for unknown watches are dropped;
+ *  - every other message is queued on xs_state.reply_list for read_reply().
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL for a watch
+ * event that carries no token, or the error from xb_read().
+ */
+static int process_msg(void)
+{
+	struct xs_stored_msg *msg;
+	char *body;
+	int err;
+
+	msg = kmalloc(sizeof(*msg), GFP_KERNEL);
+	if (msg == NULL)
+		return -ENOMEM;
+
+	err = xb_read(&msg->hdr, sizeof(msg->hdr));
+	if (err) {
+		kfree(msg);
+		return err;
+	}
+
+	/* One extra byte so the payload can always be NUL-terminated. */
+	body = kmalloc(msg->hdr.len + 1, GFP_KERNEL);
+	if (body == NULL) {
+		kfree(msg);
+		return -ENOMEM;
+	}
+
+	err = xb_read(body, msg->hdr.len);
+	if (err) {
+		kfree(body);
+		kfree(msg);
+		return err;
+	}
+	body[msg->hdr.len] = '\0';
+
+	if (msg->hdr.type == XS_WATCH_EVENT) {
+		/* split() consumes body whether it succeeds or fails. */
+		msg->u.watch.vec = split(body, msg->hdr.len,
+					 &msg->u.watch.vec_size);
+		if (IS_ERR(msg->u.watch.vec)) {
+			/* Fetch the error before msg is freed. */
+			err = PTR_ERR(msg->u.watch.vec);
+			kfree(msg);
+			return err;
+		}
+
+		/* The token is looked up by index, so it must be present. */
+		if (msg->u.watch.vec_size <= XS_WATCH_TOKEN) {
+			kfree(msg->u.watch.vec);
+			kfree(msg);
+			return -EINVAL;
+		}
+
+		spin_lock(&watches_lock);
+		msg->u.watch.handle = find_watch(
+			msg->u.watch.vec[XS_WATCH_TOKEN]);
+		if (msg->u.watch.handle != NULL) {
+			spin_lock(&watch_events_lock);
+			list_add_tail(&msg->list, &watch_events);
+			wake_up(&watch_events_waitq);
+			spin_unlock(&watch_events_lock);
+		} else {
+			/* No such watch is registered: drop the event. */
+			kfree(msg->u.watch.vec);
+			kfree(msg);
+		}
+		spin_unlock(&watches_lock);
+	} else {
+		msg->u.reply.body = body;
+		spin_lock(&xs_state.reply_lock);
+		list_add_tail(&msg->list, &xs_state.reply_list);
+		spin_unlock(&xs_state.reply_lock);
+		wake_up(&xs_state.reply_waitq);
+	}
+
+	return 0;
+}
+
+/*
+ * Body of the "xenbus" kernel thread: keep pulling messages off the store
+ * connection with process_msg(), logging any error, until asked to stop.
+ */
+static int xenbus_thread(void *unused)
+{
+	int rc;
+
+	do {
+		rc = process_msg();
+		if (rc != 0)
+			printk(KERN_WARNING "XENBUS error %d while reading "
+			       "message\n", rc);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/*
+ * Set up the xenstore client: initialise xs_state, bring up the
+ * communication rings and start the "xenwatch" and "xenbus" kernel threads.
+ * Returns 0 or -errno.
+ *
+ * NOTE(review): if starting the "xenbus" thread fails, the already running
+ * "xenwatch" thread is left in place -- confirm this is acceptable.
+ */
+int xs_init(void)
+{
+	struct task_struct *thread;
+	int rc;
+
+	INIT_LIST_HEAD(&xs_state.reply_list);
+	spin_lock_init(&xs_state.reply_lock);
+	init_waitqueue_head(&xs_state.reply_waitq);
+
+	mutex_init(&xs_state.request_mutex);
+	init_rwsem(&xs_state.suspend_mutex);
+
+	/* Initialize the shared memory rings to talk to xenstored */
+	rc = xb_init_comms();
+	if (rc != 0)
+		return rc;
+
+	thread = kthread_run(xenwatch_thread, NULL, "xenwatch");
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+	/* Remembered so unregister_xenbus_watch() can recognise the thread. */
+	xenwatch_pid = thread->pid;
+
+	thread = kthread_run(xenbus_thread, NULL, "xenbus");
+	return IS_ERR(thread) ? PTR_ERR(thread) : 0;
+}
--- /dev/null
+++ linus-2.6/include/xen/xenbus.h
@@ -0,0 +1,291 @@
+/******************************************************************************
+ * xenbus.h
+ *
+ * Talks to Xen Store to figure out what devices we have.
+ *
+ * Copyright (C) 2005 Rusty Russell, IBM Corporation
+ * Copyright (C) 2005 XenSource Ltd.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_XENBUS_H
+#define _XEN_XENBUS_H
+
+#include <linux/device.h>
+#include <linux/notifier.h>
+#include <linux/mutex.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/xenbus.h>
+#include <xen/interface/io/xs_wire.h>
+
+#define XBT_NULL 0
+
+/*
+ * A watch on a xenstore node: describes the path to watch and the callback
+ * to run when an event for it arrives.  Pass it to register_xenbus_watch().
+ */
+struct xenbus_watch
+{
+	/* Linkage on the list of registered watches. */
+	struct list_head list;
+
+	/* Path being watched. */
+	const char *node;
+
+	/* Callback (executed in a process context with no locks held). */
+	void (*callback)(struct xenbus_watch *,
+			 const char **vec, unsigned int len);
+};
+
+
+/*
+ * A xenbus device.
+ * NOTE(review): the field notes below are inferred from the field names and
+ * types only -- confirm against the xenbus probe code.
+ */
+struct xenbus_device {
+	const char *devicetype;
+	/* presumably this device's own node path in the store */
+	const char *nodename;
+	/* presumably the peer's path and domain id */
+	const char *otherend;
+	int otherend_id;
+	/* presumably a watch on the peer -- confirm */
+	struct xenbus_watch otherend_watch;
+	/* Embedded driver-model device; see to_xenbus_device(). */
+	struct device dev;
+	XenbusState state;
+	void *data;
+};
+
+/* Recover the enclosing xenbus_device from a pointer to its embedded dev. */
+static inline struct xenbus_device *to_xenbus_device(struct device *dev)
+{
+	return container_of(dev, struct xenbus_device, dev);
+}
+
+/* Identifies a class of devices; a driver lists these in xenbus_driver.ids. */
+struct xenbus_device_id
+{
+	/* .../device/<device_type>/<identifier> */
+	char devicetype[32]; 	/* General class of device. */
+};
+
+/*
+ * A xenbus driver: the callbacks a frontend or backend driver supplies,
+ * registered with xenbus_register_frontend()/xenbus_register_backend().
+ */
+struct xenbus_driver {
+	char *name;
+	struct module *owner;
+	/* Device ids this driver handles. */
+	const struct xenbus_device_id *ids;
+	int (*probe)(struct xenbus_device *dev,
+		     const struct xenbus_device_id *id);
+	/* presumably invoked when the peer's state changes -- confirm */
+	void (*otherend_changed)(struct xenbus_device *dev,
+				 XenbusState backend_state);
+	int (*remove)(struct xenbus_device *dev);
+	int (*suspend)(struct xenbus_device *dev);
+	int (*resume)(struct xenbus_device *dev);
+	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
+	/* Embedded driver-model driver; see to_xenbus_driver(). */
+	struct device_driver driver;
+	int (*read_otherend_details)(struct xenbus_device *dev);
+};
+
+/* Recover the enclosing xenbus_driver from its embedded device_driver. */
+static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
+{
+	return container_of(drv, struct xenbus_driver, driver);
+}
+
+int xenbus_register_frontend(struct xenbus_driver *drv);
+int xenbus_register_backend(struct xenbus_driver *drv);
+void xenbus_unregister_driver(struct xenbus_driver *drv);
+
+typedef u32 xenbus_transaction_t;
+
+char **xenbus_directory(xenbus_transaction_t t,
+			const char *dir, const char *node, unsigned int *num);
+void *xenbus_read(xenbus_transaction_t t,
+		  const char *dir, const char *node, unsigned int *len);
+int xenbus_write(xenbus_transaction_t t,
+		 const char *dir, const char *node, const char *string);
+int xenbus_mkdir(xenbus_transaction_t t,
+		 const char *dir, const char *node);
+int xenbus_exists(xenbus_transaction_t t,
+		  const char *dir, const char *node);
+int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node);
+int xenbus_transaction_start(xenbus_transaction_t *t);
+int xenbus_transaction_end(xenbus_transaction_t t, int abort);
+
+/* Single read and scanf: returns -errno or num scanned if > 0. */
+int xenbus_scanf(xenbus_transaction_t t,
+		 const char *dir, const char *node, const char *fmt, ...)
+	__attribute__((format(scanf, 4, 5)));
+
+/* Single printf and write: returns -errno or 0. */
+int xenbus_printf(xenbus_transaction_t t,
+		  const char *dir, const char *node, const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
+
+/* Generic read function: NULL-terminated triples of name,
+ * scanf-style format string, and pointer. Returns 0 or -errno. */
+int xenbus_gather(xenbus_transaction_t t, const char *dir, ...);
+
+/* notifier routines for when the xenstore comes up */
+int register_xenstore_notifier(struct notifier_block *nb);
+void unregister_xenstore_notifier(struct notifier_block *nb);
+
+int register_xenbus_watch(struct xenbus_watch *watch);
+void unregister_xenbus_watch(struct xenbus_watch *watch);
+void xs_suspend(void);
+void xs_resume(void);
+
+/* Used by xenbus_dev to borrow kernel's store connection. */
+void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
+
+/* Called from xen core code. */
+void xenbus_suspend(void);
+void xenbus_resume(void);
+
+/*
+ * Check the result of a string read.  An empty string is treated as an
+ * error: it is kfree()'d and str is replaced by ERR_PTR(-ERANGE).
+ * Evaluates to non-zero if str is (now) an error pointer.  str is evaluated
+ * several times and assigned to, so it must be a plain variable.
+ */
+#define XENBUS_IS_ERR_READ(str) ({			\
+	if (!IS_ERR(str) && strlen(str) == 0) {		\
+		kfree(str);				\
+		str = ERR_PTR(-ERANGE);			\
+	}						\
+	IS_ERR(str);					\
+})
+
+#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
+
+
+/**
+ * Register a watch on the given path, using the given xenbus_watch structure
+ * for storage, and the given callback function as the callback.  Return 0 on
+ * success, or -errno on error.  On success, the given path will be saved as
+ * watch->node, and remains the caller's to free.  On error, watch->node will
+ * be NULL, the device will switch to XenbusStateClosing, and the error will
+ * be saved in the store.
+ */
+int xenbus_watch_path(struct xenbus_device *dev, const char *path,
+		      struct xenbus_watch *watch,
+		      void (*callback)(struct xenbus_watch *,
+				       const char **, unsigned int));
+
+
+/**
+ * Register a watch on the given path/path2, using the given xenbus_watch
+ * structure for storage, and the given callback function as the callback.
+ * Return 0 on success, or -errno on error.  On success, the watched path
+ * (path/path2) will be saved as watch->node, and becomes the caller's to
+ * kfree().  On error, watch->node will be NULL, so the caller has nothing to
+ * free, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+		       const char *path2, struct xenbus_watch *watch,
+		       void (*callback)(struct xenbus_watch *,
+					const char **, unsigned int));
+
+
+/**
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Perform the change inside the given transaction xbt.  xbt may be XBT_NULL,
+ * in which case this is performed inside its own transaction.  Return 0 on
+ * success, or -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev,
+			xenbus_transaction_t xbt,
+			XenbusState new_state);
+
+
+/**
+ * Grant access to the given ring_mfn to the peer of the given device.  Return
+ * 0 on success, or -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
+
+
+/**
+ * Map a page of memory into this domain from another domain's grant table.
+ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
+ * page to that address, and sets *vaddr to that address.
+ * xenbus_map_ring does not allocate the virtual address space (you must do
+ * this yourself!). It only maps in the page to the specified address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM on error. If an error is returned, device will switch to
+ * XenbusStateClosing and the error message will be saved in XenStore.
+ */
+int xenbus_map_ring_valloc(struct xenbus_device *dev,
+			   int gnt_ref, void **vaddr);
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+			   grant_handle_t *handle, void *vaddr);
+
+
+/**
+ * Unmap a page of memory in this domain that was imported from another domain.
+ * Use xenbus_unmap_ring_vfree if you mapped in your memory with
+ * xenbus_map_ring_valloc (it will free the virtual address space).
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
+int xenbus_unmap_ring(struct xenbus_device *dev,
+		      grant_handle_t handle, void *vaddr);
+
+
+/**
+ * Allocate an event channel for the given xenbus_device, assigning the newly
+ * created local port to *port.  Return 0 on success, or -errno on error.  On
+ * error, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
+
+
+/**
+ * Bind to an existing interdomain event channel in another domain. Returns 0
+ * on success and stores the local port in *port. On error, returns -errno,
+ * switches the device to XenbusStateClosing, and saves the error in XenStore.
+ */
+int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
+
+
+/**
+ * Free an existing event channel. Returns 0 on success or -errno on error.
+ */
+int xenbus_free_evtchn(struct xenbus_device *dev, int port);
+
+
+/**
+ * Return the state of the driver rooted at the given store path, or
+ * XenbusStateClosed if no state can be read.
+ */
+XenbusState xenbus_read_driver_state(const char *path);
+
+
+/***
+ * Report the given negative errno into the store, along with the given
+ * formatted message.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+		      ...);
+
+
+/***
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, XBT_NULL, XenbusStateClosing) to schedule an
+ * orderly closedown of this driver and its peer.
+ */
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
+		      ...);
+
+
+#endif /* _XEN_XENBUS_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (32 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 11:55   ` [Xen-devel] " Herbert Xu
                     ` (4 more replies)
  2006-05-09  7:00 ` [RFC PATCH 35/35] Add Xen virtual block " Chris Wright
  2006-05-09 14:49 ` [RFC PATCH 00/35] Xen i386 paravirtualization support Martin J. Bligh
  35 siblings, 5 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach, netdev

[-- Attachment #1: netfront --]
[-- Type: text/plain, Size: 46804 bytes --]

The network device frontend driver allows the kernel to access network
devices exported by a virtual machine containing a physical network
device driver.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: netdev@vger.kernel.org
---
TODO:
- drop proc
- more ethtool ops
- s/g support

 drivers/net/Kconfig             |    2 
 drivers/xen/Kconfig.net         |   14 
 drivers/xen/Makefile            |    3 
 drivers/xen/net_driver_util.c   |   58 +
 drivers/xen/netfront/Makefile   |    4 
 drivers/xen/netfront/netfront.c | 1510 ++++++++++++++++++++++++++++++++++++++++
 include/xen/net_driver_util.h   |   48 +
 7 files changed, 1639 insertions(+)

--- linus-2.6.orig/drivers/net/Kconfig
+++ linus-2.6/drivers/net/Kconfig
@@ -2325,6 +2325,8 @@ source "drivers/atm/Kconfig"
 
 source "drivers/s390/net/Kconfig"
 
+source "drivers/xen/Kconfig.net"
+
 config ISERIES_VETH
 	tristate "iSeries Virtual Ethernet driver support"
 	depends on PPC_ISERIES
--- linus-2.6.orig/drivers/xen/Makefile
+++ linus-2.6/drivers/xen/Makefile
@@ -1,7 +1,10 @@
 
+obj-y	+= net_driver_util.o
 obj-y	+= util.o
 
 obj-y	+= core/
 obj-y	+= console/
 obj-y	+= xenbus/
 
+obj-$(CONFIG_XEN_NETDEV_FRONTEND)	+= netfront/
+
--- /dev/null
+++ linus-2.6/drivers/xen/Kconfig.net
@@ -0,0 +1,14 @@
+menu "Xen network device drivers"
+        depends on NETDEVICES && XEN
+
+config XEN_NETDEV_FRONTEND
+	tristate "Network-device frontend driver"
+	depends on XEN
+	default y
+	help
+	  The network-device frontend driver allows the kernel to access
+	  network interfaces within another guest OS. Unless you are building a
+	  dedicated device-driver domain, or your master control domain
+	  (domain 0), then you almost certainly want to say Y here.
+
+endmenu
--- /dev/null
+++ linus-2.6/drivers/xen/net_driver_util.c
@@ -0,0 +1,58 @@
+/*****************************************************************************
+ *
+ * Utility functions for Xen network devices.
+ *
+ * Copyright (c) 2005 XenSource Ltd.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/if_ether.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <xen/net_driver_util.h>
+
+
+int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+	/* Parse ETH_ALEN ':'-separated hex octets from <nodename>/mac. */
+	char *s, *e;
+	int i, err = 0;
+	char *macstr = xenbus_read(XBT_NULL, dev->nodename, "mac", NULL);
+	if (IS_ERR(macstr))
+		return PTR_ERR(macstr);
+	s = macstr;
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac[i] = simple_strtoul(s, &e, 16);
+		/* ':' must follow all but the last octet, NUL the last. */
+		if (s == e || e[0] != ((i == ETH_ALEN - 1) ? 0 : ':')) {
+			err = -ENOENT;
+			break;
+		}
+		s = &e[1];
+	}
+	kfree(macstr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xen_net_read_mac);
--- /dev/null
+++ linus-2.6/drivers/xen/netfront/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_XEN_NETDEV_FRONTEND)	:= xennet.o
+
+xennet-objs := netfront.o
--- /dev/null
+++ linus-2.6/drivers/xen/netfront/netfront.c
@@ -0,0 +1,1510 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/proc_fs.h>
+#include <linux/ethtool.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/netif.h>
+#include <xen/interface/memory.h>
+#ifdef CONFIG_XEN_BALLOON
+#include <xen/balloon.h>
+#endif
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+#include <xen/net_driver_util.h>
+
+#define GRANT_INVALID_REF	0
+
+#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
+#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
+
+static inline void init_skb_shinfo(struct sk_buff *skb)
+{
+	atomic_set(&(skb_shinfo(skb)->dataref), 1);
+	skb_shinfo(skb)->nr_frags = 0;
+	skb_shinfo(skb)->frag_list = NULL;
+}
+
+struct netfront_info
+{
+	struct list_head list;
+	struct net_device *netdev;
+
+	struct net_device_stats stats;
+	unsigned int tx_full;
+
+	struct netif_tx_front_ring tx;
+	struct netif_rx_front_ring rx;
+
+	spinlock_t   tx_lock;
+	spinlock_t   rx_lock;
+
+	unsigned int handle;
+	unsigned int evtchn, irq;
+
+	/* What is the status of our connection to the remote backend? */
+#define BEST_CLOSED       0
+#define BEST_DISCONNECTED 1
+#define BEST_CONNECTED    2
+	unsigned int backend_state;
+
+	/* Is this interface open or closed (up or down)? */
+#define UST_CLOSED        0
+#define UST_OPEN          1
+	unsigned int user_state;
+
+	/* Receive-ring batched refills. */
+#define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
+#define RX_MAX_TARGET NET_RX_RING_SIZE
+	int rx_min_target, rx_max_target, rx_target;
+	struct sk_buff_head rx_batch;
+
+	struct timer_list rx_refill_timer;
+
+	/*
+	 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
+	 * array is an index into a chain of free entries.
+	 */
+	struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
+	struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
+	/* Per-id grant references; each array is sized like its skb array. */
+	grant_ref_t gref_tx_head;
+	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
+	grant_ref_t gref_rx_head;
+	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE + 1];
+
+	struct xenbus_device *xbdev;
+	int tx_ring_ref;
+	int rx_ring_ref;
+	u8 mac[ETH_ALEN];
+
+	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
+	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+};
+
+/*
+ * Helpers for acquiring and freeing slots in {tx,rx}_skbs[].
+ */
+
+static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
+{
+	list[id] = list[0];
+	list[0]  = (void *)(unsigned long)id;
+}
+
+static inline unsigned short get_id_from_freelist(struct sk_buff **list)
+{
+	unsigned int id = (unsigned int)(unsigned long)list[0];
+	list[0] = list[id];
+	return id;
+}
+
+#ifdef DEBUG
+static char *be_state_name[] = {
+	[BEST_CLOSED]       = "closed",
+	[BEST_DISCONNECTED] = "disconnected",
+	[BEST_CONNECTED]    = "connected",
+};
+#endif
+
+#define DPRINTK(fmt, args...) pr_debug("netfront (%s:%d) " fmt, \
+                                       __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...)				\
+	printk(KERN_INFO "netfront: " fmt, ##args)
+#define WPRINTK(fmt, args...)				\
+	printk(KERN_WARNING "netfront: " fmt, ##args)
+
+
+static int talk_to_backend(struct xenbus_device *, struct netfront_info *);
+static int setup_device(struct xenbus_device *, struct netfront_info *);
+static int create_netdev(int, struct xenbus_device *, struct net_device **);
+
+static void netfront_closing(struct xenbus_device *);
+
+static void end_access(int, void *);
+static void netif_disconnect_backend(struct netfront_info *);
+static void close_netdev(struct netfront_info *);
+static void netif_free(struct netfront_info *);
+
+static void show_device(struct netfront_info *);
+
+static void network_connect(struct net_device *);
+static void network_tx_buf_gc(struct net_device *);
+static void network_alloc_rx_buffers(struct net_device *);
+static int send_fake_arp(struct net_device *);
+
+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
+
+#define XEN_XENNET_PROC_INTERFACE
+#if defined(CONFIG_PROC_FS) && defined(XEN_XENNET_PROC_INTERFACE)
+static int xennet_proc_init(void);
+static int xennet_proc_addif(struct net_device *dev);
+static void xennet_proc_delif(struct net_device *dev);
+#else
+#define xennet_proc_init()  (0)
+#define xennet_proc_addif(d) (0)
+#define xennet_proc_delif(d) ((void)0)
+#endif
+
+
+/**
+ * Entry point when a new device is created: read the "handle" node, create
+ * the net device, and call talk_to_backend() to set up and publish the rings.
+ * NOTE(review): the talk_to_backend() failure path kfree()s netdev_priv(),
+ * which looks like part of the alloc_etherdev() allocation -- confirm cleanup.
+ */
+static int netfront_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	int err;
+	struct net_device *netdev;
+	struct netfront_info *info;
+	unsigned int handle;
+
+	err = xenbus_scanf(XBT_NULL, dev->nodename, "handle", "%u", &handle);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading handle");
+		return err;
+	}
+
+	err = create_netdev(handle, dev, &netdev);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "creating netdev");
+		return err;
+	}
+
+	info = netdev_priv(netdev);
+	dev->data = info;
+
+	err = talk_to_backend(dev, info);
+	if (err) {
+		kfree(info);
+		dev->data = NULL;
+		return err;
+	}
+
+	return 0;
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  We tear down our netif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int netfront_resume(struct xenbus_device *dev)
+{
+	struct netfront_info *info = dev->data;
+
+	DPRINTK("%s\n", dev->nodename);
+
+	netif_disconnect_backend(info);
+	return talk_to_backend(dev, info);
+}
+
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+			   struct netfront_info *info)
+{
+	const char *message;
+	xenbus_transaction_t xbt;
+	int err;
+
+	err = xen_net_read_mac(dev, info->mac);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+		goto out;
+	}
+
+	/* Create shared ring, alloc event channel. */
+	err = setup_device(dev, info);
+	if (err)
+		goto out;
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_ring;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
+			    info->tx_ring_ref);
+	if (err) {
+		message = "writing tx ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
+			    info->rx_ring_ref);
+	if (err) {
+		message = "writing rx ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "event-channel", "%u", info->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename,
+			    "state", "%d", XenbusStateConnected);
+	if (err) {
+		message = "writing frontend XenbusStateConnected";
+		goto abort_transaction;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_ring;
+	}
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_ring:
+	netif_free(info);
+ out:
+	return err;
+}
+
+/* Allocate and grant the tx/rx shared rings and bind the event channel. */
+static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
+{
+	struct netif_tx_sring *txs;
+	struct netif_rx_sring *rxs;
+	int err;
+	struct net_device *netdev = info->netdev;
+
+	info->tx_ring_ref = GRANT_INVALID_REF;
+	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->rx.sring = NULL;
+	info->tx.sring = NULL;
+	info->irq = 0;
+
+	txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
+	if (!txs) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err, "allocating tx ring page");
+		goto fail;
+	}
+	rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
+	if (!rxs) {
+		err = -ENOMEM;
+		xenbus_dev_fatal(dev, err, "allocating rx ring page");
+		free_page((unsigned long)txs);
+		goto fail;
+	}
+	info->backend_state = BEST_DISCONNECTED;
+
+	SHARED_RING_INIT(txs);
+	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
+	SHARED_RING_INIT(rxs);
+	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+	if (err < 0)
+		goto fail;
+	info->tx_ring_ref = err;
+	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+	if (err < 0)
+		goto fail;
+	info->rx_ring_ref = err;
+
+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	if (err)
+		goto fail;
+
+	/* Bind before connecting: network_connect() notifies via info->irq. */
+	err = bind_evtchn_to_irqhandler(info->evtchn, netif_int,
+					SA_SAMPLE_RANDOM, netdev->name, netdev);
+	if (err < 0)
+		goto fail;
+	info->irq = err;
+	memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
+	network_connect(netdev);
+	(void)send_fake_arp(netdev);
+	show_device(info);
+	return 0;
+
+ fail:
+	netif_free(info);
+	return err;
+}
+
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    XenbusState backend_state)
+{
+	DPRINTK("\n");
+
+	switch (backend_state) {
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+	case XenbusStateUnknown:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateClosing:
+		netfront_closing(dev);
+		break;
+	}
+}
+
+
+/** Send a packet on a net device to encourage switches to learn the
+ * MAC. We send an unsolicited ARP reply (ARPOP_REPLY) to the broadcast IP.
+ *
+ * @param dev device
+ * @return 0 if there is no source IP, -ENOMEM, else dev_queue_xmit()'s result
+ */
+static int send_fake_arp(struct net_device *dev)
+{
+	struct sk_buff *skb;
+	u32             src_ip, dst_ip;
+
+	dst_ip = INADDR_BROADCAST;
+	src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK);
+
+	/* No IP? Then nothing to do. */
+	if (src_ip == 0)
+		return 0;
+
+	skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
+			 dst_ip, dev, src_ip,
+			 /*dst_hw*/ NULL, /*src_hw*/ NULL,
+			 /*target_hw*/ dev->dev_addr);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	return dev_queue_xmit(skb);
+}
+
+
+static int network_open(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+
+	memset(&np->stats, 0, sizeof(np->stats));
+
+	np->user_state = UST_OPEN;
+
+	network_alloc_rx_buffers(dev);
+	np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+
+	netif_start_queue(dev);
+
+	return 0;
+}
+/* Reclaim completed tx slots. Callers in this file hold np->tx_lock. */
+static void network_tx_buf_gc(struct net_device *dev)
+{
+	RING_IDX i, prod;
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+
+	if (np->backend_state != BEST_CONNECTED)
+		return;
+
+	do {
+		prod = np->tx.sring->rsp_prod;
+		rmb(); /* Ensure we see responses up to 'prod'. */
+
+		for (i = np->tx.rsp_cons; i != prod; i++) {
+			id  = RING_GET_RESPONSE(&np->tx, i)->id;
+			skb = np->tx_skbs[id];
+			if (unlikely(gnttab_query_foreign_access(
+				np->grant_tx_ref[id]) != 0)) {
+				printk(KERN_ALERT "network_tx_buf_gc: warning "
+				       "-- grant still in use by backend "
+				       "domain.\n");
+				/* Slots before 'i' are freed: never redo. */
+				np->tx.rsp_cons = i;
+				goto out;
+			}
+			gnttab_end_foreign_access_ref(
+				np->grant_tx_ref[id], GNTMAP_readonly);
+			gnttab_release_grant_reference(
+				&np->gref_tx_head, np->grant_tx_ref[id]);
+			np->grant_tx_ref[id] = GRANT_INVALID_REF;
+			add_id_to_freelist(np->tx_skbs, id);
+			dev_kfree_skb_irq(skb);
+		}
+		np->tx.rsp_cons = prod;
+
+		/*
+		 * Set a new event, then re-check rsp_prod to catch a racing
+		 * update. It is essential to schedule a callback, no matter
+		 * how few buffers are pending: even if there is space in the
+		 * transmit ring, higher layers may be blocked on outstanding
+		 * data, and a Xen notification is likely to be our only kick.
+		 */
+		np->tx.sring->rsp_event =
+			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+		mb();
+	} while (prod != np->tx.sring->rsp_prod);
+
+ out:
+	if (np->tx_full &&
+	    ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
+		np->tx_full = 0;
+		if (np->user_state == UST_OPEN)
+			netif_wake_queue(dev);
+	}
+}
+
+/* Timer callback: 'data' carries the net_device; schedule its rx poll. */
+static void rx_refill_timeout(unsigned long data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	netif_rx_schedule(dev);
+}
+
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb;
+	int i, batch_target;
+	RING_IDX req_prod = np->rx.req_prod_pvt;
+	struct xen_memory_reservation reservation;
+	grant_ref_t ref;
+
+	if (unlikely(np->backend_state != BEST_CONNECTED))
+		return;
+
+	/*
+	 * Allocate skbuffs greedily, even though we batch updates to the
+	 * receive ring. This creates a less bursty demand on the memory
+	 * allocator, so should reduce the chance of failed allocation requests
+	 * both for ourself and for other kernel subsystems.
+	 */
+	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
+	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
+		/*
+		 * Subtract dev_alloc_skb headroom (16 bytes) and shared info
+		 * tailroom then round down to SKB_DATA_ALIGN boundary.
+		 */
+		skb = __dev_alloc_skb(
+			((PAGE_SIZE - sizeof(struct skb_shared_info)) &
+			 (-SKB_DATA_ALIGN(1))) - 16,
+			GFP_ATOMIC|__GFP_NOWARN);
+		if (skb == NULL) {
+			/* Any skbuffs queued for refill? Force them out. */
+			if (i != 0)
+				goto refill;
+			/* Could not allocate any skbuffs. Try again later. */
+			mod_timer(&np->rx_refill_timer,
+				  jiffies + (HZ/10));
+			return;
+		}
+		__skb_queue_tail(&np->rx_batch, skb);
+	}
+
+	/* Is the batch large enough to be worthwhile? */
+	if (i < (np->rx_target/2))
+		return;
+
+	/* Adjust our fill target if we risked running out of buffers. */
+	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
+	    ((np->rx_target *= 2) > np->rx_max_target))
+		np->rx_target = np->rx_max_target;
+
+ refill:
+	for (i = 0; ; i++) {
+		if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
+			break;
+
+		skb->dev = dev;
+
+		id = get_id_from_freelist(np->rx_skbs);
+
+		np->rx_skbs[id] = skb;
+
+		RING_GET_REQUEST(&np->rx, req_prod + i)->id = id;
+		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
+		BUG_ON((signed short)ref < 0);
+		np->grant_rx_ref[id] = ref;
+		gnttab_grant_foreign_transfer_ref(ref,
+						  np->xbdev->otherend_id,
+						  __pa(skb->head) >> PAGE_SHIFT);
+		RING_GET_REQUEST(&np->rx, req_prod + i)->gref = ref;
+		np->rx_pfn_array[i] = virt_to_mfn(skb->head);
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* Remove this page before passing back to Xen. */
+			set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
+					    INVALID_P2M_ENTRY);
+			MULTI_update_va_mapping(np->rx_mcl+i,
+						(unsigned long)skb->head,
+						__pte(0), 0);
+		}
+#endif
+	}
+
+#ifdef CONFIG_XEN_BALLOON
+	/* Tell the ballon driver what is going on. */
+	balloon_update_driver_allowance(i);
+#endif
+
+	reservation.extent_start = np->rx_pfn_array;
+	reservation.nr_extents   = i;
+	reservation.extent_order = 0;
+	reservation.address_bits = 0;
+	reservation.domid        = DOMID_SELF;
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		/* After all PTEs have been zapped, flush the TLB. */
+		np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+			UVMF_TLB_FLUSH|UVMF_ALL;
+
+		/* Give away a batch of pages. */
+		np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+		np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+		np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+		/* Zap PTEs and give away pages in one big multicall. */
+		(void)HYPERVISOR_multicall(np->rx_mcl, i+1);
+
+		/* Check return status of HYPERVISOR_memory_op(). */
+		if (unlikely(np->rx_mcl[i].result != i))
+			panic("Unable to reduce memory reservation\n");
+	} else
+#endif
+		if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+					 &reservation) != i)
+			panic("Unable to reduce memory reservation\n");
+
+	/* Above is a suitable barrier to ensure backend will see requests. */
+	np->rx.req_prod_pvt = req_prod + i;
+	RING_PUSH_REQUESTS(&np->rx);
+}
+
+/* Queue one skb on the shared tx ring; returns 0 even if it is dropped. */
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	unsigned short id;
+	struct netfront_info *np = netdev_priv(dev);
+	struct netif_tx_request *tx;
+	RING_IDX i;
+	grant_ref_t ref;
+	unsigned long mfn;
+	int notify;
+
+	if (unlikely(np->tx_full)) {
+		printk(KERN_ALERT "%s: full queue wasn't stopped!\n",
+		       dev->name);
+		netif_stop_queue(dev);
+		goto drop;
+	}
+
+	if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+		     PAGE_SIZE)) {
+		struct sk_buff *nskb;
+		nskb = __dev_alloc_skb(skb->len, GFP_ATOMIC|__GFP_NOWARN);
+		if (unlikely(nskb == NULL))
+			goto drop;
+		skb_put(nskb, skb->len);
+		memcpy(nskb->data, skb->data, skb->len);
+		nskb->dev = skb->dev;
+		dev_kfree_skb(skb);
+		skb = nskb;
+	}
+
+	spin_lock_irq(&np->tx_lock);
+
+	if (np->backend_state != BEST_CONNECTED) {
+		spin_unlock_irq(&np->tx_lock);
+		goto drop;
+	}
+
+	i = np->tx.req_prod_pvt;
+
+	id = get_id_from_freelist(np->tx_skbs);
+	np->tx_skbs[id] = skb;
+
+	tx = RING_GET_REQUEST(&np->tx, i);
+	tx->id   = id;
+	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+	BUG_ON((signed short)ref < 0);
+	mfn = virt_to_mfn(skb->data);
+	gnttab_grant_foreign_access_ref(
+		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
+	tx->gref = np->grant_tx_ref[id] = ref;
+	tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
+	tx->size = skb->len;
+	tx->flags = (skb->ip_summed == CHECKSUM_HW) ? NETTXF_csum_blank : 0;
+
+	/* Account before pushing: the gc below may already free this skb. */
+	np->stats.tx_bytes += skb->len;
+	np->stats.tx_packets++;
+
+	np->tx.req_prod_pvt = i + 1;
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
+	if (notify)
+		notify_remote_via_irq(np->irq);
+
+	network_tx_buf_gc(dev);
+
+	if (RING_FULL(&np->tx)) {
+		np->tx_full = 1;
+		netif_stop_queue(dev);
+	}
+
+	spin_unlock_irq(&np->tx_lock);
+
+	return 0;
+
+ drop:
+	np->stats.tx_dropped++;
+	dev_kfree_skb(skb);
+	return 0;
+}
+/* Irq handler: reap tx completions, then schedule rx polling if open. */
+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+	struct net_device *dev = dev_id;
+	struct netfront_info *np = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&np->tx_lock, flags);
+	network_tx_buf_gc(dev);
+	spin_unlock_irqrestore(&np->tx_lock, flags);
+
+	if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx) &&
+	    (np->user_state == UST_OPEN))
+		netif_rx_schedule(dev);
+
+	return IRQ_HANDLED;
+}
+
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	struct sk_buff *skb, *nskb;
+	struct netif_rx_response *rx;
+	RING_IDX i, rp;
+	struct mmu_update *mmu = np->rx_mmu;
+	struct multicall_entry *mcl = np->rx_mcl;
+	int work_done, budget, more_to_do = 1;
+	struct sk_buff_head rxq;
+	unsigned long flags;
+	unsigned long mfn;
+	grant_ref_t ref;
+
+	spin_lock(&np->rx_lock);
+
+	if (np->backend_state != BEST_CONNECTED) {
+		spin_unlock(&np->rx_lock);
+		return 0;
+	}
+
+	skb_queue_head_init(&rxq);
+
+	if ((budget = *pbudget) > dev->quota)
+		budget = dev->quota;
+	rp = np->rx.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	for (i = np->rx.rsp_cons, work_done = 0;
+	     (i != rp) && (work_done < budget);
+	     i++, work_done++) {
+		rx = RING_GET_RESPONSE(&np->rx, i);
+
+		/*
+                 * This definitely indicates a bug, either in this driver or
+                 * in the backend driver. In future this should flag the bad
+                 * situation to the system controller to reboot the backed.
+                 */
+		if ((ref = np->grant_rx_ref[rx->id]) == GRANT_INVALID_REF) {
+			WPRINTK("Bad rx response id %d.\n", rx->id);
+			work_done--;
+			continue;
+		}
+
+		/* Memory pressure, insufficient buffer headroom, ... */
+		if ((mfn = gnttab_end_foreign_transfer_ref(ref)) == 0) {
+			if (net_ratelimit())
+				WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
+					rx->id, rx->status);
+			RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id =
+				rx->id;
+			RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref =
+				ref;
+			np->rx.req_prod_pvt++;
+			RING_PUSH_REQUESTS(&np->rx);
+			work_done--;
+			continue;
+		}
+
+		gnttab_release_grant_reference(&np->gref_rx_head, ref);
+		np->grant_rx_ref[rx->id] = GRANT_INVALID_REF;
+
+		skb = np->rx_skbs[rx->id];
+		add_id_to_freelist(np->rx_skbs, rx->id);
+
+		/* NB. We handle skb overflow later. */
+		skb->data = skb->head + rx->offset;
+		skb->len  = rx->status;
+		skb->tail = skb->data + skb->len;
+
+		if (rx->flags & NETRXF_data_validated)
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+		np->stats.rx_packets++;
+		np->stats.rx_bytes += rx->status;
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			/* Remap the page. */
+			MULTI_update_va_mapping(mcl, (unsigned long)skb->head,
+						pfn_pte_ma(mfn, PAGE_KERNEL),
+						0);
+			mcl++;
+			mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+				| MMU_MACHPHYS_UPDATE;
+			mmu->val = __pa(skb->head) >> PAGE_SHIFT;
+			mmu++;
+
+			set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT,
+					    mfn);
+		}
+#endif
+
+		__skb_queue_tail(&rxq, skb);
+	}
+
+#ifdef CONFIG_XEN_BALLOON
+	/* Some pages are no longer absent... */
+	balloon_update_driver_allowance(-work_done);
+#endif
+
+	/* Do all the remapping work, and M2P updates, in one big hypercall. */
+	if (likely((mcl - np->rx_mcl) != 0)) {
+		mcl->op = __HYPERVISOR_mmu_update;
+		mcl->args[0] = (unsigned long)np->rx_mmu;
+		mcl->args[1] = mmu - np->rx_mmu;
+		mcl->args[2] = 0;
+		mcl->args[3] = DOMID_SELF;
+		mcl++;
+		(void)HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
+	}
+
+	while ((skb = __skb_dequeue(&rxq)) != NULL) {
+		if (skb->len > (dev->mtu + ETH_HLEN + 4)) {
+			if (net_ratelimit())
+				printk(KERN_INFO "Received packet too big for "
+				       "MTU (%d > %d)\n",
+				       skb->len - ETH_HLEN - 4, dev->mtu);
+			skb->len  = 0;
+			skb->tail = skb->data;
+			init_skb_shinfo(skb);
+			dev_kfree_skb(skb);
+			continue;
+		}
+
+		/*
+		 * Enough room in skbuff for the data we were passed? Also,
+		 * Linux expects at least 16 bytes headroom in each rx buffer.
+		 */
+		if (unlikely(skb->tail > skb->end) ||
+		    unlikely((skb->data - skb->head) < 16)) {
+			if (net_ratelimit()) {
+				if (skb->tail > skb->end)
+					printk(KERN_INFO "Received packet "
+					       "is %zd bytes beyond tail.\n",
+					       skb->tail - skb->end);
+				else
+					printk(KERN_INFO "Received packet "
+					       "is %zd bytes before head.\n",
+					       16 - (skb->data - skb->head));
+			}
+
+			nskb = __dev_alloc_skb(skb->len + 2,
+					       GFP_ATOMIC|__GFP_NOWARN);
+			if (nskb != NULL) {
+				skb_reserve(nskb, 2);
+				skb_put(nskb, skb->len);
+				memcpy(nskb->data, skb->data, skb->len);
+				nskb->dev = skb->dev;
+				nskb->ip_summed = skb->ip_summed;
+			}
+
+			/* Reinitialise and then destroy the old skbuff. */
+			skb->len  = 0;
+			skb->tail = skb->data;
+			init_skb_shinfo(skb);
+			dev_kfree_skb(skb);
+
+			/* Switch old for new, if we copied the buffer. */
+			if ((skb = nskb) == NULL)
+				continue;
+		}
+
+		/* Set the shinfo area, which is hidden behind the data. */
+		init_skb_shinfo(skb);
+		/* Ethernet work: Delayed to here as it peeks the header. */
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* Pass it up. */
+		netif_receive_skb(skb);
+		dev->last_rx = jiffies;
+	}
+
+	np->rx.rsp_cons = i;
+
+	/* If we get a callback with very few responses, reduce fill target. */
+	/* NB. Note exponential increase, linear decrease. */
+	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
+	     ((3*np->rx_target) / 4)) &&
+	    (--np->rx_target < np->rx_min_target))
+		np->rx_target = np->rx_min_target;
+
+	network_alloc_rx_buffers(dev);
+
+	*pbudget   -= work_done;
+	dev->quota -= work_done;
+
+	if (work_done < budget) {
+		local_irq_save(flags);
+
+		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
+		if (!more_to_do)
+			__netif_rx_complete(dev);
+
+		local_irq_restore(flags);
+	}
+
+	spin_unlock(&np->rx_lock);
+
+	return more_to_do;
+}
+
+/* Mark the interface closed and stop its transmit queue; returns 0. */
+static int network_close(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	np->user_state = UST_CLOSED;
+	netif_stop_queue(np->netdev);
+	return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+	struct netfront_info *np = netdev_priv(dev);
+	return &np->stats;
+}
+
+static void network_connect(struct net_device *dev)
+{
+	struct netfront_info *np;
+	int i, requeue_idx;
+	struct netif_tx_request *tx;
+	struct sk_buff *skb;
+
+	np = netdev_priv(dev);
+	spin_lock_irq(&np->tx_lock);	/* both ring locks stay held until the end */
+	spin_lock(&np->rx_lock);
+
+	/*
+	 * Recovery procedure: re-populate both rings from the skbs still
+	 * recorded in tx_skbs[] and rx_skbs[], re-issue the grants for
+	 * them, then mark the backend connected and notify it.
+	 */
+
+	/* Step 1: Reinitialise variables. */
+	np->tx_full = 0;
+
+	/*
+	 * Step 2: Rebuild the RX and TX ring contents.
+	 * NB. We could just free the queued TX packets now but we hope
+	 * that sending them out might do some good.  We have to rebuild
+	 * the RX ring because some of our pages are currently flipped out
+	 * so we can't just free the RX skbs.
+	 * NB2. Freelist index entries are always going to be less than
+	 * __PAGE_OFFSET, whereas pointers to skbs will always be equal or
+	 * greater than __PAGE_OFFSET: we use this property to distinguish
+	 * them.
+	 */
+
+	/*
+	 * Rebuild the TX buffer freelist and the TX ring itself.
+	 * NB. This reorders packets.  We could keep more private state
+	 * to avoid this but maybe it doesn't matter so much given the
+	 * interface has been down.
+	 */
+	for (requeue_idx = 0, i = 1; i <= NET_TX_RING_SIZE; i++) {
+		if ((unsigned long)np->tx_skbs[i] < __PAGE_OFFSET)
+			continue;	/* freelist link, not an skb pointer */
+
+		skb = np->tx_skbs[i];
+
+		tx = RING_GET_REQUEST(&np->tx, requeue_idx);
+		requeue_idx++;
+
+		tx->id = i;	/* the tx_skbs[] slot index doubles as request id */
+		gnttab_grant_foreign_access_ref(
+			np->grant_tx_ref[i], np->xbdev->otherend_id,
+			virt_to_mfn(np->tx_skbs[i]->data),
+			GNTMAP_readonly);
+		tx->gref = np->grant_tx_ref[i];
+		tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
+		tx->size = skb->len;
+		tx->flags = (skb->ip_summed == CHECKSUM_HW) ?
+			NETTXF_csum_blank : 0;
+
+		np->stats.tx_bytes += skb->len;	/* requeued packets are counted here */
+		np->stats.tx_packets++;
+	}
+
+	np->tx.req_prod_pvt = requeue_idx;	/* number of TX requests rebuilt */
+	RING_PUSH_REQUESTS(&np->tx);
+
+	/* Rebuild the RX buffer freelist and the RX ring itself. */
+	for (requeue_idx = 0, i = 1; i <= NET_RX_RING_SIZE; i++) {
+		if ((unsigned long)np->rx_skbs[i] < __PAGE_OFFSET)
+			continue;	/* freelist link, not an skb pointer */
+		gnttab_grant_foreign_transfer_ref(
+			np->grant_rx_ref[i], np->xbdev->otherend_id,
+			__pa(np->rx_skbs[i]->data) >> PAGE_SHIFT);
+		RING_GET_REQUEST(&np->rx, requeue_idx)->gref =
+			np->grant_rx_ref[i];
+		RING_GET_REQUEST(&np->rx, requeue_idx)->id = i;
+		requeue_idx++;
+	}
+
+	np->rx.req_prod_pvt = requeue_idx;	/* number of RX requests rebuilt */
+	RING_PUSH_REQUESTS(&np->rx);
+
+	/*
+	 * Step 3: All public and private state should now be sane.  Get
+	 * ready to start sending and receiving packets and give the driver
+	 * domain a kick because we've probably just requeued some
+	 * packets.
+	 */
+	np->backend_state = BEST_CONNECTED;
+	notify_remote_via_irq(np->irq);
+	network_tx_buf_gc(dev);
+
+	if (np->user_state == UST_OPEN)	/* only restart TX if the interface is open */
+		netif_start_queue(dev);
+
+	spin_unlock(&np->rx_lock);
+	spin_unlock_irq(&np->tx_lock);
+}
+
+/*
+ * Debug helper: log a one-line summary of a netfront instance (or a
+ * placeholder when np is NULL).  The body is compiled out unless DEBUG is
+ * defined.
+ */
+static void show_device(struct netfront_info *np)
+{
+#ifdef DEBUG
+	if (np) {
+		/*
+		 * np->tx and np->rx are ring structures, not pointers, so
+		 * the %p conversions are given the shared-ring pointers
+		 * they contain.
+		 */
+		IPRINTK("<vif handle=%u %s(%s) evtchn=%u tx=%p rx=%p>\n",
+			np->handle,
+			be_state_name[np->backend_state],
+			np->user_state ? "open" : "closed",
+			np->evtchn,
+			np->tx.sring,
+			np->rx.sring);
+	} else
+		IPRINTK("<vif NULL>\n");
+#endif
+}
+
+/*
+ * net_device uninit hook: give back the TX and RX grant reference pools
+ * recorded in the device's private data.
+ */
+static void netif_uninit(struct net_device *dev)
+{
+	struct netfront_info *info = netdev_priv(dev);
+
+	gnttab_free_grant_references(info->gref_tx_head);
+	gnttab_free_grant_references(info->gref_rx_head);
+}
+
+/* ethtool hooks: only the TX checksum get/set operations are provided. */
+static struct ethtool_ops network_ethtool_ops = {
+	.get_tx_csum	= ethtool_op_get_tx_csum,
+	.set_tx_csum	= ethtool_op_set_tx_csum,
+};
+
+/**
+ * Create a network device.
+ * @param handle device handle
+ * @param dev    xenbus device the new interface belongs to
+ * @param val    return parameter for created device (may be NULL)
+ * @return 0 on success, error code otherwise
+ *
+ * On failure everything acquired so far (grant reference pools, device
+ * registration and the net_device itself) is released again.
+ */
+static int create_netdev(int handle, struct xenbus_device *dev,
+			 struct net_device **val)
+{
+	int i, err = 0;
+	struct net_device *netdev = NULL;
+	struct netfront_info *np = NULL;
+
+	if ((netdev = alloc_etherdev(sizeof(struct netfront_info))) == NULL) {
+		printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
+		       __FUNCTION__);
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	np                = netdev_priv(netdev);
+	np->backend_state = BEST_CLOSED;
+	np->user_state    = UST_CLOSED;
+	np->handle        = handle;
+	np->xbdev         = dev;
+
+	spin_lock_init(&np->tx_lock);
+	spin_lock_init(&np->rx_lock);
+
+	skb_queue_head_init(&np->rx_batch);
+	np->rx_target     = RX_DFL_MIN_TARGET;
+	np->rx_min_target = RX_DFL_MIN_TARGET;
+	np->rx_max_target = RX_MAX_TARGET;
+
+	init_timer(&np->rx_refill_timer);
+	np->rx_refill_timer.data = (unsigned long)netdev;
+	np->rx_refill_timer.function = rx_refill_timeout;
+
+	/* Initialise {tx,rx}_skbs as a free chain containing every entry. */
+	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+		np->tx_skbs[i] = (void *)((unsigned long) i+1);
+		np->grant_tx_ref[i] = GRANT_INVALID_REF;
+	}
+
+	for (i = 0; i <= NET_RX_RING_SIZE; i++) {
+		np->rx_skbs[i] = (void *)((unsigned long) i+1);
+		np->grant_rx_ref[i] = GRANT_INVALID_REF;
+	}
+
+	/* A grant for every tx ring slot */
+	if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
+					  &np->gref_tx_head) < 0) {
+		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
+		err = -ENOMEM;
+		goto exit;
+	}
+	/* A grant for every rx ring slot */
+	if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
+					  &np->gref_rx_head) < 0) {
+		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
+		gnttab_free_grant_references(np->gref_tx_head);
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	netdev->open            = network_open;
+	netdev->hard_start_xmit = network_start_xmit;
+	netdev->stop            = network_close;
+	netdev->get_stats       = network_get_stats;
+	netdev->poll            = netif_poll;
+	netdev->uninit          = netif_uninit;
+	netdev->weight          = 64;
+	netdev->features        = NETIF_F_IP_CSUM;
+
+	SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
+	SET_MODULE_OWNER(netdev);
+	SET_NETDEV_DEV(netdev, &dev->dev);
+
+	if ((err = register_netdev(netdev)) != 0) {
+		printk(KERN_WARNING "%s> register_netdev err=%d\n",
+		       __FUNCTION__, err);
+		goto exit_free_grefs;
+	}
+
+	if ((err = xennet_proc_addif(netdev)) != 0) {
+		/*
+		 * unregister_netdev() runs the ->uninit hook (netif_uninit),
+		 * which already releases both grant reference pools.  Jump
+		 * straight to exit so they are not freed a second time.
+		 */
+		unregister_netdev(netdev);
+		goto exit;
+	}
+
+	np->netdev = netdev;
+
+ exit:
+	if (err != 0) {
+		if (netdev)
+			free_netdev(netdev);
+	} else if (val != NULL)
+		*val = netdev;
+	return err;
+
+ exit_free_grefs:
+	/* Reached only before the device was successfully registered. */
+	gnttab_free_grant_references(np->gref_tx_head);
+	gnttab_free_grant_references(np->gref_rx_head);
+	goto exit;
+}
+
+/*
+ * inetaddr notifier callback.  When an IP interface is brought up on one
+ * of our VIFs, send out a fake ARP reply to reset switches and router ARP
+ * caches.
+ */
+static int
+inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr *addr = ptr;
+	struct net_device *netdev = addr->ifa_dev->dev;
+
+	/* Ignore anything but UP events on devices driven by this driver. */
+	if (event != NETDEV_UP || netdev->open != network_open)
+		return NOTIFY_DONE;
+
+	(void)send_fake_arp(netdev);
+	return NOTIFY_DONE;
+}
+
+
+/* ** Close down ** */
+
+
+/**
+ * Handle the backend's change of state to Closing.  Our device-layer
+ * structures must be deleted now, to ensure that writes are flushed
+ * through to the backend.  Once this is done, we switch to Closed in
+ * acknowledgement.
+ */
+static void netfront_closing(struct xenbus_device *dev)
+{
+	struct netfront_info *np = dev->data;
+
+	DPRINTK("netfront_closing: %s removed\n", dev->nodename);
+
+	close_netdev(np);
+	xenbus_switch_state(dev, XBT_NULL, XenbusStateClosed);
+}
+
+
+/*
+ * xenbus remove callback: disconnect from the backend and free the
+ * net_device.  Always returns 0.
+ */
+static int netfront_remove(struct xenbus_device *dev)
+{
+	struct netfront_info *np = dev->data;
+
+	DPRINTK("%s\n", dev->nodename);
+
+	netif_disconnect_backend(np);
+	free_netdev(np->netdev);
+
+	return 0;
+}
+
+
+/*
+ * Take the network device out of service: stop the TX queue under the
+ * device's xmit_lock, remove the /proc entries (when that interface is
+ * compiled in), cancel the RX refill timer and unregister the net_device.
+ */
+static void close_netdev(struct netfront_info *info)
+{
+	struct net_device *netdev = info->netdev;
+
+	spin_lock_irq(&netdev->xmit_lock);
+	netif_stop_queue(netdev);
+	spin_unlock_irq(&netdev->xmit_lock);
+
+#if defined(CONFIG_PROC_FS) && defined(XEN_XENNET_PROC_INTERFACE)
+	xennet_proc_delif(netdev);
+#endif
+
+	del_timer_sync(&info->rx_refill_timer);
+
+	unregister_netdev(netdev);
+}
+
+
+/*
+ * Detach from the backend: mark the interface disconnected, release the
+ * irq binding and end the grants covering the TX and RX shared rings.
+ */
+static void netif_disconnect_backend(struct netfront_info *info)
+{
+	/* Stop old i/f to prevent errors whilst we rebuild the state. */
+	spin_lock_irq(&info->tx_lock);
+	spin_lock(&info->rx_lock);
+	info->backend_state = BEST_DISCONNECTED;
+	spin_unlock(&info->rx_lock);
+	spin_unlock_irq(&info->tx_lock);
+
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info->netdev);
+	info->irq = 0;
+	info->evtchn = 0;
+
+	/* TX ring: end the grant, then forget the reference and the page. */
+	end_access(info->tx_ring_ref, info->tx.sring);
+	info->tx_ring_ref = GRANT_INVALID_REF;
+	info->tx.sring = NULL;
+
+	/* RX ring: likewise. */
+	end_access(info->rx_ring_ref, info->rx.sring);
+	info->rx_ring_ref = GRANT_INVALID_REF;
+	info->rx.sring = NULL;
+}
+
+
+/*
+ * Full teardown of one interface: close the net_device, disconnect from
+ * the backend, then free the net_device.
+ */
+static void netif_free(struct netfront_info *info)
+{
+	struct net_device *netdev = info->netdev;
+
+	close_netdev(info);
+	netif_disconnect_backend(info);
+	free_netdev(netdev);
+}
+
+
+/*
+ * End foreign access to a granted page, unless the reference was never
+ * set up (GRANT_INVALID_REF).
+ */
+static void end_access(int ref, void *page)
+{
+	if (ref == GRANT_INVALID_REF)
+		return;
+
+	gnttab_end_foreign_access(ref, 0, (unsigned long)page);
+}
+
+
+/* ** Driver registration ** */
+
+
+static struct xenbus_device_id netfront_ids[] = {
+	{ "vif" },	/* the only device type listed for this driver */
+	{ "" }		/* presumably the list terminator -- TODO confirm */
+};
+
+
+static struct xenbus_driver netfront = {	/* xenbus glue for the frontend */
+	.name = "vif",
+	.owner = THIS_MODULE,
+	.ids = netfront_ids,
+	.probe = netfront_probe,
+	.remove = netfront_remove,
+	.resume = netfront_resume,
+	.otherend_changed = backend_changed,
+};
+
+
+static struct notifier_block notifier_inetdev = {	/* registered in netif_init() */
+	.notifier_call  = inetdev_notify,
+};
+
+/*
+ * Module init.  Nothing is registered in the initial domain.  Otherwise
+ * the /proc directory is created, the inetaddr notifier is installed and
+ * the frontend driver is registered with xenbus.
+ *
+ * Returns 0 on success or the error from xennet_proc_init() or
+ * xenbus_register_frontend().
+ */
+static int __init netif_init(void)
+{
+	int err = 0;
+
+	if (xen_start_info->flags & SIF_INITDOMAIN)
+		return 0;
+
+	if ((err = xennet_proc_init()) != 0)
+		return err;
+
+	IPRINTK("Initialising virtual ethernet driver.\n");
+
+	(void)register_inetaddr_notifier(&notifier_inetdev);
+
+	err = xenbus_register_frontend(&netfront);
+	if (err != 0) {
+		/* Do not leave the notifier pointing into a failed module. */
+		unregister_inetaddr_notifier(&notifier_inetdev);
+	}
+
+	return err;
+}
+module_init(netif_init);
+
+
+/*
+ * Module exit: undo the registrations made by netif_init().
+ */
+static void netif_exit(void)
+{
+	/* netif_init() registers nothing when running in the initial domain. */
+	if (xen_start_info->flags & SIF_INITDOMAIN)
+		return;
+
+	unregister_inetaddr_notifier(&notifier_inetdev);
+
+	/* A void function must not return an expression. */
+	xenbus_unregister_driver(&netfront);
+}
+module_exit(netif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+
+/* ** /proc **/
+
+
+#if defined(CONFIG_PROC_FS) && defined(XEN_XENNET_PROC_INTERFACE)
+
+#define TARGET_MIN 0UL
+#define TARGET_MAX 1UL
+#define TARGET_CUR 2UL
+
+/*
+ * /proc read handler shared by the rxbuf_min, rxbuf_max and rxbuf_cur
+ * entries.  The data cookie holds the net_device pointer with the
+ * TARGET_* selector packed into its two low bits.
+ *
+ * Writes the selected target as a decimal line into page, sets *eof and
+ * returns the number of characters written.
+ */
+static int xennet_proc_read(
+	char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+	struct net_device *netdev =
+		(struct net_device *)((unsigned long)data & ~3UL);
+	struct netfront_info *info = netdev_priv(netdev);
+	int selector = (long)data & 3;
+	int written = 0;
+
+	switch (selector) {
+	case TARGET_MIN:
+		written = sprintf(page, "%d\n", info->rx_min_target);
+		break;
+	case TARGET_MAX:
+		written = sprintf(page, "%d\n", info->rx_max_target);
+		break;
+	case TARGET_CUR:
+		written = sprintf(page, "%d\n", info->rx_target);
+		break;
+	}
+
+	*eof = 1;
+	return written;
+}
+
+/*
+ * /proc write handler for the rxbuf_min and rxbuf_max entries.  The data
+ * cookie holds the net_device pointer with the TARGET_* selector packed
+ * into its two low bits.
+ *
+ * The written text is parsed as a decimal number, clamped to
+ * [RX_MIN_TARGET, RX_MAX_TARGET] and stored as the new minimum or maximum,
+ * dragging the other limit and the current target along so that
+ * min <= cur <= max still holds.  Writes aimed at TARGET_CUR change
+ * nothing.
+ *
+ * Returns count on success, -EPERM without CAP_SYS_ADMIN, -EBADMSG for
+ * writes of at most one byte, -EFBIG for writes larger than the local
+ * buffer and -EFAULT if the user buffer cannot be read.
+ */
+static int xennet_proc_write(
+	struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	struct net_device *dev =
+		(struct net_device *)((unsigned long)data & ~3UL);
+	struct netfront_info *np = netdev_priv(dev);
+	int which_target = (long)data & 3;
+	char string[64];
+	long target;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (count <= 1)
+		return -EBADMSG; /* runt */
+	if (count > sizeof(string))
+		return -EFBIG;   /* too long */
+
+	if (copy_from_user(string, buffer, count))
+		return -EFAULT;
+	/*
+	 * Terminate directly after the copied data so that the parser never
+	 * reads uninitialised stack bytes.  A write that fills the buffer
+	 * completely still loses its final byte, as before.
+	 */
+	string[count < sizeof(string) ? count : sizeof(string) - 1] = '\0';
+
+	target = simple_strtol(string, NULL, 10);
+	if (target < RX_MIN_TARGET)
+		target = RX_MIN_TARGET;
+	if (target > RX_MAX_TARGET)
+		target = RX_MAX_TARGET;
+
+	/*
+	 * NOTE(review): rx_lock is taken here without disabling bottom
+	 * halves or interrupts -- confirm no softirq/irq path takes it.
+	 */
+	spin_lock(&np->rx_lock);
+
+	switch (which_target) {
+	case TARGET_MIN:
+		if (target > np->rx_max_target)
+			np->rx_max_target = target;
+		np->rx_min_target = target;
+		if (target > np->rx_target)
+			np->rx_target = target;
+		break;
+	case TARGET_MAX:
+		if (target < np->rx_min_target)
+			np->rx_min_target = target;
+		np->rx_max_target = target;
+		if (target < np->rx_target)
+			np->rx_target = target;
+		break;
+	case TARGET_CUR:
+		break;
+	}
+
+	/* Let the RX path refill according to the new limits. */
+	network_alloc_rx_buffers(dev);
+
+	spin_unlock(&np->rx_lock);
+
+	return count;
+}
+
+/* Create the /proc "xen/net" directory; returns 0 or -ENOMEM. */
+static int xennet_proc_init(void)
+{
+	return (proc_mkdir("xen/net", NULL) != NULL) ? 0 : -ENOMEM;
+}
+
+/*
+ * Create the per-interface /proc directory "xen/net/<ifname>" with its
+ * rxbuf_min, rxbuf_max and rxbuf_cur entries.  Each entry's data cookie is
+ * the net_device pointer with a TARGET_* selector in its two low bits.
+ *
+ * Returns 0 on success.  On any failure the entries created so far are
+ * removed again and -ENOMEM is returned.
+ */
+static int xennet_proc_addif(struct net_device *dev)
+{
+	struct proc_dir_entry *dir, *min, *max, *cur;
+	char name[30];
+
+	sprintf(name, "xen/net/%s", dev->name);
+
+	dir = proc_mkdir(name, NULL);
+	if (!dir)
+		goto nomem;
+	dir->owner = THIS_MODULE;
+
+	min = create_proc_entry("rxbuf_min", 0644, dir);
+	max = create_proc_entry("rxbuf_max", 0644, dir);
+	cur = create_proc_entry("rxbuf_cur", 0444, dir);
+	if (!min || !max || !cur)
+		goto nomem;
+
+	min->read_proc  = xennet_proc_read;
+	min->write_proc = xennet_proc_write;
+	min->data       = (void *)((unsigned long)dev | TARGET_MIN);
+	min->owner = THIS_MODULE;
+
+	max->read_proc  = xennet_proc_read;
+	max->write_proc = xennet_proc_write;
+	max->data       = (void *)((unsigned long)dev | TARGET_MAX);
+	max->owner = THIS_MODULE;
+
+	cur->read_proc  = xennet_proc_read;
+	cur->write_proc = xennet_proc_write;
+	cur->data       = (void *)((unsigned long)dev | TARGET_CUR);
+	/* Was "max->owner": the rxbuf_cur entry never got its owner set. */
+	cur->owner = THIS_MODULE;
+
+	return 0;
+
+ nomem:
+	xennet_proc_delif(dev);
+	return -ENOMEM;
+}
+
+/*
+ * Remove the per-interface /proc entries (rxbuf_min, rxbuf_max and
+ * rxbuf_cur) and then the "xen/net/<ifname>" directory itself.
+ */
+static void xennet_proc_delif(struct net_device *dev)
+{
+	/*
+	 * Sized for the longest path built below with a full-length
+	 * interface name.  The former 30-byte buffer overflowed for names
+	 * longer than 11 characters.
+	 */
+	char name[sizeof("xen/net//rxbuf_min") + IFNAMSIZ];
+
+	snprintf(name, sizeof(name), "xen/net/%s/rxbuf_min", dev->name);
+	remove_proc_entry(name, NULL);
+
+	snprintf(name, sizeof(name), "xen/net/%s/rxbuf_max", dev->name);
+	remove_proc_entry(name, NULL);
+
+	snprintf(name, sizeof(name), "xen/net/%s/rxbuf_cur", dev->name);
+	remove_proc_entry(name, NULL);
+
+	snprintf(name, sizeof(name), "xen/net/%s", dev->name);
+	remove_proc_entry(name, NULL);
+}
+
+#endif
--- /dev/null
+++ linus-2.6/include/xen/net_driver_util.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ *
+ * Utility functions for Xen network devices.
+ *
+ * Copyright (c) 2005 XenSource Ltd.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_XEN_NET_DRIVER_UTIL_H
+#define _ASM_XEN_NET_DRIVER_UTIL_H
+
+
+#include <xen/xenbus.h>
+
+
+/**
+ * Read the 'mac' node at the given device's node in the store, and parse that
+ * as colon-separated octets, storing the result in the mac array.  mac must be
+ * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h).
+ * Return 0 on success, or -errno on error.
+ */
+int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
+
+
+#endif /* _ASM_XEN_NET_DRIVER_UTIL_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 35/35] Add Xen virtual block device driver.
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (33 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
@ 2006-05-09  7:00 ` Chris Wright
  2006-05-09 12:01   ` Christoph Hellwig
  2006-05-09 14:49 ` [RFC PATCH 00/35] Xen i386 paravirtualization support Martin J. Bligh
  35 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09  7:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel, Ian Pratt, Christian Limpach

[-- Attachment #1: blkfront --]
[-- Type: text/plain, Size: 34125 bytes --]

The block device frontend driver allows the kernel to access block
devices exported by a virtual machine containing a physical block
device driver.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 drivers/block/Kconfig           |    2 
 drivers/xen/Kconfig.blk         |   14 
 drivers/xen/Makefile            |    1 
 drivers/xen/blkfront/Makefile   |    5 
 drivers/xen/blkfront/blkfront.c |  813 ++++++++++++++++++++++++++++++++++++++++
 drivers/xen/blkfront/block.h    |  156 +++++++
 drivers/xen/blkfront/vbd.c      |  216 ++++++++++
 7 files changed, 1207 insertions(+)

--- linus-2.6.orig/drivers/block/Kconfig
+++ linus-2.6/drivers/block/Kconfig
@@ -449,6 +449,8 @@ config CDROM_PKTCDVD_WCACHE
 
 source "drivers/s390/block/Kconfig"
 
+source "drivers/xen/Kconfig.blk"
+
 config ATA_OVER_ETH
 	tristate "ATA over Ethernet support"
 	depends on NET
--- linus-2.6.orig/drivers/xen/Makefile
+++ linus-2.6/drivers/xen/Makefile
@@ -6,5 +6,6 @@ obj-y	+= core/
 obj-y	+= console/
 obj-y	+= xenbus/
 
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= blkfront/
 obj-$(CONFIG_XEN_NETDEV_FRONTEND)	+= netfront/
 
--- /dev/null
+++ linus-2.6/drivers/xen/Kconfig.blk
@@ -0,0 +1,14 @@
+menu "Xen block device drivers"
+        depends on XEN
+
+config XEN_BLKDEV_FRONTEND
+	tristate "Block device frontend driver"
+	depends on XEN
+	default y
+	help
+	  The block device frontend driver allows the kernel to access block
+	  devices exported from a device driver virtual machine. Unless you
+	  are building a dedicated device driver virtual machine, you almost
+	  certainly want to say Y here.
+
+endmenu
--- /dev/null
+++ linus-2.6/drivers/xen/blkfront/Makefile
@@ -0,0 +1,5 @@
+
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	:= xenblk.o
+
+xenblk-objs := blkfront.o vbd.o
+
--- /dev/null
+++ linus-2.6/drivers/xen/blkfront/blkfront.c
@@ -0,0 +1,813 @@
+/******************************************************************************
+ * blkfront.c
+ * 
+ * XenLinux virtual block device driver.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include "block.h"
+#include <linux/cdrom.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <scsi/scsi.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+
+#define BLKIF_STATE_DISCONNECTED 0
+#define BLKIF_STATE_CONNECTED    1
+#define BLKIF_STATE_SUSPENDED    2
+
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+#define GRANT_INVALID_REF	0
+
+static void connect(struct blkfront_info *);
+static void blkfront_closing(struct xenbus_device *);
+static int blkfront_remove(struct xenbus_device *);
+static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
+static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
+
+static void kick_pending_request_queues(struct blkfront_info *);
+
+static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
+static void blkif_restart_queue(void *arg);
+static void blkif_recover(struct blkfront_info *);
+static void blkif_completion(struct blk_shadow *);
+static void blkif_free(struct blkfront_info *, int);
+
+
+/**
+ * Entry point to this code when a new device is created.  Allocates the
+ * per-device structure and, via talk_to_backend(), the ring buffer used to
+ * communicate with the backend, publishes the details the backend needs
+ * and switches to Initialised state.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	struct blkfront_info *info;
+	int vdevice, slot, err;
+
+	/* FIXME: Use dynamic device id if this is not set. */
+	err = xenbus_scanf(XBT_NULL, dev->nodename,
+			   "virtual-device", "%i", &vdevice);
+	if (err != 1) {
+		xenbus_dev_fatal(dev, err, "reading virtual-device");
+		return err;
+	}
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+		return -ENOMEM;
+	}
+
+	info->xbdev = dev;
+	info->vdevice = vdevice;
+	info->connected = BLKIF_STATE_DISCONNECTED;
+	INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
+
+	/*
+	 * Thread every shadow entry onto the free list: each entry's id
+	 * names the next free slot, and the last one gets an end marker.
+	 */
+	for (slot = 0; slot < BLK_RING_SIZE; slot++)
+		info->shadow[slot].req.id = slot + 1;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Front end dir is a number, which is used as the id. */
+	info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
+	dev->data = info;
+
+	err = talk_to_backend(dev, info);
+	if (!err)
+		return 0;
+
+	kfree(info);
+	dev->data = NULL;
+	return err;
+}
+
+
+/**
+ * Reconnect to the backend after a suspend/resume or a backend driver
+ * restart.  The blkif structure is torn down and recreated, while the
+ * device-layer structures are left intact so that this is transparent to
+ * the rest of the kernel.
+ *
+ * Returns 0 on success or the error from talk_to_backend().
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->data;
+	int rc;
+
+	DPRINTK("blkfront_resume: %s\n", dev->nodename);
+
+	blkif_free(info, 1);
+
+	rc = talk_to_backend(dev, info);
+	if (rc)
+		return rc;
+
+	blkif_recover(info);
+	return rc;
+}
+
+
+/*
+ * Common code used when first setting up, and when resuming.
+ *
+ * Sets up the shared ring and event channel, then, inside one xenbus
+ * transaction, writes "ring-ref" and "event-channel" under the device's
+ * node and switches the device to Initialised.  The transaction is
+ * retried from the start when ending it reports -EAGAIN.
+ *
+ * Returns 0 on success.  On failure after the ring was set up, the ring
+ * state is released with blkif_free(info, 0) and the error is returned.
+ */
+static int talk_to_backend(struct xenbus_device *dev,
+			   struct blkfront_info *info)
+{
+	const char *message = NULL;
+	xenbus_transaction_t xbt;
+	int err;
+
+	/* Create shared ring, alloc event channel. */
+	err = setup_blkring(dev, info);
+	if (err)
+		goto out;	/* setup_blkring() cleans up after itself */
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_blkring;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename,
+			    "ring-ref","%u", info->ring_ref);
+	if (err) {
+		message = "writing ring-ref";
+		goto abort_transaction;
+	}
+	err = xenbus_printf(xbt, dev->nodename,
+			    "event-channel", "%u", info->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	err = xenbus_switch_state(dev, xbt, XenbusStateInitialised);
+	if (err)
+		goto abort_transaction;	/* message stays NULL: nothing extra is logged here */
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;	/* redo the whole transaction */
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_blkring;
+	}
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);	/* second argument 1: abort */
+	if (message)
+		xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+	blkif_free(info, 0);
+ out:
+	return err;
+}
+
+/*
+ * Allocate and initialise the shared ring page, pass it to
+ * xenbus_grant_ring(), allocate an event channel and bind that channel to
+ * blkif_int().
+ *
+ * Returns 0 on success.  Returns -ENOMEM if the ring page cannot be
+ * allocated.  Any later failure goes through blkif_free(info, 0) and
+ * returns the failing call's result.
+ */
+static int setup_blkring(struct xenbus_device *dev,
+			 struct blkfront_info *info)
+{
+	struct blkif_sring *sring;
+	int err;
+
+	info->ring_ref = GRANT_INVALID_REF;	/* nothing granted yet */
+
+	sring = (struct blkif_sring *)__get_free_page(GFP_KERNEL);
+	if (!sring) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+		return -ENOMEM;
+	}
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+	if (err < 0) {
+		free_page((unsigned long)sring);	/* not granted, so free it directly */
+		info->ring.sring = NULL;
+		goto fail;
+	}
+	info->ring_ref = err;	/* a non-negative result is the grant reference */
+
+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	if (err)
+		goto fail;
+
+	err = bind_evtchn_to_irqhandler(
+		info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
+	if (err <= 0) {	/* NOTE(review): err == 0 would reach "fail" yet return 0 -- confirm 0 is impossible */
+		xenbus_dev_fatal(dev, err,
+				 "bind_evtchn_to_irqhandler failed");
+		goto fail;
+	}
+	info->irq = err;	/* a positive result is the irq number */
+
+	return 0;
+fail:
+	blkif_free(info, 0);
+	return err;
+}
+
+
+/**
+ * Callback received when the backend's state changes.
+ *
+ * Connected triggers connect().  Closing tears the device down through
+ * blkfront_closing() unless it still has users, in which case the request
+ * is refused with -EBUSY.  All other states are ignored.
+ */
+static void backend_changed(struct xenbus_device *dev,
+			    XenbusState backend_state)
+{
+	struct blkfront_info *info = dev->data;
+	struct block_device *bd;
+
+	DPRINTK("blkfront:backend_changed.\n");
+
+	switch (backend_state) {
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+	case XenbusStateClosed:
+		break;
+
+	case XenbusStateConnected:
+		connect(info);
+		break;
+
+	case XenbusStateClosing:
+		bd = bdget(info->dev);
+		if (bd == NULL) {
+			/*
+			 * Without a block_device there is no bd_mutex to
+			 * take and nothing to put.  Stop here instead of
+			 * dereferencing the NULL pointer below.
+			 */
+			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+			break;
+		}
+
+		/* bd_mutex is held while info->users is examined. */
+		mutex_lock(&bd->bd_mutex);
+		if (info->users > 0)
+			xenbus_dev_error(dev, -EBUSY,
+					 "Device in use; refusing to close");
+		else
+			blkfront_closing(dev);
+		mutex_unlock(&bd->bd_mutex);
+		bdput(bd);
+		break;
+	}
+}
+
+
+/* ** Connection ** */
+
+
+/*
+ * Invoked when the backend is finally 'ready' (and has produced
+ * the details about the physical device - #sectors, size, etc).
+ *
+ * Reads "sectors", "info" and "sector-size" from the backend's node,
+ * creates the virtual block device with xlvbd_add(), switches this end to
+ * Connected, starts the request queue and finally calls add_disk().
+ * Does nothing if the device is already connected or suspended.
+ */
+static void connect(struct blkfront_info *info)
+{
+	unsigned long sectors, sector_size;
+	unsigned int binfo;
+	int err;
+
+	if ((info->connected == BLKIF_STATE_CONNECTED) ||
+	    (info->connected == BLKIF_STATE_SUSPENDED) )
+		return;
+
+	DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
+
+	err = xenbus_gather(XBT_NULL, info->xbdev->otherend,
+			    "sectors", "%lu", &sectors,
+			    "info", "%u", &binfo,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err,
+				 "reading backend fields at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+		                 info->xbdev->otherend);
+		return;
+	}
+
+	(void)xenbus_switch_state(info->xbdev, XBT_NULL, XenbusStateConnected);	/* result deliberately ignored */
+
+	/* Kick pending requests. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;	/* set under blkif_io_lock, before the queue is kicked */
+	kick_pending_request_queues(info);
+	spin_unlock_irq(&blkif_io_lock);
+
+	add_disk(info->gd);
+}
+
+/**
+ * Handle the backend's change of state to Closing.  Our device-layer
+ * structures must be deleted now, to ensure that writes are flushed
+ * through to the backend.  Once this is done, we switch to Closed in
+ * acknowledgement.
+ */
+static void blkfront_closing(struct xenbus_device *dev)
+{
+	struct blkfront_info *blk = dev->data;
+
+	DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
+
+	xlvbd_del(blk);
+	xenbus_switch_state(dev, XBT_NULL, XenbusStateClosed);
+}
+
+
+/*
+ * xenbus remove callback: release the ring state and free the per-device
+ * structure.  Always returns 0.
+ */
+static int blkfront_remove(struct xenbus_device *dev)
+{
+	struct blkfront_info *blk = dev->data;
+
+	DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
+
+	blkif_free(blk, 0);
+	kfree(blk);
+
+	return 0;
+}
+
+
+/*
+ * Pop the first free shadow slot off the free list and return its index.
+ * The slot's req.id, which held the link to the next free slot, is
+ * overwritten with a debug marker.
+ */
+static inline int GET_ID_FROM_FREELIST(
+	struct blkfront_info *info)
+{
+	unsigned long free = info->shadow_free;
+	/*
+	 * Valid slots are 0 .. BLK_RING_SIZE-1, so BLK_RING_SIZE itself must
+	 * trap as well (the old "free > BLK_RING_SIZE" test let it through).
+	 */
+	BUG_ON(free >= BLK_RING_SIZE);
+	info->shadow_free = info->shadow[free].req.id;
+	info->shadow[free].req.id = 0x0fffffee; /* debug */
+	return free;
+}
+
+/*
+ * Push shadow slot 'id' back onto the head of the free list and clear
+ * its request back-pointer.
+ */
+static inline void ADD_ID_TO_FREELIST(
+	struct blkfront_info *info, unsigned long id)
+{
+	struct blk_shadow *slot = &info->shadow[id];
+
+	slot->req.id  = info->shadow_free;	/* link to the previous head */
+	slot->request = 0;
+	info->shadow_free = id;
+}
+
+/*
+ * Publish the privately queued requests on the shared ring and notify the
+ * backend if the ring macro says a notification is needed.
+ */
+static inline void flush_requests(struct blkfront_info *info)
+{
+	int need_kick;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, need_kick);
+	if (!need_kick)
+		return;
+
+	notify_remote_via_irq(info->irq);
+}
+
+/*
+ * Restart request processing for the device's queue, provided the ring
+ * has room for at least one more request.
+ */
+static void kick_pending_request_queues(struct blkfront_info *info)
+{
+	if (RING_FULL(&info->ring))
+		return;
+
+	/* Re-enable calldowns. */
+	blk_start_queue(info->rq);
+	/* Kick things off immediately. */
+	do_blkif_request(info->rq);
+}
+
+/*
+ * Work item handler: kick the pending request queue while holding
+ * blkif_io_lock.
+ */
+static void blkif_restart_queue(void *arg)
+{
+	struct blkfront_info *blk = arg;
+
+	spin_lock_irq(&blkif_io_lock);
+	kick_pending_request_queues(blk);
+	spin_unlock_irq(&blkif_io_lock);
+}
+
+/*
+ * Grant-table free callback: defer the queue restart to the device's
+ * work item.
+ */
+static void blkif_restart_queue_callback(void *arg)
+{
+	struct blkfront_info *blk = arg;
+
+	schedule_work(&blk->work);
+}
+
+/* Block device open: count one more user of the device.  Returns 0. */
+int blkif_open(struct inode *inode, struct file *filep)
+{
+	struct gendisk *disk = inode->i_bdev->bd_disk;
+	struct blkfront_info *blk = disk->private_data;
+
+	blk->users++;
+	return 0;
+}
+
+
+/*
+ * Block device release: drop one user.  When the last user goes away and
+ * the backend is in Closing state, carry out the close now.  Returns 0.
+ */
+int blkif_release(struct inode *inode, struct file *filep)
+{
+	struct blkfront_info *blk = inode->i_bdev->bd_disk->private_data;
+	struct xenbus_device *xbdev;
+	XenbusState backend;
+
+	blk->users--;
+	if (blk->users != 0)
+		return 0;
+
+	/*
+	 * Check whether we have been instructed to close.  We will have
+	 * ignored this request initially, as the device was still mounted.
+	 */
+	xbdev = blk->xbdev;
+	backend = xenbus_read_driver_state(xbdev->otherend);
+	if (backend == XenbusStateClosing)
+		blkfront_closing(xbdev);
+
+	return 0;
+}
+
+
+/*
+ * Block device ioctl handler.
+ *
+ * HDIO_GETGEO returns -ENOSYS so that default geometry is used.
+ * CDROMMULTISESSION zero-fills the caller's struct cdrom_multisession
+ * (-EFAULT if it cannot be written).  Everything else returns -EINVAL.
+ */
+int blkif_ioctl(struct inode *inode, struct file *filep,
+                unsigned command, unsigned long argument)
+{
+	int off;
+
+	DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+		      command, (long)argument, inode->i_rdev);
+
+	if (command == HDIO_GETGEO) {
+		/* return ENOSYS to use defaults */
+		return -ENOSYS;
+	}
+
+	if (command == CDROMMULTISESSION) {
+		DPRINTK("FIXME: support multisession CDs later\n");
+		for (off = 0; off < sizeof(struct cdrom_multisession); off++)
+			if (put_user(0, (char __user *)(argument + off)))
+				return -EFAULT;
+		return 0;
+	}
+
+	/* Unsupported command: same return as native Linux. */
+	return -EINVAL;
+}
+
+/*
+ * blkif_queue_request
+ *
+ * Translate one block-layer request into a ring request: take a free
+ * shadow slot, grant the backend access to every segment's page and fill
+ * in the next private ring entry.  The ring is not pushed here.
+ *
+ * Returns 0 once the request is on the ring.  Returns 1 if the device is
+ * not connected or no grant references are available; in the latter case
+ * a callback is registered to restart the queue later.
+ */
+static int blkif_queue_request(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+	unsigned long buffer_mfn;
+	struct blkif_request *ring_req;
+	struct bio *bio;
+	struct bio_vec *bvec;
+	int idx;
+	unsigned long id;
+	unsigned int fsect, lsect;
+	int ref;
+	grant_ref_t gref_head;
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (gnttab_alloc_grant_references(
+		BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
+		gnttab_request_free_callback(
+			&info->callback,
+			blkif_restart_queue_callback,
+			info,
+			BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		return 1;
+	}
+
+	/* Fill out a communications ring structure. */
+	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+	id = GET_ID_FROM_FREELIST(info);
+	info->shadow[id].request = (unsigned long)req;	/* lets the completion path find req again */
+
+	ring_req->id = id;
+	ring_req->operation = rq_data_dir(req) ?
+		BLKIF_OP_WRITE : BLKIF_OP_READ;
+	ring_req->sector_number = (blkif_sector_t)req->sector;
+	ring_req->handle = info->handle;
+
+	ring_req->nr_segments = 0;
+	rq_for_each_bio (bio, req) {
+		bio_for_each_segment (bvec, bio, idx) {
+			BUG_ON(ring_req->nr_segments
+			       == BLKIF_MAX_SEGMENTS_PER_REQUEST);
+			buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT;
+			fsect = bvec->bv_offset >> 9;	/* first sector in the page, in 512-byte units */
+			lsect = fsect + (bvec->bv_len >> 9) - 1;	/* last sector, inclusive */
+			/* install a grant reference. */
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			gnttab_grant_foreign_access_ref(
+				ref,
+				info->xbdev->otherend_id,
+				buffer_mfn,
+				rq_data_dir(req) );	/* presumably the read-only flag: set for writes -- TODO confirm */
+
+			info->shadow[id].frame[ring_req->nr_segments] =
+				mfn_to_pfn(buffer_mfn);
+
+			ring_req->seg[ring_req->nr_segments] =
+				(struct blkif_request_segment) {
+					.gref       = ref,
+					.first_sect = fsect,
+					.last_sect  = lsect };
+
+			ring_req->nr_segments++;
+		}
+	}
+
+	info->ring.req_prod_pvt++;	/* private producer index only */
+
+	/* Keep a private copy so we can reissue requests when recovering. */
+	info->shadow[id].req = *ring_req;
+
+	gnttab_free_grant_references(gref_head);	/* hand back the references that were not claimed */
+
+	return 0;
+}
+
+/*
+ * do_blkif_request
+ *  Request function for the queue: move requests from the block layer's
+ *  queue onto the ring until the queue is empty, the ring is full or a
+ *  request cannot be queued.  In the latter two cases the queue is
+ *  stopped.  The backend is notified once at the end if anything was
+ *  queued.
+ */
+void do_blkif_request(request_queue_t *rq)
+{
+	struct blkfront_info *info = NULL;
+	struct request *req;
+	int queued;
+
+	DPRINTK("Entered do_blkif_request\n");
+
+	queued = 0;
+
+	while ((req = elv_next_request(rq)) != NULL) {
+		info = req->rq_disk->private_data;
+		if (!blk_fs_request(req)) {
+			end_request(req, 0);	/* non-fs requests are completed as failed */
+			continue;
+		}
+
+		if (RING_FULL(&info->ring))
+			goto wait;	/* req has not been dequeued yet, so no requeue is needed */
+
+		DPRINTK("do_blk_req %p: cmd %p, sec %lx, "
+			"(%u/%li) buffer:%p [%s]\n",
+			req, req->cmd, req->sector, req->current_nr_sectors,
+			req->nr_sectors, req->buffer,
+			rq_data_dir(req) ? "write" : "read");
+
+
+		blkdev_dequeue_request(req);
+		if (blkif_queue_request(req)) {
+			blk_requeue_request(rq, req);	/* put it back for a later attempt */
+		wait:
+			/* Avoid pointless unplugs. */
+			blk_stop_queue(rq);
+			break;
+		}
+
+		queued++;
+	}
+
+	if (queued != 0)
+		flush_requests(info);	/* info belongs to the last request examined */
+}
+
+
+static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+	struct request *req;
+	struct blkif_response *bret;
+	RING_IDX i, rp;
+	unsigned long flags;
+	struct blkfront_info *info = (struct blkfront_info *)dev_id;
+
+	spin_lock_irqsave(&blkif_io_lock, flags);
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+		spin_unlock_irqrestore(&blkif_io_lock, flags);
+		return IRQ_HANDLED;
+	}
+
+ again:
+	rp = info->ring.sring->rsp_prod;
+	rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+	for (i = info->ring.rsp_cons; i != rp; i++) {
+		unsigned long id;
+		int ret;
+
+		bret = RING_GET_RESPONSE(&info->ring, i);
+		id   = bret->id;
+		req  = (struct request *)info->shadow[id].request;
+
+		blkif_completion(&info->shadow[id]);
+
+		ADD_ID_TO_FREELIST(info, id);
+
+		switch (bret->operation) {
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+			if (unlikely(bret->status != BLKIF_RSP_OKAY))
+				DPRINTK("Bad return from blkdev data "
+					"request: %x\n", bret->status);
+
+			ret = end_that_request_first(
+				req, (bret->status == BLKIF_RSP_OKAY),
+				req->hard_nr_sectors);
+			BUG_ON(ret);
+			end_that_request_last(
+				req, (bret->status == BLKIF_RSP_OKAY));
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	info->ring.rsp_cons = i;
+
+	if (i != info->ring.req_prod_pvt) {
+		int more_to_do;
+		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+		if (more_to_do)
+			goto again;
+	} else
+		info->ring.sring->rsp_event = i + 1;
+
+	kick_pending_request_queues(info);
+
+	spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	/* Prevent new requests being issued until we fix things up. */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	spin_unlock_irq(&blkif_io_lock);
+
+	/* Free resources associated with old device channel. */
+	if (info->ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->ring_ref, 0,
+					  (unsigned long)info->ring.sring);
+		info->ring_ref = GRANT_INVALID_REF;
+		info->ring.sring = NULL;
+	}
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info);
+	info->evtchn = info->irq = 0;
+
+}
+
+static void blkif_completion(struct blk_shadow *s)
+{
+	int i;
+	for (i = 0; i < s->req.nr_segments; i++)
+		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+}
+
+static void blkif_recover(struct blkfront_info *info)
+{
+	int i;
+	struct blkif_request *req;
+	struct blk_shadow *copy;
+	int j;
+
+	/* Stage 1: Make a safe copy of the shadow state. */
+	copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL);
+	memcpy(copy, info->shadow, sizeof(info->shadow));
+
+	/* Stage 2: Set up free list. */
+	memset(&info->shadow, 0, sizeof(info->shadow));
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow_free = info->ring.req_prod_pvt;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Stage 3: Find pending requests and requeue them. */
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/* Not in use? */
+		if (copy[i].request == 0)
+			continue;
+
+		/* Grab a request slot and copy shadow state into it. */
+		req = RING_GET_REQUEST(
+			&info->ring, info->ring.req_prod_pvt);
+		*req = copy[i].req;
+
+		/* We get a new request id, and must reset the shadow state. */
+		req->id = GET_ID_FROM_FREELIST(info);
+		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+		/* Rewrite any grant references invalidated by susp/resume. */
+		for (j = 0; j < req->nr_segments; j++)
+			gnttab_grant_foreign_access_ref(
+				req->seg[j].gref,
+				info->xbdev->otherend_id,
+				pfn_to_mfn(info->shadow[req->id].frame[j]),
+				rq_data_dir(
+					(struct request *)
+					info->shadow[req->id].request));
+		info->shadow[req->id].req = *req;
+
+		info->ring.req_prod_pvt++;
+	}
+
+	kfree(copy);
+
+	(void)xenbus_switch_state(info->xbdev, XBT_NULL, XenbusStateConnected);
+
+	/* Now safe for us to use the shared ring */
+	spin_lock_irq(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	spin_unlock_irq(&blkif_io_lock);
+
+	/* Send off requeued requests */
+	flush_requests(info);
+
+	/* Kick any other new requests queued since we resumed */
+	spin_lock_irq(&blkif_io_lock);
+	kick_pending_request_queues(info);
+	spin_unlock_irq(&blkif_io_lock);
+}
+
+
+/* ** Driver Registration ** */
+
+
+static struct xenbus_device_id blkfront_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+
+static struct xenbus_driver blkfront = {
+	.name = "vbd",
+	.owner = THIS_MODULE,
+	.ids = blkfront_ids,
+	.probe = blkfront_probe,
+	.remove = blkfront_remove,
+	.resume = blkfront_resume,
+	.otherend_changed = backend_changed,
+};
+
+
+static int __init xlblk_init(void)
+{
+	if (xen_init() < 0)
+		return -ENODEV;
+
+	if (xlvbd_alloc_major() < 0)
+		return -ENODEV;
+
+	return xenbus_register_frontend(&blkfront);
+}
+module_init(xlblk_init);
+
+
+static void __exit xlblk_exit(void)
+{
+	xenbus_unregister_driver(&blkfront);	/* undo xlblk_init()'s register */
+}
+module_exit(xlblk_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
--- /dev/null
+++ linus-2.6/drivers/xen/blkfront/block.h
@@ -0,0 +1,156 @@
+/******************************************************************************
+ * block.h
+ * 
+ * Shared definitions between all levels of XenLinux Virtual block devices.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_DRIVERS_BLOCK_H__
+#define __XEN_DRIVERS_BLOCK_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <linux/devfs_fs_kernel.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/ring.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#if 1
+#define IPRINTK(fmt, args...) \
+    printk(KERN_INFO "xen_blk: " fmt, ##args)
+#else
+#define IPRINTK(fmt, args...) ((void)0)
+#endif
+
+#if 1
+#define WPRINTK(fmt, args...) \
+    printk(KERN_WARNING "xen_blk: " fmt, ##args)
+#else
+#define WPRINTK(fmt, args...) ((void)0)
+#endif
+
+#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
+struct xlbd_type_info
+{
+	int partn_shift;
+	int disks_per_major;
+	char *devname;
+	char *diskname;
+};
+
+struct xlbd_major_info
+{
+	int major;
+	int index;
+	int usage;
+	struct xlbd_type_info *type;
+};
+
+struct blk_shadow {
+	struct blkif_request req;
+	unsigned long request;
+	unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+
+#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'.  They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+	struct xenbus_device *xbdev;
+	dev_t dev;
+	struct gendisk *gd;
+	int vdevice;
+	blkif_vdev_t handle;
+	int connected;			/* one of BLKIF_STATE_* */
+	int ring_ref;			/* GRANT_INVALID_REF if unset */
+	struct blkif_front_ring ring;
+	unsigned int evtchn, irq;	/* irq == 0: no handler bound */
+	struct xlbd_major_info *mi;
+	request_queue_t *rq;
+	struct work_struct work;
+	struct gnttab_free_callback callback;
+	struct blk_shadow shadow[BLK_RING_SIZE];
+	unsigned long shadow_free;
+
+	/*
+	 * The number of people holding this device open.  We won't allow a
+	 * hot-unplug unless this is 0.
+	 */
+	int users;
+};
+
+extern spinlock_t blkif_io_lock;
+
+extern int blkif_open(struct inode *inode, struct file *filep);
+extern int blkif_release(struct inode *inode, struct file *filep);
+extern int blkif_ioctl(struct inode *inode, struct file *filep,
+                       unsigned command, unsigned long argument);
+extern int blkif_check(dev_t dev);
+extern int blkif_revalidate(dev_t dev);
+extern void do_blkif_request (request_queue_t *rq);
+
+/* Virtual block device subsystem. */
+int xlvbd_alloc_major(void);
+/* Note that xlvbd_add doesn't call add_disk for you: you're expected
+   to call add_disk on info->gd once the disk is properly connected
+   up. */
+int xlvbd_add(blkif_sector_t capacity, int device,
+	      u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
+void xlvbd_del(struct blkfront_info *info);
+
+#endif /* __XEN_DRIVERS_BLOCK_H__ */
--- /dev/null
+++ linus-2.6/drivers/xen/blkfront/vbd.c
@@ -0,0 +1,216 @@
+/******************************************************************************
+ * vbd.c
+ * 
+ * XenLinux virtual block device driver (xvd).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "block.h"
+#include <linux/blkdev.h>
+#include <linux/list.h>
+
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+
+static struct xlbd_type_info xvd_type_info = {
+	.partn_shift = 4,
+	.disks_per_major = 16,
+	.devname = "xvd",
+	.diskname = "xvd"
+};
+
+static struct xlbd_major_info xvd_major_info = {
+	.major = 201,
+	.type = &xvd_type_info
+};
+
+/* Information about our VBDs. */
+#define MAX_VBDS 64
+static LIST_HEAD(vbds_list);
+
+static struct block_device_operations xlvbd_block_fops =
+{
+	.owner = THIS_MODULE,
+	.open = blkif_open,
+	.release = blkif_release,
+	.ioctl  = blkif_ioctl,
+};
+
+DEFINE_SPINLOCK(blkif_io_lock);
+
+int
+xlvbd_alloc_major(void)
+{
+	printk("Registering block device major %i\n", xvd_major_info.major);
+	if (register_blkdev(xvd_major_info.major,
+			    xvd_major_info.type->devname)) {
+		WPRINTK("can't get major %d with name %s\n",
+			xvd_major_info.major, xvd_major_info.type->devname);
+		return -1;
+	}
+
+	devfs_mk_dir(xvd_major_info.type->devname);
+	return 0;
+}
+
+static int
+xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+{
+	request_queue_t *rq;
+
+	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	if (rq == NULL)
+		return -1;
+
+	elevator_init(rq, "noop");
+
+	/* Hard sector size and max sectors impersonate the equiv. hardware. */
+	blk_queue_hardsect_size(rq, sector_size);
+	blk_queue_max_sectors(rq, 512);
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+	/* Ensure a merged request will fit in a single I/O ring slot. */
+	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	/* Make sure buffer addresses are sector-aligned. */
+	blk_queue_dma_alignment(rq, 511);
+
+	gd->queue = rq;
+
+	return 0;
+}
+
+static int
+xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice,
+		    u16 vdisk_info, u16 sector_size,
+		    struct blkfront_info *info)
+{
+	struct gendisk *gd;
+	struct xlbd_major_info *mi;
+	int nr_minors = 1;
+	int err = -ENODEV;
+
+	BUG_ON(info->gd != NULL);
+	BUG_ON(info->mi != NULL);
+	BUG_ON(info->rq != NULL);
+
+	mi = &xvd_major_info;
+	info->mi = mi;
+
+	if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
+		nr_minors = 1 << mi->type->partn_shift;
+
+	gd = alloc_disk(nr_minors);
+	if (gd == NULL)
+		goto out;
+
+	if (nr_minors > 1)
+		sprintf(gd->disk_name, "%s%c", mi->type->diskname,
+			'a' + mi->index * mi->type->disks_per_major +
+			(minor >> mi->type->partn_shift));
+	else
+		sprintf(gd->disk_name, "%s%c%d", mi->type->diskname,
+			'a' + mi->index * mi->type->disks_per_major +
+			(minor >> mi->type->partn_shift),
+			minor & ((1 << mi->type->partn_shift) - 1));
+
+	gd->major = mi->major;
+	gd->first_minor = minor;
+	gd->fops = &xlvbd_block_fops;
+	gd->private_data = info;
+	gd->driverfs_dev = &(info->xbdev->dev);
+	set_capacity(gd, capacity);
+
+	if (xlvbd_init_blk_queue(gd, sector_size)) {
+		put_disk(gd);	/* never add_disk()ed: just drop our reference */
+		goto out;
+	}
+
+	info->rq = gd->queue;
+
+	if (vdisk_info & VDISK_READONLY)
+		set_disk_ro(gd, 1);
+
+	if (vdisk_info & VDISK_REMOVABLE)
+		gd->flags |= GENHD_FL_REMOVABLE;
+
+	if (vdisk_info & VDISK_CDROM)
+		gd->flags |= GENHD_FL_CD;
+
+	info->gd = gd;
+
+	return 0;
+
+ out:
+	info->mi = NULL;
+	return err;
+}
+
+int
+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
+	  u16 sector_size, struct blkfront_info *info)
+{
+	struct block_device *bd;
+	int err = 0;
+
+	info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
+
+	bd = bdget(info->dev);
+	if (bd == NULL)
+		return -ENODEV;
+
+	err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice,
+				  vdisk_info, sector_size, info);
+
+	bdput(bd);
+	return err;
+}
+
+void
+xlvbd_del(struct blkfront_info *info)
+{
+	if (info->mi == NULL)
+		return;
+
+	BUG_ON(info->gd == NULL);
+	del_gendisk(info->gd);
+	put_disk(info->gd);
+	info->gd = NULL;
+
+	info->mi = NULL;
+
+	BUG_ON(info->rq == NULL);
+	blk_cleanup_queue(info->rq);
+	info->rq = NULL;
+}

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
@ 2006-05-09  7:16   ` Pavel Machek
  2006-05-10 20:09     ` Andi Kleen
  2006-05-12  0:28     ` [Xen-devel] " Rusty Russell
  2006-05-09 16:44   ` Andi Kleen
  2006-05-18 20:20   ` Zachary Amsden
  2 siblings, 2 replies; 185+ messages in thread
From: Pavel Machek @ 2006-05-09  7:16 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi!

> --- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
> +++ linus-2.6/include/asm-i386/mach-default/mach_system.h
> @@ -1,6 +1,8 @@
>  #ifndef __ASM_MACH_SYSTEM_H
>  #define __ASM_MACH_SYSTEM_H
>  
> +#define clearsegment(seg)

do {} while (0), please.

							Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09  7:00 ` [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen Chris Wright
@ 2006-05-09  7:21   ` Pavel Machek
  2006-05-10 20:23     ` Andi Kleen
  2006-05-09 14:49   ` Martin J. Bligh
  1 sibling, 1 reply; 185+ messages in thread
From: Pavel Machek @ 2006-05-09  7:21 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi!

> +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
> +{
> +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
> +	C(0); C(1); C(2);
> +#undef C
> +}

Why not use for loop here? gcc should be able to optimize it...

> +#define load_TR_desc()

do {} while (0)...

> +#define load_gdt(dtr) do {						\
...
> +} while (0)

So you know the trick :-)

> +#define load_idt(dtr) HYPERVISOR_set_trap_table(xen_trap_table)
> +#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
> +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))

__volatile (not __volatile__?). could you just use 'asm volatile'
without __s instead?

							Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 00/35] Xen i386 paravirtualization support
@ 2006-05-09  8:49 Chris Wright
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
                   ` (35 more replies)
  0 siblings, 36 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09  8:49 UTC (permalink / raw)
  To: linux-kernel; +Cc: virtualization, xen-devel

Unlike full virtualization in which the virtual machine provides 
the same platform interface as running natively on the hardware,
paravirtualization requires modification to the guest operating system
to work with the platform interface provided by the hypervisor.

Xen was designed with performance in mind.  Calls to the hypervisor
are minimized, batched if necessary, and non-critical codepaths are left
unmodified in the case where the privileged instruction can be trapped and
emulated by the hypervisor.  The Xen API is designed to be OS agnostic and
has had Linux, NetBSD, FreeBSD, Solaris, Plan9 and Netware ported to it.
Xen also provides support for running directly on native hardware.

The following patch series provides the minimal support required to
launch Xen paravirtual guests on standard x86 hardware running the Xen
hypervisor.  These patches effectively port the Linux kernel to run on the
platform interface provided by Xen.  This port is done as an i386 subarch.
In the future, we will break this patchset up to place the general
infrastructure and subarch bits that may have common users at the
beginning of the series, ripe for picking off and pushing upstream.

With these patches you will be able to launch an unprivileged guest
running the modified Linux kernel and unmodified userspace.  This guest
is x86, UP only, runs in shadow translated mode, and has no direct access
to hardware.  This simplifies the patchset to the minimum functionality
needed to support a paravirtualized guest.  It's worth noting that
a fair amount of this patchset deals with paravirtualizing I/O, not
just CPU-only.  The additional missing functionality is primarily about
full SMP support, optimizations such as direct writable page tables,
and the management interface.

At a high-level, the patches provide the following:

- Kconfig and Makefile changes required to support Xen
- subarch changes to allow more platform functionality to be
  implemented by an i386 subarch
- Xen subarch implementation
- start of day code for running in the hypervisor provided environment (paging
  enabled)
- basic Xen drivers to provide a fully functional guest

The Xen platform API encapsulates the following types of requirements:

- idt, gdt, ldt (descriptor table handling)
- cr2, fpu_taskswitch, debug registers (privileged register handling)
- mmu (page table, tlb, and cache handling)
- memory reservations
- time and timer
- vcpu (init, up/down vcpu)
- schedule (processor yield, shutdown, etc)
- event channel (generalized virtual interrupt handling)
- grant table (shared memory interface for high speed interdomain communication)
- block device I/O
- network device I/O
- console device I/O
- Xen feature map
- Xen version info

Thanks to all who have reviewed earlier versions of these patches.
-chris
--

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
@ 2006-05-09 10:05   ` Adrian Bunk
  2006-05-09 11:06     ` Ed Tomlinson
                       ` (2 more replies)
  2006-05-09 14:47   ` Daniel Walker
                     ` (2 subsequent siblings)
  3 siblings, 3 replies; 185+ messages in thread
From: Adrian Bunk @ 2006-05-09 10:05 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, May 09, 2006 at 12:00:01AM -0700, Chris Wright wrote:
>...
> --- linus-2.6.orig/arch/i386/Kconfig
> +++ linus-2.6/arch/i386/Kconfig
>...
>  config X86_IO_APIC
>  	bool
> -	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
> +	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN))
>  	default y
>...

<nitpick>not required</nitpick>

> --- linus-2.6.orig/kernel/Kconfig.hz
> +++ linus-2.6/kernel/Kconfig.hz
> @@ -3,7 +3,7 @@
>  #
>  
>  choice
> -	prompt "Timer frequency"
> +	prompt "Timer frequency" if !XEN
>  	default HZ_250
>  	help
>  	 Allows the configuration of the timer frequency. It is customary
> @@ -40,7 +40,7 @@ endchoice
>  
>  config HZ
>  	int
> -	default 100 if HZ_100
> +	default 100 if HZ_100 || XEN
>  	default 250 if HZ_250
>  	default 1000 if HZ_1000
>...

Why?
  
cu
Adrian

-- 

       "Is there not promise of rain?" Ling Tan asked suddenly out
        of the darkness. There had been need of rain for many days.
       "Only a promise," Lao Er said.
                                       Pearl S. Buck - Dragon Seed


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 10:05   ` Adrian Bunk
@ 2006-05-09 11:06     ` Ed Tomlinson
  2006-05-09 12:45     ` Christian Limpach
  2006-05-09 23:23     ` Chris Wright
  2 siblings, 0 replies; 185+ messages in thread
From: Ed Tomlinson @ 2006-05-09 11:06 UTC (permalink / raw)
  To: Adrian Bunk
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

On Tuesday 09 May 2006 06:05, Adrian Bunk wrote:
> On Tue, May 09, 2006 at 12:00:01AM -0700, Chris Wright wrote:
> >...
> > --- linus-2.6.orig/arch/i386/Kconfig
> > +++ linus-2.6/arch/i386/Kconfig
> >...
> >  config X86_IO_APIC
> >  	bool
> > -	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
> > +	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN))
> >  	default y
> >...
> 
> <nitpick>not required</nitpick>
> 
> > --- linus-2.6.orig/kernel/Kconfig.hz
> > +++ linus-2.6/kernel/Kconfig.hz
> > @@ -3,7 +3,7 @@
> >  #
> >  
> >  choice
> > -	prompt "Timer frequency"
> > +	prompt "Timer frequency" if !XEN
> >  	default HZ_250
> >  	help
> >  	 Allows the configuration of the timer frequency. It is customary
> > @@ -40,7 +40,7 @@ endchoice
> >  
> >  config HZ
> >  	int
> > -	default 100 if HZ_100
> > +	default 100 if HZ_100 || XEN
> >  	default 250 if HZ_250
> >  	default 1000 if HZ_1000
> >...
> 
> Why?

Guessing, but it's probably to limit the number of parahypervisor calls.

Ed Tomlinson

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 16/35] subarch support for interrupt and exception gates
  2006-05-09  7:00 ` [RFC PATCH 16/35] subarch support for interrupt and exception gates Chris Wright
@ 2006-05-09 11:09   ` Andi Kleen
  2006-05-09 12:55     ` Christian Limpach
  2006-05-13 12:27   ` Andrew Morton
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 11:09 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt


> +/*
> + * This needs to use 'idt_table' rather than 'idt', and
> + * thus use the _nonmapped_ version of the IDT, as the
> + * Pentium F0 0F bugfix can have resulted in the mapped
> + * IDT being write-protected.
> + */
> +void set_intr_gate(unsigned int n, void *addr)
> +{
> +	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
> +}

No need to duplicate the various set_*_gate functions into the subarchs.


> +static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
> +{
> +	/* _set_gate(n, 5, 0, 0, (gdt_entry<<3)); */
> +}

Looks weird, but can be handled in the low level function.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
@ 2006-05-09 11:55   ` Herbert Xu
  2006-05-09 12:43     ` Christian Limpach
  2006-05-09 11:58   ` Christoph Hellwig
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 185+ messages in thread
From: Herbert Xu @ 2006-05-09 11:55 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, Christian.Limpach, xen-devel,
	netdev, ian.pratt

Hi Chris:

Chris Wright <chrisw@sous-sol.org> wrote:
>
> +/** Send a packet on a net device to encourage switches to learn the
> + * MAC. We send a fake ARP request.
> + *
> + * @param dev device
> + * @return 0 on success, error code otherwise
> + */
> +static int send_fake_arp(struct net_device *dev)

I think we talked about this before.  I don't see why Xen is so special
that it needs its own gratuitous arp routine embedded in the driver.
If this is needed at all (presumably for migration) then it should be
performed by the management scripts which can send grat ARP packets just
as easily.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
  2006-05-09 11:55   ` [Xen-devel] " Herbert Xu
@ 2006-05-09 11:58   ` Christoph Hellwig
  2006-05-09 23:37     ` Chris Wright
  2006-05-09 18:56   ` Stephen Hemminger
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 11:58 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, netdev

On Tue, May 09, 2006 at 12:00:34AM -0700, Chris Wright wrote:
> The network device frontend driver allows the kernel to access network
> devices exported by a virtual machine containing a physical
> network device driver.

Please don't add procfs code to new network drivers.  Especially if it's oopsable
like the code in this driver by simple device renaming.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 35/35] Add Xen virtual block device driver.
  2006-05-09  7:00 ` [RFC PATCH 35/35] Add Xen virtual block " Chris Wright
@ 2006-05-09 12:01   ` Christoph Hellwig
  0 siblings, 0 replies; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 12:01 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, May 09, 2006 at 12:00:35AM -0700, Chris Wright wrote:
> The block device frontend driver allows the kernel to access block
> devices exported by a virtual machine containing a physical
> block device driver.

Any reason you're using the old crappy xen I/O code instead of Rusty's
alternative version?

Also please stop this stupid front/back naming.  In Linux terminology the
frontend is the client if there's a need for a postfix at all, and the
backend the server.  Compare that to e.g. ibm vio.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 11:55   ` [Xen-devel] " Herbert Xu
@ 2006-05-09 12:43     ` Christian Limpach
  2006-05-09 13:01       ` Herbert Xu
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 12:43 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, netdev, ian.pratt

On Tue, May 09, 2006 at 09:55:33PM +1000, Herbert Xu wrote:
> Hi Chris:
> 
> Chris Wright <chrisw@sous-sol.org> wrote:
> >
> > +/** Send a packet on a net device to encourage switches to learn the
> > + * MAC. We send a fake ARP request.
> > + *
> > + * @param dev device
> > + * @return 0 on success, error code otherwise
> > + */
> > +static int send_fake_arp(struct net_device *dev)
> 
> I think we talked about this before.  I don't see why Xen is so special
> that it needs its own gratuitous arp routine embedded in the driver.
> If this is needed at all (presumably for migration) then it should be
> performed by the management scripts which can send grat ARP packets just
> as easily.

There are at least two reasons why having it in the driver is preferable:
- synchronizing sending the fake ARP request with when the device is
  operational -- you really want to make this well synchronized to keep
  unreachability as short as possible, especially when doing live
  migration
- anybody but the guest might not know (all) the MAC addresses for which
  to send a fake ARP request

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 10:05   ` Adrian Bunk
  2006-05-09 11:06     ` Ed Tomlinson
@ 2006-05-09 12:45     ` Christian Limpach
  2006-05-09 23:23     ` Chris Wright
  2 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 12:45 UTC (permalink / raw)
  To: Adrian Bunk
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, May 09, 2006 at 12:05:47PM +0200, Adrian Bunk wrote:
> On Tue, May 09, 2006 at 12:00:01AM -0700, Chris Wright wrote:
> > --- linus-2.6.orig/kernel/Kconfig.hz
> > +++ linus-2.6/kernel/Kconfig.hz
> > @@ -3,7 +3,7 @@
> >  #
> >  
> >  choice
> > -	prompt "Timer frequency"
> > +	prompt "Timer frequency" if !XEN
> >  	default HZ_250
> >  	help
> >  	 Allows the configuration of the timer frequency. It is customary
> > @@ -40,7 +40,7 @@ endchoice
> >  
> >  config HZ
> >  	int
> > -	default 100 if HZ_100
> > +	default 100 if HZ_100 || XEN
> >  	default 250 if HZ_250
> >  	default 1000 if HZ_1000
> >...
> 
> Why?

Because the hypervisor sends timer interrupts to the guest at a rate
of 100 Hz while the guest is running.  We might add support to have
an adjustable rate in the future but so far 100 Hz has worked quite
well for us.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 16/35] subarch support for interrupt and exception gates
  2006-05-09 11:09   ` Andi Kleen
@ 2006-05-09 12:55     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 12:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, Ian Pratt, xen-devel, linux-kernel

On Tue, May 09, 2006 at 01:09:49PM +0200, Andi Kleen wrote:
> 
> > +/*
> > + * This needs to use 'idt_table' rather than 'idt', and
> > + * thus use the _nonmapped_ version of the IDT, as the
> > + * Pentium F0 0F bugfix can have resulted in the mapped
> > + * IDT being write-protected.
> > + */
> > +void set_intr_gate(unsigned int n, void *addr)
> > +{
> > +	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
> > +}
> 
> No need to duplicate the various set_*_gate functions into the subarchs.

Indeed, we'll change that.  Thanks.

    christian

> > +static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
> > +{
> > +	/* _set_gate(n, 5, 0, 0, (gdt_entry<<3)); */
> > +}
> 
> Looks weird, but can be handled in the low level function.
> 
> -Andi

> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/virtualization


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 12:43     ` Christian Limpach
@ 2006-05-09 13:01       ` Herbert Xu
  2006-05-09 13:14         ` Andi Kleen
  2006-05-09 13:16         ` Christian Limpach
  0 siblings, 2 replies; 185+ messages in thread
From: Herbert Xu @ 2006-05-09 13:01 UTC (permalink / raw)
  To: Christian Limpach
  Cc: herbert, xen-devel, ian.pratt, netdev, linux-kernel, chrisw,
	virtualization

Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> 
> There's at least two reasons why having it in the driver is preferable:
> - synchronizing sending the fake ARP request with when the device is
>  operational -- you really want to make this well synchronized to keep
>  unreachability as short as possible, especially when doing live
>  migration
> - anybody but the guest might not know (all) the MAC addresses for which
>  to send a fake ARP request

Sure.  However, what's there to stop you from doing this in user-space
inside the guest?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 13:01       ` Herbert Xu
@ 2006-05-09 13:14         ` Andi Kleen
  2006-05-09 13:16         ` Christian Limpach
  1 sibling, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 13:14 UTC (permalink / raw)
  To: virtualization
  Cc: Herbert Xu, Christian Limpach, chrisw, ian.pratt, xen-devel,
	netdev, linux-kernel

On Tuesday 09 May 2006 15:01, Herbert Xu wrote:
> Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> > 
> > There's at least two reasons why having it in the driver is preferable:
> > - synchronizing sending the fake ARP request with when the device is
> >  operational -- you really want to make this well synchronized to keep
> >  unreachability as short as possible, especially when doing live
> >  migration
> > - anybody but the guest might not know (all) the MAC addresses for which
> >  to send a fake ARP request
> 
> Sure.  However, what's there to stop you from doing this in user-space
> inside the guest?

I guess they don't trust user space. But the standard ipup script
from iproute2 does this already so at least for modern distributions
it should just work.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 13:01       ` Herbert Xu
  2006-05-09 13:14         ` Andi Kleen
@ 2006-05-09 13:16         ` Christian Limpach
  2006-05-09 13:26           ` Herbert Xu
  1 sibling, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 13:16 UTC (permalink / raw)
  To: Herbert Xu
  Cc: xen-devel, ian.pratt, netdev, linux-kernel, chrisw, virtualization

On Tue, May 09, 2006 at 11:01:05PM +1000, Herbert Xu wrote:
> Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> > 
> > There's at least two reasons why having it in the driver is preferable:
> > - synchronizing sending the fake ARP request with when the device is
> >  operational -- you really want to make this well synchronized to keep
> >  unreachability as short as possible, especially when doing live
> >  migration
> > - anybody but the guest might not know (all) the MAC addresses for which
> >  to send a fake ARP request
> 
> Sure.  However, what's there to stop you from doing this in user-space
> inside the guest?

Possibly having to page in the process and switching to it would add
to the live migration time.  More importantly, having to install an
additional program in the guest is certainly not very convenient.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen)
  2006-05-09  7:00 ` [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen) Chris Wright
@ 2006-05-09 13:21   ` Andi Kleen
  2006-05-09 15:13     ` Christian Limpach
  0 siblings, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 13:21 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt


> +extern void stop_hz_timer(void);
> +extern void start_hz_timer(void);
> +
> +void xen_idle(void);
> +
>  static char * __init machine_specific_memory_setup(void)
>  {
>  	unsigned long max_pfn = xen_start_info->nr_pages;
> @@ -65,4 +70,23 @@ static void __init machine_specific_arch
>  		console_use_vt = 0;
>  		conswitchp = NULL;
>  	}
> +
> +	pm_idle = xen_idle;
> +}
> +
> +void xen_idle(void)

I think that should be in some .c file, not a .h

Probably applies to more of your functions too.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 13:16         ` Christian Limpach
@ 2006-05-09 13:26           ` Herbert Xu
  2006-05-09 14:00             ` Christian Limpach
  0 siblings, 1 reply; 185+ messages in thread
From: Herbert Xu @ 2006-05-09 13:26 UTC (permalink / raw)
  To: Christian Limpach
  Cc: herbert, xen-devel, ian.pratt, netdev, linux-kernel, chrisw,
	virtualization

Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> 
> Possibly having to page in the process and switching to it would add
> to the live migration time.  More importantly, having to install an
> additional program in the guest is certainly not very convenient.

Sorry I'm still not convinced.  What's there to stop me from suspending
my laptop to disk, moving it from port A to port B and resuming it?

Wouldn't I be in exactly the same situation? By the same reasoning we'd
be adding a gratuitous ARP routine to every single laptop network driver.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-09  7:00 ` [RFC PATCH 29/35] Add the Xen virtual console driver Chris Wright
@ 2006-05-09 13:26   ` Andi Kleen
  2006-05-09 15:03     ` Christian Limpach
  2006-05-13 12:27   ` Andrew Morton
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 13:26 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt


> --- linus-2.6.orig/drivers/char/tty_io.c
> +++ linus-2.6/drivers/char/tty_io.c
> @@ -132,6 +132,8 @@ LIST_HEAD(tty_drivers);			/* linked list
>     vt.c for deeply disgusting hack reasons */
>  DEFINE_MUTEX(tty_mutex);
>  
> +int console_use_vt = 1;
> +

Can you explain why this variable is needed? It shouldn't be. If you only
register your console as the primary console nothing else should
be printed.


> +/*** Useful function for console debugging -- goes straight to Xen. ***/
> +asmlinkage int xprintk(const char *fmt, ...)

This is called early_printk in the rest of i386. Please use that name too.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 13:26           ` Herbert Xu
@ 2006-05-09 14:00             ` Christian Limpach
  2006-05-09 14:30               ` David Boutcher
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 14:00 UTC (permalink / raw)
  To: Herbert Xu
  Cc: xen-devel, ian.pratt, netdev, linux-kernel, chrisw, virtualization

On Tue, May 09, 2006 at 11:26:03PM +1000, Herbert Xu wrote:
> Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> > 
> > Possibly having to page in the process and switching to it would add
> > to the live migration time.  More importantly, having to install an
> > additional program in the guest is certainly not very convenient.
> 
> Sorry I'm still not convinced.  What's there to stop me from suspending
> my laptop to disk, moving it from port A to port B and resuming it?
> 
> Wouldn't I be in exactly the same situation? By the same reasoning we'd
> be adding a gratuitous ARP routine to every single laptop network driver.

It is the same situation except that in the laptop case you don't care
that reconfiguring your network will take a second or a few.  For live
migration we're looking at network downtime from as low as 60ms to
something like 210ms on a busy virtual machine.  I'm not saying that
a userspace solution wouldn't work but it would probably add a measurable
delay to the network downtime during live migration.

You might also find the following paper an interesting read:
http://www.cl.cam.ac.uk/Research/SRG/netos/papers/2005-migration-nsdi-pre.pdf

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 14:00             ` Christian Limpach
@ 2006-05-09 14:30               ` David Boutcher
  2006-05-09 23:35                 ` Chris Wright
  0 siblings, 1 reply; 185+ messages in thread
From: David Boutcher @ 2006-05-09 14:30 UTC (permalink / raw)
  To: Christian Limpach
  Cc: chrisw, Herbert Xu, ian.pratt, linux-kernel, netdev,
	virtualization, xen-devel

virtualization-bounces@lists.osdl.org wrote on 05/09/2006 09:00:27 AM:
> On Tue, May 09, 2006 at 11:26:03PM +1000, Herbert Xu wrote:
> > Christian Limpach <Christian.Limpach@cl.cam.ac.uk> wrote:
> > > 
> > > Possibly having to page in the process and switching to it would add
> > > to the live migration time.  More importantly, having to install an
> > > additional program in the guest is certainly not very convenient.
> > 
> > Sorry I'm still not convinced.  What's there to stop me from suspending
> > my laptop to disk, moving it from port A to port B and resuming it?
> > 
> > Wouldn't I be in exactly the same situation? By the same reasoning we'd
> > be adding a gratuitous ARP routine to every single laptop network driver.
> 
> It is the same situation except that in the laptop case you don't care
> that reconfiguring your network will take a second or a few.  For live
> migration we're looking at network downtime from as low as 60ms to
> something like 210ms on a busy virtual machine.  I'm not saying that
> a userspace solution wouldn't work but it would probably add a measurable
> delay to the network downtime during live migration.

Then make a generic solution.  VMWare supports migration, the Power 
virtualization will get around to it eventually.  All will need something
similar.  So either make a common user-land tool, or (if you insist on
incorrectly driving this into the kernel) add some kind of common hook to
the TCP/IP stack.

Dave Boutcher
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
  2006-05-09 10:05   ` Adrian Bunk
@ 2006-05-09 14:47   ` Daniel Walker
  2006-05-09 15:16     ` Christian Limpach
  2006-05-09 16:42   ` Andi Kleen
  2006-05-10 15:36   ` [Xen-devel] " Alan Cox
  3 siblings, 1 reply; 185+ messages in thread
From: Daniel Walker @ 2006-05-09 14:47 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> plain text document attachment (config-xen)
> The XEN config option is selected from the i386 subarch menu by
> choosing the X86_XEN "Xen-compatible" subarch.
> 
> The XEN_SHADOW_MODE option defines the memory virtualization mode for
> the kernel -- with it enabled, the kernel expects the hypervisor to
> perform translation between pseudo-physical and machine addresses on
> its behalf.
> 
> The disabled config options are:
> - DOUBLEFAULT: are trapped by Xen and not virtualized
> - HZ: defaults to 100 in Xen VMs
> - Power management: not supported in unprivileged VMs
> - SMP: not supported in this set of patches
> - X86_{UP,LOCAL,IO}_APIC: not supported in unprivileged VMs
> 
> Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> ---
>  arch/i386/Kconfig       |   18 ++++++++++++++----
>  arch/i386/Kconfig.debug |    1 +
>  drivers/xen/Kconfig     |   21 +++++++++++++++++++++
>  kernel/Kconfig.hz       |    4 ++--
>  kernel/Kconfig.preempt  |    1 +
>  5 files changed, 39 insertions(+), 6 deletions(-)
> 
> --- linus-2.6.orig/arch/i386/Kconfig
> +++ linus-2.6/arch/i386/Kconfig
> @@ -55,6 +55,7 @@ menu "Processor type and features"
>  
>  config SMP
>  	bool "Symmetric multi-processing support"
> +	depends on !X86_XEN
>  	---help---
>  	  This enables support for systems with more than one CPU. If you have
>  	  a system with only one CPU, like most personal computers, say N. If
> @@ -91,6 +92,12 @@ config X86_PC
>  	help
>  	  Choose this option if your computer is a standard PC or compatible.
>  
> +config X86_XEN
> +	bool "Xen-compatible"
> +	help
> +	  Choose this option if you plan to run this kernel on top of the
> +	  Xen Hypervisor.
> +

Couldn't you just add "depends on !SMP && .." to the config X86_XEN
block ? 

Daniel


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
                   ` (34 preceding siblings ...)
  2006-05-09  7:00 ` [RFC PATCH 35/35] Add Xen virtual block " Chris Wright
@ 2006-05-09 14:49 ` Martin J. Bligh
  2006-05-09 15:07   ` Christoph Hellwig
  35 siblings, 1 reply; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 14:49 UTC (permalink / raw)
  To: Chris Wright; +Cc: linux-kernel, virtualization, xen-devel

Congrats on getting this thrashed out. A few comments, most of which are
boring style nits, but nonetheless ... will try to take a proper look
later.

General comment:

Why is this style used:

HYPERVISOR_foo_bar ?

ie the capitalization of HYPERVISOR. Doesn't seem to fit with the rest
of the kernel style.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09  7:00 ` [RFC PATCH 15/35] subarch support for controlling interrupt delivery Chris Wright
@ 2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 14:55     ` Nick Piggin
  2006-05-09 15:51     ` Christian Limpach
  0 siblings, 2 replies; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 14:49 UTC (permalink / raw)
  To: Chris Wright; +Cc: linux-kernel, virtualization, xen-devel, Ian Pratt

> +/*
> + * The use of 'barrier' in the following reflects their use as local-lock
> + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
> + * critical operations are executed. All critical operations must complete
> + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
> + * includes these barriers, for example.
> + */

Seems like an odd comment to have in an i386 header file.

> +#define __cli()								\
> +do {									\
> +	struct vcpu_info *_vcpu;					\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	_vcpu->evtchn_upcall_mask = 1;					\
> +	preempt_enable_no_resched();					\
> +	barrier();							\
> +} while (0)

Should be a real function

> +#define __sti()								\
> +do {									\
> +	struct vcpu_info *_vcpu;					\
> +	barrier();							\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	_vcpu->evtchn_upcall_mask = 0;					\
> +	barrier(); /* unmask then check (avoid races) */		\
> +	if (unlikely(_vcpu->evtchn_upcall_pending))			\
> +		force_evtchn_callback();				\
> +	preempt_enable();						\
> +} while (0)

Should be a real function

> +#define __save_flags(x)							\
> +do {									\
> +	struct vcpu_info *_vcpu;					\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	(x) = _vcpu->evtchn_upcall_mask;				\
> +	preempt_enable();						\
> +} while (0)

Should be a real function

> +#define __restore_flags(x)						\
> +do {									\
> +	struct vcpu_info *_vcpu;					\
> +	barrier();							\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	if ((_vcpu->evtchn_upcall_mask = (x)) == 0) {			\
> +		barrier(); /* unmask then check (avoid races) */	\
> +		if (unlikely(_vcpu->evtchn_upcall_pending))		\
> +			force_evtchn_callback();			\
> +		preempt_enable();					\
> +	} else								\
> +		preempt_enable_no_resched();				\
> +} while (0)

Should be a real function

> +#define safe_halt()		((void)0)
> +#define halt()			((void)0)
> +
> +#define __save_and_cli(x)						\
> +do {									\
> +	struct vcpu_info *_vcpu;					\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	(x) = _vcpu->evtchn_upcall_mask;				\
> +	_vcpu->evtchn_upcall_mask = 1;					\
> +	preempt_enable_no_resched();					\
> +	barrier();							\
> +} while (0)

Should be a real function

> +#define local_irq_save(x)	__save_and_cli(x)
> +#define local_irq_restore(x)	__restore_flags(x)
> +#define local_save_flags(x)	__save_flags(x)
> +#define local_irq_disable()	__cli()
> +#define local_irq_enable()	__sti()
> +
> +/* Cannot use preempt_enable() here as we would recurse in preempt_sched(). */
> +#define irqs_disabled()							\
> +({	int ___x;							\
> +	struct vcpu_info *_vcpu;					\
> +	preempt_disable();						\
> +	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> +	___x = (_vcpu->evtchn_upcall_mask != 0);			\
> +	preempt_enable_no_resched();					\
> +	___x; })
> +
> +#endif /* __KERNEL__ */

Should be a real function

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
@ 2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 17:44     ` Christian Limpach
  2006-05-15  6:44   ` Pete Zaitcev
  2006-05-17 16:06   ` Pete Zaitcev
  2 siblings, 1 reply; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 14:49 UTC (permalink / raw)
  To: Chris Wright; +Cc: linux-kernel, virtualization, xen-devel, Ian Pratt


> +#define virt_to_ptep(__va)						\
> +({									\
> +	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
> +	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
> +	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
> +	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
> +})

Do we really need yet another function to do this?
Especially one in a multi-line #define instead of a real function call,
and that doesn't seem to error check at each step?

> +
> +#define arbitrary_virt_to_machine(__va)					\
> +({									\
> +	maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
> +	m | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
> +})
> +
> +#define make_lowmem_page_readonly(va, feature) do {		\
> +	pte_t *pte;						\
> +	int rc;							\
> +								\
> +	if (xen_feature(feature))				\
> +		return;						\
> +								\
> +	pte = virt_to_ptep(va);					\
> +	rc = HYPERVISOR_update_va_mapping(			\
> +		(unsigned long)va, pte_wrprotect(*pte), 0);	\
> +	BUG_ON(rc);						\
> +} while (0)

Things this long should definitely not be #defines.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
@ 2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 17:54     ` Christian Limpach
  2006-05-09 15:15   ` Christoph Hellwig
  2006-05-09 16:06   ` Daniel Walker
  2 siblings, 1 reply; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 14:49 UTC (permalink / raw)
  To: Chris Wright; +Cc: linux-kernel, virtualization, xen-devel, Ian Pratt

This clearly doesn't belong:

> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */

????


> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */

???

And the rest of them.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09  7:00 ` [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen Chris Wright
  2006-05-09  7:21   ` Pavel Machek
@ 2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 18:14     ` Christian Limpach
  1 sibling, 1 reply; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 14:49 UTC (permalink / raw)
  To: Chris Wright; +Cc: linux-kernel, virtualization, xen-devel, Ian Pratt

> +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
> +{
> +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
> +	C(0); C(1); C(2);
> +#undef C
> +}

Please just expand this or make it a real function call (static inline),
not a temporary macro ..

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-09 14:55     ` Nick Piggin
  2006-05-09 15:51     ` Christian Limpach
  1 sibling, 0 replies; 185+ messages in thread
From: Nick Piggin @ 2006-05-09 14:55 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

Martin J. Bligh wrote:
>> +/*
>> + * The use of 'barrier' in the following reflects their use as local-lock
>> + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
>> + * critical operations are executed. All critical operations must complete
>> + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
>> + * includes these barriers, for example.
>> + */
> 
> 
> Seems like an odd comment to have in an i386 header file.

Also, it is only talking about compiler barriers, which have nothing
to do with the architecture.

And preempt_* macros should contain the correct compiler barriers, so
several can be removed.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-09 13:26   ` Andi Kleen
@ 2006-05-09 15:03     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 15:03 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, Ian Pratt, xen-devel, linux-kernel

On Tue, May 09, 2006 at 03:26:10PM +0200, Andi Kleen wrote:
> 
> > --- linus-2.6.orig/drivers/char/tty_io.c
> > +++ linus-2.6/drivers/char/tty_io.c
> > @@ -132,6 +132,8 @@ LIST_HEAD(tty_drivers);			/* linked list
> >     vt.c for deeply disgusting hack reasons */
> >  DEFINE_MUTEX(tty_mutex);
> >  
> > +int console_use_vt = 1;
> > +
> 
> Can you explain why this variable is needed? It shouldn't. If you only
> register your console as the primary console nothing else should
> be printed.

It is needed for having a kernel which can run both as a dom0 and domU
kernel.  For dom0, you want to build with CONFIG_VT enabled while for
domU, the CONFIG_VT code doesn't let the xen console be the primary
console unless you specify it on the command line.

I've removed this from the patchqueue since a kernel built from the
patchqueue won't run as a dom0 kernel anyway.

> > +/*** Useful function for console debugging -- goes straight to Xen. ***/
> > +asmlinkage int xprintk(const char *fmt, ...)
> 
> This is called early_printk in the rest of i386. Please use that name too.

We have proper early_printk support but it's not part of the patchqueue.
This is more of a debug function where you would rename the kernel/printk
to something else and rename this to printk.  I've removed it.

     christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 14:49 ` [RFC PATCH 00/35] Xen i386 paravirtualization support Martin J. Bligh
@ 2006-05-09 15:07   ` Christoph Hellwig
  2006-05-09 15:12     ` Martin J. Bligh
  2006-05-09 15:20     ` Andi Kleen
  0 siblings, 2 replies; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 15:07 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Chris Wright, linux-kernel, virtualization, xen-devel

On Tue, May 09, 2006 at 07:49:37AM -0700, Martin J. Bligh wrote:
> Congrats on getting this thrashed out. A few comments, most of which are
> boring style nits, but nonetheless ... will try to take a proper look
> later.
> 
> General comment:
> 
> Why is this style used:
> 
> HYPERVISOR_foo_bar ?
> 
> ie the capitalization of HYPERVISOR. Doesn't seem to fit with the rest
> of the kernel style.

It's also wrong.  There's more than one hypervisor and Xen shouldn't just
grab this namespace.  Make it xen_ or xenhv_.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 15:07   ` Christoph Hellwig
@ 2006-05-09 15:12     ` Martin J. Bligh
  2006-05-09 15:20     ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 15:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chris Wright, linux-kernel, virtualization, xen-devel

Christoph Hellwig wrote:
> On Tue, May 09, 2006 at 07:49:37AM -0700, Martin J. Bligh wrote:
> 
>>Congrats on getting this thrashed out. A few comments, most of which are
>>boring style nits, but nonetheless ... will try to take a proper look
>>later.
>>
>>General comment:
>>
>>Why is this style used:
>>
>>HYPERVISOR_foo_bar ?
>>
>>ie the capitalization of HYPERVISOR. Doesn't seem to fit with the rest
>>of the kernel style.
> 
> 
> It's also wrong.  There's more than one hypervisor and Xen shouldn't just
> grab this namespace.  make it xen_ or xenhv_.

I think the intent was to create something generic enough for others to
use. There were other projects that already intended to use the same
model ... and please, let's not have one stack of this stuff for each
hypervisor. So in general, I think the approach is correct, I was just
whining about style stuff ;-)

M.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen)
  2006-05-09 13:21   ` Andi Kleen
@ 2006-05-09 15:13     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 15:13 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, Ian Pratt, xen-devel, linux-kernel

On Tue, May 09, 2006 at 03:21:19PM +0200, Andi Kleen wrote:
> 
> > +extern void stop_hz_timer(void);
> > +extern void start_hz_timer(void);
> > +
> > +void xen_idle(void);
> > +
> >  static char * __init machine_specific_memory_setup(void)
> >  {
> >  	unsigned long max_pfn = xen_start_info->nr_pages;
> > @@ -65,4 +70,23 @@ static void __init machine_specific_arch
> >  		console_use_vt = 0;
> >  		conswitchp = NULL;
> >  	}
> > +
> > +	pm_idle = xen_idle;
> > +}
> > +
> > +void xen_idle(void)
> 
> I think that should be in some .c file, not a .h
> 
> Probably applies to more of your functions too.

I guess I agree, although I was mostly following the examples in other
mach setup_arch_*.h files.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-09 15:15   ` Christoph Hellwig
  2006-05-09 19:35     ` Hollis Blanchard
  2006-05-09 22:36     ` Ingo Oeser
  2006-05-09 16:06   ` Daniel Walker
  2 siblings, 2 replies; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 15:15 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> ---
>  include/xen/interface/arch-x86_32.h   |  197 +++++++++++++++

that kind of stuff needs to go to asm/

>  include/xen/interface/event_channel.h |  205 +++++++++++++++

instead of interface please use something shorter, we'll see this
all over the includes statements.  intf for example.

> +#ifdef __XEN__
> +#define __DEFINE_GUEST_HANDLE(name, type) \
> +    typedef struct { type *p; } __guest_handle_ ## name
> +#else
> +#define __DEFINE_GUEST_HANDLE(name, type) \
> +    typedef type * __guest_handle_ ## name
> +#endif

please get rid of all these stupid typedefs

> +#ifndef __ASSEMBLY__
> +/* Guest handles for primitive C types. */
> +__DEFINE_GUEST_HANDLE(uchar, unsigned char);
> +__DEFINE_GUEST_HANDLE(uint,  unsigned int);
> +__DEFINE_GUEST_HANDLE(ulong, unsigned long);

don't use uchar/uint/ulong types ever.  And in things like
hypervisor/kernel interfaces always use __u* types.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 14:47   ` Daniel Walker
@ 2006-05-09 15:16     ` Christian Limpach
  2006-05-09 16:00       ` Daniel Walker
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 15:16 UTC (permalink / raw)
  To: Daniel Walker
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, May 09, 2006 at 07:47:12AM -0700, Daniel Walker wrote:
> On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> > The disabled config options are:
> > - DOUBLEFAULT: are trapped by Xen and not virtualized
> > - HZ: defaults to 100 in Xen VMs
> > - Power management: not supported in unprivileged VMs
> > - SMP: not supported in this set of patches
> > - X86_{UP,LOCAL,IO}_APIC: not supported in unprivileged VMs
> > 
> > +++ linus-2.6/arch/i386/Kconfig
> > @@ -55,6 +55,7 @@ menu "Processor type and features"
> >  
> >  config SMP
> >  	bool "Symmetric multi-processing support"
> > +	depends on !X86_XEN
> >  	---help---
> >  	  This enables support for systems with more than one CPU. If you have
> >  	  a system with only one CPU, like most personal computers, say N. If
> > @@ -91,6 +92,12 @@ config X86_PC
> >  	help
> >  	  Choose this option if your computer is a standard PC or compatible.
> >  
> > +config X86_XEN
> > +	bool "Xen-compatible"
> > +	help
> > +	  Choose this option if you plan to run this kernel on top of the
> > +	  Xen Hypervisor.
> > +
> 
> Couldn't you just add "depends on !SMP && .." to the config X86_XEN
> block ? 

I guess you could, but it would make it rather non-obvious and tedious
to enable X86_XEN then, wouldn't it?

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 15:07   ` Christoph Hellwig
  2006-05-09 15:12     ` Martin J. Bligh
@ 2006-05-09 15:20     ` Andi Kleen
  2006-05-09 15:22       ` Christoph Hellwig
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 15:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chris Wright, linux-kernel, virtualization, xen-devel

Christoph Hellwig <hch@infradead.org> writes:

> On Tue, May 09, 2006 at 07:49:37AM -0700, Martin J. Bligh wrote:
> > Congrats on getting this thrashed out. A few comments, most of which are
> > boring style nits, but nonetheless ... will try to take a proper look
> > later.
> > 
> > General comment:
> > 
> > Why is this style used:
> > 
> > HYPERVISOR_foo_bar ?
> > 
> > ie the capitalization of HYPERVISOR. Doesn't seem to fit with the rest
> > of the kernel style.
> 
> It's also wrong.  There's more than one hypervisor and Xen shouldn't just
> grab this namespace.  make it xen_ or xenhv_.

You should reject the recent "hypervisor file system" with the same
argument then.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 15:20     ` Andi Kleen
@ 2006-05-09 15:22       ` Christoph Hellwig
  2006-05-09 15:45         ` Pekka Enberg
  2006-05-14  1:35         ` Andrew Morton
  0 siblings, 2 replies; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 15:22 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Christoph Hellwig, Chris Wright, linux-kernel, virtualization, xen-devel

On Tue, May 09, 2006 at 05:20:11PM +0200, Andi Kleen wrote:
> > It's also wrong.  There's more than one hypervisor and Xen shouldn't just
> > grab this namespace.  make it xen_ or xenhv_.
> 
> You should reject the recent "hypervisor file system" with the same
> argument then.

I prefer it would become lparfs or something like that indeed.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 15:22       ` Christoph Hellwig
@ 2006-05-09 15:45         ` Pekka Enberg
  2006-05-14  1:35         ` Andrew Morton
  1 sibling, 0 replies; 185+ messages in thread
From: Pekka Enberg @ 2006-05-09 15:45 UTC (permalink / raw)
  To: Christoph Hellwig, Andi Kleen, Chris Wright, linux-kernel,
	virtualization, xen-devel

On Tue, May 09, 2006 at 05:20:11PM +0200, Andi Kleen wrote:
> > > It's also wrong.  There's more than one hypervisor and Xen shouldn't just
> > > grab this namespace.  make it xen_ or xenhv_.
> >
> > You should reject the recent "hypervisor file system" with the same
> > argument then.

On 5/9/06, Christoph Hellwig <hch@infradead.org> wrote:
> I prefer it would become lparfs or something like that indeed.

AFAIK it's called s390-hypfs now.

                                      Pekka

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 14:55     ` Nick Piggin
@ 2006-05-09 15:51     ` Christian Limpach
  2006-05-09 16:02       ` Martin J. Bligh
  2006-05-09 16:07       ` Andi Kleen
  1 sibling, 2 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 15:51 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 07:49:42AM -0700, Martin J. Bligh wrote:
> >+#define __cli()							 \
> >+do {									\
> >+	struct vcpu_info *_vcpu;					\
> >+	preempt_disable();						\
> >+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
> >+	_vcpu->evtchn_upcall_mask = 1;					\
> >+	preempt_enable_no_resched();					\
> >+	barrier();							\
> >+} while (0)
> 
> Should be a real function

Yes, except it's not trivially done because if __cli was an inline
function, you need to have everything that is used in the declaration
defined when the function is declared as opposed to when the #define
gets used.  I'll give it another try, but it very quickly becomes
#include hell.

Anybody want to comment on the performance impact of making
local_irq_* non-inline functions?

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 15:16     ` Christian Limpach
@ 2006-05-09 16:00       ` Daniel Walker
  2006-05-09 23:25         ` Chris Wright
  0 siblings, 1 reply; 185+ messages in thread
From: Daniel Walker @ 2006-05-09 16:00 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, 2006-05-09 at 16:16 +0100, Christian Limpach wrote:
> > >  
> > > +config X86_XEN
> > > +	bool "Xen-compatible"
> > > +	help
> > > +	  Choose this option if you plan to run this kernel on top of the
> > > +	  Xen Hypervisor.
> > > +
> > 
> > Couldn't you just add "depends on !SMP && .." to the config X86_XEN
> > block ? 
> 
> I guess you could, but it would make it rather non-obvious and tedious
> to enable X86_XEN then, wouldn't it?

I guess that's true .. Might be better just to support SMP then ..

Daniel


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 15:51     ` Christian Limpach
@ 2006-05-09 16:02       ` Martin J. Bligh
  2006-05-09 16:07       ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Martin J. Bligh @ 2006-05-09 16:02 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

Christian Limpach wrote:
> On Tue, May 09, 2006 at 07:49:42AM -0700, Martin J. Bligh wrote:
> 
>>>+#define __cli()							 \
>>>+do {									\
>>>+	struct vcpu_info *_vcpu;					\
>>>+	preempt_disable();						\
>>>+	_vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id];		\
>>>+	_vcpu->evtchn_upcall_mask = 1;					\
>>>+	preempt_enable_no_resched();					\
>>>+	barrier();							\
>>>+} while (0)
>>
>>Should be a real function
> 
> 
> Yes, except it's not trivially done because if __cli was an inline
> function, you need to have everything that is used in the declaration
> defined when the function is declared as opposed to when the #define
> gets used.  I'll give it another try, but it very quickly becomes
> #include hell.
> 
> Anybody want to comment on the performance impact of making
> local_irq_* non-inline functions?

I wasn't concerned with inline vs non-inline - that's your choice.
Just the inherent foulness of multi-line macros ;-)

M.



^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
  2006-05-09 15:15   ` Christoph Hellwig
@ 2006-05-09 16:06   ` Daniel Walker
  2006-05-09 16:18     ` Christian Limpach
  2 siblings, 1 reply; 185+ messages in thread
From: Daniel Walker @ 2006-05-09 16:06 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> plain text document attachment (xen-interface-headers)
> Add Xen interface header files. These are taken fairly directly from
> the Xen tree and hence the style is not entirely in accordance with
> Linux guidelines. There is a tension between fitting with Linux coding
> rules and ease of maintenance.
> 
> Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
> Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> ---
>  include/xen/interface/arch-x86_32.h   |  197 +++++++++++++++
>  include/xen/interface/event_channel.h |  205 +++++++++++++++
>  include/xen/interface/features.h      |   53 ++++
>  include/xen/interface/grant_table.h   |  311 +++++++++++++++++++++++
>  include/xen/interface/io/blkif.h      |   85 ++++++


Shouldn't these be under asm-i386, or are they used by other
architectures?

Daniel


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
@ 2006-05-09 16:06   ` Alexey Dobriyan
  2006-05-09 16:28     ` Andi Kleen
  2006-05-09 19:40   ` Greg KH
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 185+ messages in thread
From: Alexey Dobriyan @ 2006-05-09 16:06 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

> +/* Simplified asprintf. */
> +char *kasprintf(const char *fmt, ...)
> +{
> +	va_list ap;
> +	unsigned int len;
> +	char *p, dummy[1];
> +
> +	va_start(ap, fmt);
> +	/* FIXME: vsnprintf has a bug, NULL should work */
> +	len = vsnprintf(dummy, 0, fmt, ap);
> +	va_end(ap);
> +
> +	p = kmalloc(len + 1, GFP_KERNEL);
> +	if (!p)
> +		return NULL;
> +	va_start(ap, fmt);
> +	vsprintf(p, fmt, ap);
> +	va_end(ap);
> +	return p;
> +}

This should go to lib/


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 15:51     ` Christian Limpach
  2006-05-09 16:02       ` Martin J. Bligh
@ 2006-05-09 16:07       ` Andi Kleen
  2006-05-09 16:29         ` Christian Limpach
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:07 UTC (permalink / raw)
  To: virtualization
  Cc: Christian Limpach, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt


> 
> Anybody want to comment on the performance impact of making
> local_irq_* non-inline functions?

I would guess that for that much inline code it will even be a win not to
inline, because it will save icache.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 16:06   ` Daniel Walker
@ 2006-05-09 16:18     ` Christian Limpach
  2006-05-09 16:29       ` Daniel Walker
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 16:18 UTC (permalink / raw)
  To: Daniel Walker
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, May 09, 2006 at 09:06:12AM -0700, Daniel Walker wrote:
> On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> > plain text document attachment (xen-interface-headers)
> > Add Xen interface header files. These are taken fairly directly from
> > the Xen tree and hence the style is not entirely in accordance with
> > Linux guidelines. There is a tension between fitting with Linux coding
> > rules and ease of maintenance.
> > 
> > Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
> > Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> > Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> > ---
> >  include/xen/interface/arch-x86_32.h   |  197 +++++++++++++++
> >  include/xen/interface/event_channel.h |  205 +++++++++++++++
> >  include/xen/interface/features.h      |   53 ++++
> >  include/xen/interface/grant_table.h   |  311 +++++++++++++++++++++++
> >  include/xen/interface/io/blkif.h      |   85 ++++++
> 
> Shouldn't these be under asm-i386 , or are they used by other
> architecture ? 

The full set of interface headers supports several architectures.

I think having all the header files in one place is preferable,
but will gladly move them wherever is agreed on to be best ;-)

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
@ 2006-05-09 16:23   ` Daniel Walker
  2006-05-09 16:38     ` Christian Limpach
  2006-05-09 21:50   ` Andi Kleen
  2006-05-12 21:44   ` Pavel Machek
  2 siblings, 1 reply; 185+ messages in thread
From: Daniel Walker @ 2006-05-09 16:23 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:

> +timers-y			:= timers/
> +timers-$(CONFIG_XEN)		:=


Is this line supposed to be empty?

Daniel


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 16:06   ` Alexey Dobriyan
@ 2006-05-09 16:28     ` Andi Kleen
  0 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:28 UTC (permalink / raw)
  To: virtualization
  Cc: Alexey Dobriyan, Chris Wright, xen-devel, linux-kernel, Ian Pratt

On Tuesday 09 May 2006 18:06, Alexey Dobriyan wrote:
> > +/* Simplified asprintf. */
> > +char *kasprintf(const char *fmt, ...)
> > +{
> > +	va_list ap;
> > +	unsigned int len;
> > +	char *p, dummy[1];
> > +
> > +	va_start(ap, fmt);
> > +	/* FIXME: vsnprintf has a bug, NULL should work */
> > +	len = vsnprintf(dummy, 0, fmt, ap);
> > +	va_end(ap);
> > +
> > +	p = kmalloc(len + 1, GFP_KERNEL);
> > +	if (!p)
> > +		return NULL;
> > +	va_start(ap, fmt);
> > +	vsprintf(p, fmt, ap);
> > +	va_end(ap);
> > +	return p;
> > +}
> 
> This should go to lib/

First for kernel usage I think it should have a maximum length parameter
to avoid dumb code from being easily exploited.

And the bug should be fixed in vsnprintf instead of being worked
around.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 16:18     ` Christian Limpach
@ 2006-05-09 16:29       ` Daniel Walker
  0 siblings, 0 replies; 185+ messages in thread
From: Daniel Walker @ 2006-05-09 16:29 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, 2006-05-09 at 17:18 +0100, Christian Limpach wrote:

> 
> The full set of interface headers supports several architectures.
> 
> I think having all the header files in one place is preferable,
> but will gladly move them wherever is agreed on to be best ;-)

I'd say include/linux/xen/ would be a better choice, if it's multi
architecture ..

Daniel


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 16:07       ` Andi Kleen
@ 2006-05-09 16:29         ` Christian Limpach
  2006-05-09 16:31           ` Andi Kleen
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 16:29 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 06:07:57PM +0200, Andi Kleen wrote:
> 
> > 
> > Anybody want to comment on the performance impact of making
> > local_irq_* non-inline functions?
> 
> I would guess for that much inline code it will be even a win to not
> inline because it will save icache.

Maybe, although some of the macros compile down to only 2-3 instructions.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 16:29         ` Christian Limpach
@ 2006-05-09 16:31           ` Andi Kleen
  2006-05-09 20:42             ` Christian Limpach
  0 siblings, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:31 UTC (permalink / raw)
  To: Christian Limpach
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

On Tuesday 09 May 2006 18:29, Christian Limpach wrote:
> On Tue, May 09, 2006 at 06:07:57PM +0200, Andi Kleen wrote:
> > 
> > > 
> > > Anybody want to comment on the performance impact of making
> > > local_irq_* non-inline functions?
> > 
> > I would guess for that much inline code it will be even a win to not
> > inline because it will save icache.
> 
> Maybe, although some of the macros compile down to only 2-3 instructions.

Can you post before/after vmlinux size numbers for inline/out of line?

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09 16:23   ` Daniel Walker
@ 2006-05-09 16:38     ` Christian Limpach
  2006-05-09 19:27       ` Adrian Bunk
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 16:38 UTC (permalink / raw)
  To: Daniel Walker
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Tue, May 09, 2006 at 09:23:08AM -0700, Daniel Walker wrote:
> On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> 
> > +timers-y			:= timers/
> > +timers-$(CONFIG_XEN)		:=
> 
> 
> Is this line suppose to be empty ?

Yes.  We have our own version of time.c which doesn't use any of the
timer code in timers but works for both i386 and x86_64 instead.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
  2006-05-09 10:05   ` Adrian Bunk
  2006-05-09 14:47   ` Daniel Walker
@ 2006-05-09 16:42   ` Andi Kleen
  2006-05-10 15:36   ` [Xen-devel] " Alan Cox
  3 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:42 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt

On Tuesday 09 May 2006 09:00, Chris Wright wrote:
> The XEN config option is selected from the i386 subarch menu by
> choosing the X86_XEN "Xen-compatible" subarch.

I really dislike all these negative option checks.

I think it would be better if you defined a positive symbol like
BARE_METAL (better name?) and defined that only for the non-XEN case.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
  2006-05-09  7:16   ` Pavel Machek
@ 2006-05-09 16:44   ` Andi Kleen
  2006-05-18 20:20   ` Zachary Amsden
  2 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:44 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt

 			/* fall through */
> --- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
> +++ linus-2.6/include/asm-i386/mach-default/mach_system.h
> @@ -1,6 +1,8 @@
>  #ifndef __ASM_MACH_SYSTEM_H
>  #define __ASM_MACH_SYSTEM_H
>  
> +#define clearsegment(seg)

I don't think you can give such a specific hook such a generic name.

I would just add an ifdef around the real user with a comment.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 11/35] Add support for Xen to entry.S.
  2006-05-09  7:00 ` [RFC PATCH 11/35] Add support for Xen to entry.S Chris Wright
@ 2006-05-09 16:51   ` Andi Kleen
  0 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 16:51 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt

On Tuesday 09 May 2006 09:00, Chris Wright wrote:

> +#define sizeof_vcpu_shift		6

This should be generated in asm-offsets.c

> +
> +#ifdef CONFIG_SMP
> +#define GET_VCPU_INFO		movl TI_cpu(%ebp),%esi			; \
> +				shl  $sizeof_vcpu_shift,%esi		; \
> +				addl HYPERVISOR_shared_info,%esi

I think you need some comments on the register usage in the macros.
Otherwise people hacking on it later will go crazy.

>  restore_all:
> +#ifndef CONFIG_XEN
>  	movl EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
>  	# Warning: OLDSS(%esp) contains the wrong/random values if we
>  	# are returning to the kernel.
> @@ -258,12 +289,32 @@ restore_all:
>  	cmpl $((4 << 8) | 3), %eax
>  	je ldt_ss			# returning to user-space with LDT SS
>  restore_nocheck:
> +#else

Needs comment

> +restore_nocheck:
> +	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
> +	movb CS(%esp), %al
> +	andl $(VM_MASK | 3), %eax
> +	cmpl $3, %eax
> +	jne hypervisor_iret
> +	ENABLE_INTERRUPTS
>

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 26/35] Add Xen subarch reboot support
  2006-05-09  7:00 ` [RFC PATCH 26/35] Add Xen subarch reboot support Chris Wright
@ 2006-05-09 17:02   ` Andi Kleen
  2006-05-12 21:46     ` Pavel Machek
  0 siblings, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 17:02 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt


> +++ linus-2.6/drivers/xen/core/reboot.c
> @@ -0,0 +1,232 @@
> +#define __KERNEL_SYSCALLS__
> +#include <linux/version.h>
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/unistd.h>
> +#include <linux/module.h>
> +#include <linux/reboot.h>
> +#include <linux/sysrq.h>
> +#include <linux/stringify.h>
> +#include <linux/syscalls.h>
> +#include <linux/cpu.h>
> +#include <linux/kthread.h>

Do you really need all these includes?

> +#if defined(__i386__) || defined(__x86_64__)

aka CONFIG_X86

> +/*
> + * Power off function, if any
> + */
> +void (*pm_power_off)(void);
> +EXPORT_SYMBOL(pm_power_off);
> +#endif
> +
> +extern void ctrl_alt_del(void);


That should be in some header

> +
> +/* Ignore multiple shutdown requests. */
> +static int shutting_down = SHUTDOWN_INVALID;
> +static void __shutdown_handler(void *unused);
> +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
> +
> +static int shutdown_process(void *__unused)
> +{
> +	static char *envp[] = { "HOME=/", "TERM=linux",
> +				"PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
> +	static char *poweroff_argv[] = { "/sbin/poweroff", NULL };

This should be configurable, probably in a sysctl

> +
> +	if ((shutting_down == SHUTDOWN_POWEROFF) ||
> +	    (shutting_down == SHUTDOWN_HALT)) {
> +		if (execve(poweroff_argv[0], poweroff_argv, envp) < 0) {
> +			sys_reboot(LINUX_REBOOT_MAGIC1,
> +				   LINUX_REBOOT_MAGIC2,
> +				   LINUX_REBOOT_CMD_POWER_OFF,
> +				   NULL);
> +		}
> +	}
> +
> +	shutting_down = SHUTDOWN_INVALID; /* could try again */
> +
> +	return 0;
> +}
> +
> +static void __shutdown_handler(void *unused)
> +{
> +	int err = 0;
> +
> +	if (shutting_down != SHUTDOWN_SUSPEND)


The whole shutting_down handling looks racy. Probably needs some locking?

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-09 17:44     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 17:44 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 07:49:45AM -0700, Martin J. Bligh wrote:
> 
> >+#define virt_to_ptep(__va)						\
> >+({									\
> >+	pgd_t *__pgd = pgd_offset_k((unsigned long)(__va));		\
> >+	pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va));	\
> >+	pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va));	\
> >+	pte_offset_kernel(__pmd, (unsigned long)(__va));		\
> >+})
> 
> Do we really need yet another function to do this?
> Especially one in a mult-line #define instead of a real function call,
> and that doesn't seem to error check at each step?

Indeed, I'll use lookup_address instead.

> >+
> >+#define arbitrary_virt_to_machine(__va)				 \
> >+({									\
> >+	maddr_t m = (maddr_t)pte_mfn(*virt_to_ptep(__va)) << PAGE_SHIFT;\
> >+	m | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
> >+})
> >+
> >+#define make_lowmem_page_readonly(va, feature) do {		\
> >+	pte_t *pte;						\
> >+	int rc;							\
> >+								\
> >+	if (xen_feature(feature))				\
> >+		return;						\
> >+								\
> >+	pte = virt_to_ptep(va);					\
> >+	rc = HYPERVISOR_update_va_mapping(			\
> >+		(unsigned long)va, pte_wrprotect(*pte), 0);	\
> >+	BUG_ON(rc);						\
> >+} while (0)
> 
> Things this long should definitely not be #defines.

I've changed these to be functions and moved them into a .c file
under arch/i386/mach-xen.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-09 17:54     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 17:54 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 07:49:49AM -0700, Martin J. Bligh wrote:
> This clearly doesn't belong:
> 
> >+/*
> >+ * Local variables:
> >+ * mode: C
> >+ * c-set-style: "BSD"
> >+ * c-basic-offset: 4
> >+ * tab-width: 4
> >+ * indent-tabs-mode: nil
> >+ * End:
> >+ */
> 
> ????

Removed.  Thanks.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-09 18:14     ` Christian Limpach
  2006-05-09 18:21       ` Martin Bligh
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 18:14 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 07:49:54AM -0700, Martin J. Bligh wrote:
> >+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
> >+{
> >+#define C(i) 
> >HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
> >+	C(0); C(1); C(2);
> >+#undef C
> >+}
> 
> Please just expand this or make it a real function call (static inline),
> not a temporary macro ..

Yes, I've added an inline function to do a single descriptor.

Should I change the non-xen case as well?  It was the inspiration
for this code ;-)

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09 18:14     ` Christian Limpach
@ 2006-05-09 18:21       ` Martin Bligh
  0 siblings, 0 replies; 185+ messages in thread
From: Martin Bligh @ 2006-05-09 18:21 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

Christian Limpach wrote:
> On Tue, May 09, 2006 at 07:49:54AM -0700, Martin J. Bligh wrote:
> 
>>>+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
>>>+{
>>>+#define C(i) 
>>>HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i])
>>>+	C(0); C(1); C(2);
>>>+#undef C
>>>+}
>>
>>Please just expand this or make it a real function call (static inline),
>>not a temporary macro ..
> 
> 
> Yes, I've added an inline function to do a single descriptor.
> 
> Should I change the non-xen case as well?  It was the inspiration
> for this code ;-)

If it looks anything like that, then I'd vote yes ;-)

M.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
  2006-05-09 11:55   ` [Xen-devel] " Herbert Xu
  2006-05-09 11:58   ` Christoph Hellwig
@ 2006-05-09 18:56   ` Stephen Hemminger
  2006-05-09 23:39     ` Chris Wright
  2006-05-09 20:25   ` Stephen Hemminger
  2006-05-09 22:41   ` [Xen-devel] " Herbert Xu
  4 siblings, 1 reply; 185+ messages in thread
From: Stephen Hemminger @ 2006-05-09 18:56 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, netdev

The stuff in /proc could easily just be added attributes to the class_device kobject
of the net device (and then show up in sysfs).


> +
> +#define GRANT_INVALID_REF	0
> +
> +#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
> +#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
> +
> +static inline void init_skb_shinfo(struct sk_buff *skb)
> +{
> +	atomic_set(&(skb_shinfo(skb)->dataref), 1);
> +	skb_shinfo(skb)->nr_frags = 0;
> +	skb_shinfo(skb)->frag_list = NULL;
> +}
> +

Could you use existing sk_buff_head instead of inventing your
own skb queue?

> +struct netfront_info
> +{
> +	struct list_head list;
> +	struct net_device *netdev;
> +
> +	struct net_device_stats stats;
> +	unsigned int tx_full;
> +
> +	struct netif_tx_front_ring tx;
> +	struct netif_rx_front_ring rx;
> +
> +	spinlock_t   tx_lock;
> +	spinlock_t   rx_lock;
> +
> +	unsigned int handle;
> +	unsigned int evtchn, irq;
> +
> +	/* What is the status of our connection to the remote backend? */
> +#define BEST_CLOSED       0
> +#define BEST_DISCONNECTED 1
> +#define BEST_CONNECTED    2
> +	unsigned int backend_state;
> +
> +	/* Is this interface open or closed (down or up)? */
> +#define UST_CLOSED        0
> +#define UST_OPEN          1
> +	unsigned int user_state;
> +
> +	/* Receive-ring batched refills. */
> +#define RX_MIN_TARGET 8
> +#define RX_DFL_MIN_TARGET 64
> +#define RX_MAX_TARGET NET_RX_RING_SIZE
> +	int rx_min_target, rx_max_target, rx_target;
> +	struct sk_buff_head rx_batch;
> +
> +	struct timer_list rx_refill_timer;
> +
> +	/*
> +	 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
> +	 * array is an index into a chain of free entries.
> +	 */
> +	struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
> +	struct sk_buff *rx_skbs[NET_RX_RING_SIZE+1];
> +
> +	grant_ref_t gref_tx_head;
> +	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
> +	grant_ref_t gref_rx_head;
> +	grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1];
> +
> +	struct xenbus_device *xbdev;
> +	int tx_ring_ref;
> +	int rx_ring_ref;
> +	u8 mac[ETH_ALEN];

Isn't mac address already stored in dev->dev_addr and/or dev->perm_addr?


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09 16:38     ` Christian Limpach
@ 2006-05-09 19:27       ` Adrian Bunk
  0 siblings, 0 replies; 185+ messages in thread
From: Adrian Bunk @ 2006-05-09 19:27 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Daniel Walker, Chris Wright, linux-kernel, virtualization,
	xen-devel, Ian Pratt

On Tue, May 09, 2006 at 05:38:14PM +0100, Christian Limpach wrote:
> On Tue, May 09, 2006 at 09:23:08AM -0700, Daniel Walker wrote:
> > On Tue, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> > 
> > > +timers-y			:= timers/
> > > +timers-$(CONFIG_XEN)		:=
> > 
> > 
> > Is this line suppose to be empty ?
> 
> Yes.  We have our own version of time.c which doesn't use any of the
> timer code in timers but works for both i386 and x86_64 instead.

Please add a comment, at first sight it's not even obvious that this 
line does anything (+= and := are easy to confuse).

>     christian

cu
Adrian

-- 

       "Is there not promise of rain?" Ling Tan asked suddenly out
        of the darkness. There had been need of rain for many days.
       "Only a promise," Lao Er said.
                                       Pearl S. Buck - Dragon Seed


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 15:15   ` Christoph Hellwig
@ 2006-05-09 19:35     ` Hollis Blanchard
  2006-05-09 19:48       ` [Xen-devel] " Anthony Liguori
  2006-05-09 22:34       ` Christoph Hellwig
  2006-05-09 22:36     ` Ingo Oeser
  1 sibling, 2 replies; 185+ messages in thread
From: Hollis Blanchard @ 2006-05-09 19:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

On Tue, 2006-05-09 at 16:15 +0100, Christoph Hellwig wrote:
> 
> > +#ifdef __XEN__
> > +#define __DEFINE_GUEST_HANDLE(name, type) \
> > +    typedef struct { type *p; } __guest_handle_ ## name
> > +#else
> > +#define __DEFINE_GUEST_HANDLE(name, type) \
> > +    typedef type * __guest_handle_ ## name
> > +#endif
> 
> please get rid of all these stupid typedefs 

These typedefs are a new hack to work around a basic interface problem:
instead of explicitly-sized types, Xen uses longs and pointers in its
interface. On PowerPC in particular, where we need a 32-bit userland
communicating with a 64-bit hypervisor, those types don't work.

However, the maintainers are reluctant to switch the interface to use
explicitly-sized types because it would break binary compatibility.
These ugly "HANDLE" macros allow PowerPC to do what we need without
affecting binary compatibility on x86.

-- 
Hollis Blanchard
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
  2006-05-09 16:06   ` Alexey Dobriyan
@ 2006-05-09 19:40   ` Greg KH
  2006-05-09 21:53     ` Chris Wright
  2006-05-09 19:49   ` Greg KH
  2006-05-13 12:28   ` Andrew Morton
  3 siblings, 1 reply; 185+ messages in thread
From: Greg KH @ 2006-05-09 19:40 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, May 09, 2006 at 12:00:33AM -0700, Chris Wright wrote:
> +/* xenbus_probe.c */
> +extern char *kasprintf(const char *fmt, ...);

Belongs in a .h file.

> +#define DPRINTK(fmt, args...) \
> +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)

Please use the dev_dbg() function instead of DPRINTK() or pr_debug().
It's much better and uniquely identifies the driver and device that you
are referring to.

Also, all of the printk() calls in these files should be switched to
dev_err() or dev_warn() for the same reason.

> +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
> +		      struct xenbus_watch *watch,
> +		      void (*callback)(struct xenbus_watch *,
> +				       const char **, unsigned int))
> +{
> +	int err;
> +
> +	watch->node = path;
> +	watch->callback = callback;
> +
> +	err = register_xenbus_watch(watch);
> +
> +	if (err) {
> +		watch->node = NULL;
> +		watch->callback = NULL;
> +		xenbus_dev_fatal(dev, err, "adding watch on %s", path);
> +	}
> +
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(xenbus_watch_path);
> +
> +
> +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
> +		       const char *path2, struct xenbus_watch *watch,
> +		       void (*callback)(struct xenbus_watch *,
> +					const char **, unsigned int))
> +{
> +	int err;
> +	char *state = kasprintf("%s/%s", path, path2);
> +	if (!state) {
> +		xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
> +		return -ENOMEM;
> +	}
> +	err = xenbus_watch_path(dev, state, watch, callback);
> +
> +	if (err)
> +		kfree(state);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
> +
> +
> +int xenbus_switch_state(struct xenbus_device *dev,
> +			xenbus_transaction_t xbt,
> +			XenbusState state)

I'm guessing that XenbusState is a typedef?  Please fix the naming to be
Linux kernel compatible.

> +{
> +	/* We check whether the state is currently set to the given value, and
> +	   if not, then the state is set.  We don't want to unconditionally
> +	   write the given state, because we don't want to fire watches
> +	   unnecessarily.  Furthermore, if the node has gone, we don't write
> +	   to it, as the device will be tearing down, and we don't want to
> +	   resurrect that directory.
> +	 */
> +
> +	int current_state;
> +	int err;
> +
> +	if (state == dev->state)
> +		return 0;
> +
> +	err = xenbus_scanf(xbt, dev->nodename, "state", "%d",
> +			       &current_state);
> +	if (err != 1)
> +		return 0;
> +
> +	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
> +	if (err) {
> +		if (state != XenbusStateClosing) /* Avoid looping */
> +			xenbus_dev_fatal(dev, err, "writing new state");
> +		return err;
> +	}
> +
> +	dev->state = state;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(xenbus_switch_state);
> +
> +
> +/**
> + * Return the path to the error node for the given device, or NULL on failure.
> + * If the value returned is non-NULL, then it is the caller's to kfree.
> + */
> +static char *error_path(struct xenbus_device *dev)
> +{
> +	return kasprintf("error/%s", dev->nodename);
> +}
> +
> +
> +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
> +		va_list ap)

Global function?  With no description of what it does?  (hint,
describing it in the .h file, in pseudo-kerneldoc form doesn't really
count, it only makes the tools break...)

> +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
> +		      ...)

No kerneldoc for all of the global functions?

> +extern void xenbus_probe(void *);
> +extern int xenstored_ready;

Should be in a .h file.

> +#include <asm/io.h>
> +#include <asm/page.h>
> +#include <asm/pgtable.h>
> +#include <asm/hypervisor.h>
> +#include <xen/xenbus.h>
> +#ifdef XEN_XENBUS_PROC_INTERFACE
> +#include <xen/xen_proc.h>
> +#endif

#ifdef is not needed.  Put it in the .h file.

> +#include <xen/evtchn.h>
> +
> +#include "xenbus_comms.h"
> +
> +extern struct mutex xenwatch_mutex;

Should be in a .h file.

> +struct xen_bus_type
> +{
> +	char *root;
> +	unsigned int levels;
> +	int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
> +	int (*probe)(const char *type, const char *dir);
> +	struct bus_type bus;
> +	struct device dev;
> +};

Why have you embedded both a struct bus_type and a struct device into
this structure?  How is the lifecycle handled due to 2 different
reference counted structures?

Also, I note that you statically create this, which isn't the nicest,
why not dynamically create it?

And why a different probe function per bus type?

And the bus id is part of the struct bus_type, why a separate function
to retrieve it?

And what are you doing with the different "levels"?  Is there some
description of how you are using sysfs for this?  Busses should not be
"nested", devices should.  How does sysfs look with this code in it?
What is the /sys/bus/ structure?  What is the /sys/devices/ structure?

> +/* device/<type>/<id> => <type>-<id> */
> +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
> +{
> +	nodename = strchr(nodename, '/');
> +	if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
> +		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
> +		return -EINVAL;
> +	}
> +
> +	strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
> +	if (!strchr(bus_id, '/')) {
> +		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
> +		return -EINVAL;
> +	}
> +	*strchr(bus_id, '/') = '-';
> +	return 0;
> +}

And why all the string logic for device ids and names?  Is that the only
unique way to identify the different devices on your bus?  Why not just
give them a numerical id then?  It would save you a lot of
computation...

> +/* Bus type for frontend drivers. */
> +static int xenbus_probe_frontend(const char *type, const char *name);
> +static struct xen_bus_type xenbus_frontend = {
> +	.root = "device",
> +	.levels = 2, 		/* device/type/<id> */
> +	.get_bus_id = frontend_bus_id,
> +	.probe = xenbus_probe_frontend,
> +	.bus = {
> +		.name  = "xen",
> +		.match = xenbus_match,
> +	},
> +	.dev = {
> +		.bus_id = "xen",
> +	},
> +};
> +
> +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
> +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
> +{
> +	int domid, err;
> +	const char *devid, *type, *frontend;
> +	unsigned int typelen;
> +
> +	type = strchr(nodename, '/');
> +	if (!type)
> +		return -EINVAL;
> +	type++;
> +	typelen = strcspn(type, "/");
> +	if (!typelen || type[typelen] != '/')
> +		return -EINVAL;
> +
> +	devid = strrchr(nodename, '/') + 1;
> +
> +	err = xenbus_gather(XBT_NULL, nodename, "frontend-id", "%i", &domid,
> +			    "frontend", NULL, &frontend,
> +			    NULL);
> +	if (err)
> +		return err;
> +	if (strlen(frontend) == 0)
> +		err = -ERANGE;
> +	if (!err && !xenbus_exists(XBT_NULL, frontend, ""))
> +		err = -ENOENT;
> +
> +	kfree(frontend);
> +
> +	if (err)
> +		return err;
> +
> +	if (snprintf(bus_id, BUS_ID_SIZE,
> +		     "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
> +		return -ENOSPC;
> +	return 0;
> +}
> +
> +static int xenbus_uevent_backend(struct device *dev, char **envp,
> +				 int num_envp, char *buffer, int buffer_size);
> +static int xenbus_probe_backend(const char *type, const char *domid);
> +static struct xen_bus_type xenbus_backend = {
> +	.root = "backend",
> +	.levels = 3, 		/* backend/type/<frontend>/<id> */
> +	.get_bus_id = backend_bus_id,
> +	.probe = xenbus_probe_backend,
> +	.bus = {
> +		.name  = "xen-backend",
> +		.match = xenbus_match,
> +		.uevent = xenbus_uevent_backend,
> +	},
> +	.dev = {
> +		.bus_id = "xen-backend",
> +	},
> +};

What is the "frontend/backend" relationship here?

> +static int xenbus_uevent_backend(struct device *dev, char **envp,
> +				 int num_envp, char *buffer, int buffer_size)
> +{
> +	struct xenbus_device *xdev;
> +	struct xenbus_driver *drv;
> +	int i = 0;
> +	int length = 0;
> +
> +	DPRINTK("");
> +
> +	if (dev == NULL)
> +		return -ENODEV;
> +
> +	xdev = to_xenbus_device(dev);
> +	if (xdev == NULL)
> +		return -ENODEV;
> +
> +	/* stuff we want to pass to /sbin/hotplug */
> +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> +		       "XENBUS_TYPE=%s", xdev->devicetype);
> +
> +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> +		       "XENBUS_PATH=%s", xdev->nodename);
> +
> +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> +		       "XENBUS_BASE_PATH=%s", xenbus_backend.root);

Why not use the standard "TYPE" "PATH" and no "XENBUS" stuff?

> +static int xenbus_register_driver_common(struct xenbus_driver *drv,
> +					 struct xen_bus_type *bus)
> +{
> +	int ret;
> +
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &bus->bus;
> +	drv->driver.owner = drv->owner;

Hint, put the owner in the function, which will force the caller to pass
it in.  It's forgotten to be set in the structure a lot.  Look at USB
and PCI register functions now for an example of this.

> +	drv->driver.probe = xenbus_dev_probe;
> +	drv->driver.remove = xenbus_dev_remove;
> +
> +	mutex_lock(&xenwatch_mutex);
> +	ret = driver_register(&drv->driver);
> +	mutex_unlock(&xenwatch_mutex);

What's with the lock?  What is wrong with the driver core lock that is
taken?  What are you trying to protect?

> +void xenbus_suspend(void)
> +{
> +	DPRINTK("");
> +
> +	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
> +	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
> +	xs_suspend();
> +}
> +EXPORT_SYMBOL_GPL(xenbus_suspend);

I think the driver core will handle walking your devices and suspending
them.  You don't have to do it by hand like this.

> +void xenbus_resume(void)
> +{
> +	xb_init_comms();
> +	xs_resume();
> +	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
> +	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
> +}
> +EXPORT_SYMBOL_GPL(xenbus_resume);

Same thing for resume.

> +#ifdef XEN_XENBUS_PROC_INTERFACE
> +static struct file_operations xsd_kva_fops;
> +static struct proc_dir_entry *xsd_kva_intf;
> +static struct proc_dir_entry *xsd_port_intf;

#ifdef not needed if your .h file is written correctly.

> +static int __init xenbus_probe_init(void)
> +{
> +	int err = 0, dom0;
> +
> +	DPRINTK("");
> +
> +	if (xen_init() < 0) {
> +		DPRINTK("failed");
> +		return -ENODEV;
> +	}
> +
> +	/* Register ourselves with the kernel bus & device subsystems */
> +	bus_register(&xenbus_frontend.bus);
> +	bus_register(&xenbus_backend.bus);
> +	device_register(&xenbus_frontend.dev);
> +	device_register(&xenbus_backend.dev);

No error handling?

> +
> +	/*
> +	 * Domain0 doesn't have a store_evtchn or store_mfn yet.
> +	 */
> +	dom0 = (xen_start_info->store_evtchn == 0);
> +
> +#ifdef XEN_XENBUS_PROC_INTERFACE

No #ifdef needed if your .h file is written correctly.

> +/* xenbus_probe.c */
> +extern char *kasprintf(const char *fmt, ...);

Should be in a .h file

> +/*
> + * Details of the xenwatch callback kernel thread. The thread waits on the
> + * watch_events_waitq for work to do (queued on watch_events list). When it
> + * wakes up it acquires the xenwatch_mutex before reading the list and
> + * carrying out work.
> + */
> +static pid_t xenwatch_pid;
> +/* static */ DEFINE_MUTEX(xenwatch_mutex);

Drop the static comment?

> +/* Create a new directory. */
> +int xenbus_mkdir(xenbus_transaction_t t,
> +		 const char *dir, const char *node)
> +{
> +	char *path;
> +	int ret;
> +
> +	path = join(dir, node);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +
> +	ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
> +	kfree(path);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(xenbus_mkdir);

Create a new directory in what?  sysfs?

> +/* Destroy a file or directory (directories must be empty). */
> +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node)
> +{
> +	char *path;
> +	int ret;
> +
> +	path = join(dir, node);
> +	if (IS_ERR(path))
> +		return PTR_ERR(path);
> +
> +	ret = xs_error(xs_single(t, XS_RM, path, NULL));
> +	kfree(path);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(xenbus_rm);

Remove a file or directory in what?  sysfs?

> +#define XBT_NULL 0

What is this for?  What's wrong with just "NULL"?

> +/* A xenbus device. */
> +struct xenbus_device {
> +	const char *devicetype;
> +	const char *nodename;
> +	const char *otherend;
> +	int otherend_id;
> +	struct xenbus_watch otherend_watch;
> +	struct device dev;
> +	XenbusState state;
> +	void *data;

The data field can be dropped.  Use the space in the struct device for
this.

> +typedef u32 xenbus_transaction_t;

Why the typedef?  Please don't.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 19:35     ` Hollis Blanchard
@ 2006-05-09 19:48       ` Anthony Liguori
  2006-05-09 22:34       ` Christoph Hellwig
  1 sibling, 0 replies; 185+ messages in thread
From: Anthony Liguori @ 2006-05-09 19:48 UTC (permalink / raw)
  To: Hollis Blanchard
  Cc: Christoph Hellwig, Chris Wright, virtualization, xen-devel,
	linux-kernel, Ian Pratt

Hollis Blanchard wrote:
> On Tue, 2006-05-09 at 16:15 +0100, Christoph Hellwig wrote:
>   
>>> +#ifdef __XEN__
>>> +#define __DEFINE_GUEST_HANDLE(name, type) \
>>> +    typedef struct { type *p; } __guest_handle_ ## name
>>> +#else
>>> +#define __DEFINE_GUEST_HANDLE(name, type) \
>>> +    typedef type * __guest_handle_ ## name
>>> +#endif
>>>       
>> please get rid of all these stupid typedefs 
>>     
>
> These typedefs are a new hack to work around a basic interface problem:
> instead of explicitly-sized types, Xen uses longs and pointers in its
> interface. On PowerPC in particular, where we need a 32-bit userland
> communicating with a 64-bit hypervisor, those types don't work.
>
> However, the maintainers are reluctant to switch the interface to use
> explicitly-sized types because it would break binary compatibility.
> These ugly "HANDLE" macros allow PowerPC to do what we need without
> affecting binary compatibility on x86.
>   

Is this strictly true though?  The ABIs for Power and x86 are not 
necessarily dependent on each other.  One could just as easily define a 
typedef like:

#if defined(__ppc__)
typedef uint64_t guest_handle_t;
#elif defined(__x86__)
typedef unsigned long guest_handle_t;
#endif

I thought the use of GUEST_HANDLE was to maintain type safety.  It 
certainly helps the issue you point out but it's not strictly necessary.

IMHO, this trick makes the code pretty ugly.  I'd rather see it 
disappear in favor of something more akin to the above.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 32/35] Add Xen driver utility functions.
  2006-05-09  7:00 ` [RFC PATCH 32/35] Add Xen driver utility functions Chris Wright
@ 2006-05-09 19:48   ` Greg KH
  2006-05-09 21:50   ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Greg KH @ 2006-05-09 19:48 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, Jan Beulich

On Tue, May 09, 2006 at 12:00:32AM -0700, Chris Wright wrote:
> +EXPORT_SYMBOL_GPL(alloc_vm_area);

> +EXPORT_SYMBOL_GPL(free_vm_area);

> +EXPORT_SYMBOL_GPL(lock_vm_area);

> +EXPORT_SYMBOL_GPL(unlock_vm_area);

These are all pretty generic function names.  Perhaps they belong in the
core kernel code instead of a Xen specific file?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
  2006-05-09 16:06   ` Alexey Dobriyan
  2006-05-09 19:40   ` Greg KH
@ 2006-05-09 19:49   ` Greg KH
  2006-05-09 19:58     ` Chris Wright
  2006-05-13 12:28   ` Andrew Morton
  3 siblings, 1 reply; 185+ messages in thread
From: Greg KH @ 2006-05-09 19:49 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, May 09, 2006 at 12:00:33AM -0700, Chris Wright wrote:
> +#ifdef XEN_XENBUS_PROC_INTERFACE
> +#include <xen/xen_proc.h>
> +#endif

Oh, you all never define this anywhere in the series, so anything
protected by it should be removed.

And I sure hope you don't have a xen_proc.h file anywhere, we do not
need any new non-process files going into /proc...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 19:49   ` Greg KH
@ 2006-05-09 19:58     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 19:58 UTC (permalink / raw)
  To: Greg KH
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Greg KH (greg@kroah.com) wrote:
> And I sure hope you don't have a xen_proc.h file anywhere, we do not
> need any new non-process files going into /proc...

I'll be happy once we've got all the /proc abuse eliminated, sorry some
of this snuck through.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
                     ` (2 preceding siblings ...)
  2006-05-09 18:56   ` Stephen Hemminger
@ 2006-05-09 20:25   ` Stephen Hemminger
  2006-05-09 20:26     ` Keir Fraser
  2006-05-09 20:32     ` Chris Wright
  2006-05-09 22:41   ` [Xen-devel] " Herbert Xu
  4 siblings, 2 replies; 185+ messages in thread
From: Stephen Hemminger @ 2006-05-09 20:25 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, netdev

> +static int setup_device(struct xenbus_device *dev, struct
> netfront_info *info) +{
> +	struct netif_tx_sring *txs;
> +	struct netif_rx_sring *rxs;
> +	int err;
> +	struct net_device *netdev = info->netdev;
> +
> +	info->tx_ring_ref = GRANT_INVALID_REF;
> +	info->rx_ring_ref = GRANT_INVALID_REF;
> +	info->rx.sring = NULL;
> +	info->tx.sring = NULL;
> +	info->irq = 0;
> +
> +	txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
> +	if (!txs) {
> +		err = -ENOMEM;
> +		xenbus_dev_fatal(dev, err, "allocating tx ring
> page");
> +		goto fail;
> +	}
> +	rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
> +	if (!rxs) {
> +		err = -ENOMEM;
> +		xenbus_dev_fatal(dev, err, "allocating rx ring
> page");
> +		free_page((unsigned long)txs);
> +		goto fail;
> +	}
> +	info->backend_state = BEST_DISCONNECTED;
> +
> +	SHARED_RING_INIT(txs);
> +	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
> +
> +	SHARED_RING_INIT(rxs);
> +	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
> +
> +	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
> +	if (err < 0)
> +		goto fail;
> +	info->tx_ring_ref = err;
> +
> +	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
> +	if (err < 0)
> +		goto fail;
> +	info->rx_ring_ref = err;
> +
> +	err = xenbus_alloc_evtchn(dev, &info->evtchn);
> +	if (err)
> +		goto fail;
> +
> +	memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
> +	network_connect(netdev);
> +	info->irq = bind_evtchn_to_irqhandler(
> +		info->evtchn, netif_int, SA_SAMPLE_RANDOM,
> netdev->name,
> 

This doesn't look like a real random entropy source. packets
arriving from another domain are easily timed.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 20:25   ` Stephen Hemminger
@ 2006-05-09 20:26     ` Keir Fraser
  2006-05-09 20:39       ` Stephen Hemminger
  2006-05-09 20:46       ` Roland Dreier
  2006-05-09 20:32     ` Chris Wright
  1 sibling, 2 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-09 20:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: virtualization, Ian Pratt, xen-devel, linux-kernel, Chris Wright, netdev


On 9 May 2006, at 21:25, Stephen Hemminger wrote:

>> +	memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
>> +	network_connect(netdev);
>> +	info->irq = bind_evtchn_to_irqhandler(
>> +		info->evtchn, netif_int, SA_SAMPLE_RANDOM,
>> netdev->name,
>>
>
> This doesn't look like a real random entropy source. packets
> arriving from another domain are easily timed.

Where should we get our entropy from in a VM environment? Leaving the 
pool empty can cause processes to hang.

  -- Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 20:25   ` Stephen Hemminger
  2006-05-09 20:26     ` Keir Fraser
@ 2006-05-09 20:32     ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 20:32 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, netdev

* Stephen Hemminger (shemminger@osdl.org) wrote:
> > +	info->irq = bind_evtchn_to_irqhandler(
> > +		info->evtchn, netif_int, SA_SAMPLE_RANDOM,
> > netdev->name,
> 
> This doesn't look like a real random entropy source. packets
> arriving from another domain are easily timed.

Heh, given the path they take, that sadly may not be the case ;-)
But point well-taken, that's easy to drop.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 20:26     ` Keir Fraser
@ 2006-05-09 20:39       ` Stephen Hemminger
  2006-05-09 20:46       ` Roland Dreier
  1 sibling, 0 replies; 185+ messages in thread
From: Stephen Hemminger @ 2006-05-09 20:39 UTC (permalink / raw)
  To: linux-kernel

On Tue, 9 May 2006 21:26:11 +0100
Keir Fraser <Keir.Fraser@cl.cam.ac.uk> wrote:

> 
> On 9 May 2006, at 21:25, Stephen Hemminger wrote:
> 
> >> +	memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
> >> +	network_connect(netdev);
> >> +	info->irq = bind_evtchn_to_irqhandler(
> >> +		info->evtchn, netif_int, SA_SAMPLE_RANDOM,
> >> netdev->name,
> >>
> >
> > This doesn't look like a real random entropy source. packets
> > arriving from another domain are easily timed.
> 
> Where should we get our entropy from in a VM environment? Leaving the 
> pool empty can cause processes to hang.
> 

You probably need to get entropy from dom0 and real hardware sources.
Could you piggyback on some other periodic polling/message passing to
push some entropy out?

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 16:31           ` Andi Kleen
@ 2006-05-09 20:42             ` Christian Limpach
  2006-05-09 21:56               ` Andi Kleen
  2006-05-09 21:56               ` Chris Wright
  0 siblings, 2 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-09 20:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 06:31:37PM +0200, Andi Kleen wrote:
> On Tuesday 09 May 2006 18:29, Christian Limpach wrote:
> > On Tue, May 09, 2006 at 06:07:57PM +0200, Andi Kleen wrote:
> > > 
> > > > 
> > > > Anybody want to comment on the performance impact of making
> > > > local_irq_* non-inline functions?
> > > 
> > > I would guess for that much inline code it will be even a win to not
> > > inline because it will save icache.
> > 
> > Maybe, although some of the macros compile down to only 2-3 instructions.
> 
> Can you post before/after vmlinux size numbers for inline/out of line?

Sure, although it is a bit tricky since the #define's pass non-pointer
arguments by reference.  This would also make it quite ugly to change
these.

Everything[1] in line:
-rwxr-xr-x  1 cl349 cl349  2633640 May  9 19:42 vmlinux-inline-stripped
Everything out of line:
-rwxr-xr-x  1 cl349 cl349  2621352 May  9 19:45 vmlinux-outline-stripped

Additionally, I did a build with only __sti and __restore_flags
out of line and the others in line:
-rwxr-xr-x  1 cl349 cl349  2617256 May  9 19:50 vmlinux-hybrid-stripped

__sti and __restore_flags are the ones which generate more code,
so it seemed more sensible to make them out of line.

Any conclusions?

    christian

[1] __cli, __sti, __save_flags, __restore_flags, __save_and_cli, irqs_disabled


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 20:26     ` Keir Fraser
  2006-05-09 20:39       ` Stephen Hemminger
@ 2006-05-09 20:46       ` Roland Dreier
  2006-05-10 18:28         ` Andi Kleen
  1 sibling, 1 reply; 185+ messages in thread
From: Roland Dreier @ 2006-05-09 20:46 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Stephen Hemminger, virtualization, Ian Pratt, xen-devel,
	linux-kernel, Chris Wright, netdev

    Keir> Where should we get our entropy from in a VM environment?
    Keir> Leaving the pool empty can cause processes to hang.

You could have something like a virtual HW RNG driver (with a frontend
and backend), which steals from the dom0 /dev/random pool.

 - R.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
  2006-05-09 16:23   ` Daniel Walker
@ 2006-05-09 21:50   ` Andi Kleen
  2006-05-09 23:03     ` Ingo Oeser
  2006-05-12 21:44   ` Pavel Machek
  2 siblings, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 21:50 UTC (permalink / raw)
  To: virtualization; +Cc: Chris Wright, linux-kernel, xen-devel, Ian Pratt

On Tuesday 09 May 2006 09:00, Chris Wright wrote:
> Add support for Xen time abstractions. To avoid expensive traps into
> the hypervisor, the passage of time is extrapolated from the local TSC
> and a set of timestamps and scaling factors exported to the guest via
> shared memory. Xen also provides a periodic interrupt facility which
> is used to drive updates of xtime and jiffies, and perform the usual
> process accounting and profiling.

There is far too much code duplication in there. I think you need to
refactor the main time.c a bit first and strip that down.

Also you can drop all the __x86_64__ support for now.

-Andi

> 

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 32/35] Add Xen driver utility functions.
  2006-05-09  7:00 ` [RFC PATCH 32/35] Add Xen driver utility functions Chris Wright
  2006-05-09 19:48   ` Greg KH
@ 2006-05-09 21:50   ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 21:50 UTC (permalink / raw)
  To: virtualization
  Cc: Chris Wright, linux-kernel, xen-devel, Jan Beulich, Ian Pratt

On Tuesday 09 May 2006 09:00, Chris Wright wrote:
> Allocate/destroy a 'vmalloc' VM area: alloc_vm_area and free_vm_area
> The alloc function ensures that page tables are constructed for the
> region of kernel virtual address space and mapped into init_mm.
> 
> Lock an area so that PTEs are accessible in the current address space:
> lock_vm_area and unlock_vm_area
> The lock function prevents context switches to a lazy mm that doesn't
> have the area mapped into its page tables.  It also ensures that the
> page tables are mapped into the current mm by causing the page fault
> handler to copy the page directory pointers from init_mm into the
> current mm.

Having that in drivers/xen looks wrong.  It should probably be somewhere generic.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 19:40   ` Greg KH
@ 2006-05-09 21:53     ` Chris Wright
  2006-05-09 22:01       ` Greg KH
  0 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09 21:53 UTC (permalink / raw)
  To: Greg KH
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Greg KH (greg@kroah.com) wrote:
> On Tue, May 09, 2006 at 12:00:33AM -0700, Chris Wright wrote:
> > +/* xenbus_probe.c */
> > +extern char *kasprintf(const char *fmt, ...);
> 
> Belongs in a .h file.

*nod*

> > +#define DPRINTK(fmt, args...) \
> > +    pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
> 
> Please use the dev_dbg() function instead of DPRINTK() or pr_debug().
> It's much better and uniquely identifies the driver and device that you
> are referring to.
> 
> Also, all of the printk() calls in these files should be switched to
> dev_err() or dev_warn() for the same reason.

will do, thanks.

> > +int xenbus_watch_path(struct xenbus_device *dev, const char *path,
> > +		      struct xenbus_watch *watch,
> > +		      void (*callback)(struct xenbus_watch *,
> > +				       const char **, unsigned int))
> > +{
> > +	int err;
> > +
> > +	watch->node = path;
> > +	watch->callback = callback;
> > +
> > +	err = register_xenbus_watch(watch);
> > +
> > +	if (err) {
> > +		watch->node = NULL;
> > +		watch->callback = NULL;
> > +		xenbus_dev_fatal(dev, err, "adding watch on %s", path);
> > +	}
> > +
> > +	return err;
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_watch_path);
> > +
> > +
> > +int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
> > +		       const char *path2, struct xenbus_watch *watch,
> > +		       void (*callback)(struct xenbus_watch *,
> > +					const char **, unsigned int))
> > +{
> > +	int err;
> > +	char *state = kasprintf("%s/%s", path, path2);
> > +	if (!state) {
> > +		xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
> > +		return -ENOMEM;
> > +	}
> > +	err = xenbus_watch_path(dev, state, watch, callback);
> > +
> > +	if (err)
> > +		kfree(state);
> > +	return err;
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_watch_path2);
> > +
> > +
> > +int xenbus_switch_state(struct xenbus_device *dev,
> > +			xenbus_transaction_t xbt,
> > +			XenbusState state)
> 
> I'm guessing that XenbusState is a typedef?  Please fix the naming to be
> Linux kernel compatible.

good point, these are slowly being eliminated, noted.

> > +{
> > +	/* We check whether the state is currently set to the given value, and
> > +	   if not, then the state is set.  We don't want to unconditionally
> > +	   write the given state, because we don't want to fire watches
> > +	   unnecessarily.  Furthermore, if the node has gone, we don't write
> > +	   to it, as the device will be tearing down, and we don't want to
> > +	   resurrect that directory.
> > +	 */
> > +
> > +	int current_state;
> > +	int err;
> > +
> > +	if (state == dev->state)
> > +		return 0;
> > +
> > +	err = xenbus_scanf(xbt, dev->nodename, "state", "%d",
> > +			       &current_state);
> > +	if (err != 1)
> > +		return 0;
> > +
> > +	err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
> > +	if (err) {
> > +		if (state != XenbusStateClosing) /* Avoid looping */
> > +			xenbus_dev_fatal(dev, err, "writing new state");
> > +		return err;
> > +	}
> > +
> > +	dev->state = state;
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_switch_state);
> > +
> > +
> > +/**
> > + * Return the path to the error node for the given device, or NULL on failure.
> > + * If the value returned is non-NULL, then it is the caller's to kfree.
> > + */
> > +static char *error_path(struct xenbus_device *dev)
> > +{
> > +	return kasprintf("error/%s", dev->nodename);
> > +}
> > +
> > +
> > +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
> > +		va_list ap)
> 
> Global function?  With no description of what it does?  (hint,
> describing it in the .h file, in pseudo-kerneldoc form doesn't really
> count, it only makes the tools break...)

yeah, all of _dev_error cleanup is already on the todo list.
for starters, good names and better docs will help.

> > +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
> > +		      ...)
> 
> No kerneldoc for all of the global functions?

will update with kerneldoc.

> > +extern void xenbus_probe(void *);
> > +extern int xenstored_ready;
> 
> Should be in a .h file.

yes

> > +#include <asm/io.h>
> > +#include <asm/page.h>
> > +#include <asm/pgtable.h>
> > +#include <asm/hypervisor.h>
> > +#include <xen/xenbus.h>
> > +#ifdef XEN_XENBUS_PROC_INTERFACE
> > +#include <xen/xen_proc.h>
> > +#endif
> 
> #ifdef is not needed.  Put it in the .h file.

better if we can eliminate, but .h for now.

> > +#include <xen/evtchn.h>
> > +
> > +#include "xenbus_comms.h"
> > +
> > +extern struct mutex xenwatch_mutex;
> 
> Should be in a .h file.

yes

> > +struct xen_bus_type
> > +{
> > +	char *root;
> > +	unsigned int levels;
> > +	int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename);
> > +	int (*probe)(const char *type, const char *dir);
> > +	struct bus_type bus;
> > +	struct device dev;
> > +};
> 
> Why have you embedded both a struct bus_type and a struct device into
> this structure?  How is the lifecycle handled due to 2 different
> reference counted structures?
> 
> Also, I note that you statically create this, which isn't the nicest,
> why not dynamically create it?
> 
> And why a different probe function per bus types?
> 
> And the bus id is part of the struct bus_type, why a separate function
> to retrieve it?

Good questions, I need to dig a little deeper on that.  Xenbus is an
interesting beast on its own.

> And what are you doing with the different "levels"?  Is there some
> description of how you are using sysfs for this?  Busses should not be
> "nested", devices should.  How does sysfs look with this code in it?
> What is the /sys/bus/ structure?  What is the /sys/devices/ structure?

e.g.

/sys/bus/xen
|-- devices
|   |-- vbd-51713 -> ../../../devices/xen/vbd-51713
|   `-- vif-0 -> ../../../devices/xen/vif-0
`-- drivers
    `-- vbd
        |-- bind
        |-- unbind
        `-- vbd-51713 -> ../../../../devices/xen/vbd-51713

/sys/devices/xen
|-- uevent
|-- vbd-51713
|   |-- block:xvda1 -> ../../../block/xvda1
|   |-- bus -> ../../../bus/xen
|   |-- devtype
|   |-- driver -> ../../../bus/xen/drivers/vbd
|   |-- nodename
|   `-- uevent
`-- vif-0
    |-- bus -> ../../../bus/xen
    |-- devtype
    |-- nodename
    `-- uevent

> > +/* device/<type>/<id> => <type>-<id> */
> > +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
> > +{
> > +	nodename = strchr(nodename, '/');
> > +	if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) {
> > +		printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
> > +		return -EINVAL;
> > +	}
> > +
> > +	strlcpy(bus_id, nodename + 1, BUS_ID_SIZE);
> > +	if (!strchr(bus_id, '/')) {
> > +		printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
> > +		return -EINVAL;
> > +	}
> > +	*strchr(bus_id, '/') = '-';
> > +	return 0;
> > +}
> 
> And why all the string logic for device ids and names?  Is that the only
> unique way to identify the different devices on your bus?  Why not just
> give them a numerical id then?  It would save you a lot of
> computation...
> 
> > +/* Bus type for frontend drivers. */
> > +static int xenbus_probe_frontend(const char *type, const char *name);
> > +static struct xen_bus_type xenbus_frontend = {
> > +	.root = "device",
> > +	.levels = 2, 		/* device/type/<id> */
> > +	.get_bus_id = frontend_bus_id,
> > +	.probe = xenbus_probe_frontend,
> > +	.bus = {
> > +		.name  = "xen",
> > +		.match = xenbus_match,
> > +	},
> > +	.dev = {
> > +		.bus_id = "xen",
> > +	},
> > +};
> > +
> > +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
> > +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename)
> > +{
> > +	int domid, err;
> > +	const char *devid, *type, *frontend;
> > +	unsigned int typelen;
> > +
> > +	type = strchr(nodename, '/');
> > +	if (!type)
> > +		return -EINVAL;
> > +	type++;
> > +	typelen = strcspn(type, "/");
> > +	if (!typelen || type[typelen] != '/')
> > +		return -EINVAL;
> > +
> > +	devid = strrchr(nodename, '/') + 1;
> > +
> > +	err = xenbus_gather(XBT_NULL, nodename, "frontend-id", "%i", &domid,
> > +			    "frontend", NULL, &frontend,
> > +			    NULL);
> > +	if (err)
> > +		return err;
> > +	if (strlen(frontend) == 0)
> > +		err = -ERANGE;
> > +	if (!err && !xenbus_exists(XBT_NULL, frontend, ""))
> > +		err = -ENOENT;
> > +
> > +	kfree(frontend);
> > +
> > +	if (err)
> > +		return err;
> > +
> > +	if (snprintf(bus_id, BUS_ID_SIZE,
> > +		     "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE)
> > +		return -ENOSPC;
> > +	return 0;
> > +}
> > +
> > +static int xenbus_uevent_backend(struct device *dev, char **envp,
> > +				 int num_envp, char *buffer, int buffer_size);
> > +static int xenbus_probe_backend(const char *type, const char *domid);
> > +static struct xen_bus_type xenbus_backend = {
> > +	.root = "backend",
> > +	.levels = 3, 		/* backend/type/<frontend>/<id> */
> > +	.get_bus_id = backend_bus_id,
> > +	.probe = xenbus_probe_backend,
> > +	.bus = {
> > +		.name  = "xen-backend",
> > +		.match = xenbus_match,
> > +		.uevent = xenbus_uevent_backend,
> > +	},
> > +	.dev = {
> > +		.bus_id = "xen-backend",
> > +	},
> > +};
> 
> What is the "frontend/backend" relationship here?

do you mean in sysfs?  or more in general?

> > +static int xenbus_uevent_backend(struct device *dev, char **envp,
> > +				 int num_envp, char *buffer, int buffer_size)
> > +{
> > +	struct xenbus_device *xdev;
> > +	struct xenbus_driver *drv;
> > +	int i = 0;
> > +	int length = 0;
> > +
> > +	DPRINTK("");
> > +
> > +	if (dev == NULL)
> > +		return -ENODEV;
> > +
> > +	xdev = to_xenbus_device(dev);
> > +	if (xdev == NULL)
> > +		return -ENODEV;
> > +
> > +	/* stuff we want to pass to /sbin/hotplug */
> > +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> > +		       "XENBUS_TYPE=%s", xdev->devicetype);
> > +
> > +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> > +		       "XENBUS_PATH=%s", xdev->nodename);
> > +
> > +	add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length,
> > +		       "XENBUS_BASE_PATH=%s", xenbus_backend.root);
> 
> Why not use the standard "TYPE" "PATH" and no "XENBUS" stuff?
> 
> > +static int xenbus_register_driver_common(struct xenbus_driver *drv,
> > +					 struct xen_bus_type *bus)
> > +{
> > +	int ret;
> > +
> > +	drv->driver.name = drv->name;
> > +	drv->driver.bus = &bus->bus;
> > +	drv->driver.owner = drv->owner;
> 
> Hint, put the owner in the function, which will force the caller to pass
> it in.  It's forgotten to be set in the structure a lot.  Look at USB
> and PCI register functions now for an example of this.

OK, thanks.

> > +	drv->driver.probe = xenbus_dev_probe;
> > +	drv->driver.remove = xenbus_dev_remove;
> > +
> > +	mutex_lock(&xenwatch_mutex);
> > +	ret = driver_register(&drv->driver);
> > +	mutex_unlock(&xenwatch_mutex);
> 
> What's with the lock?  What is wrong with the driver core lock that is
> taken?  What are you trying to protect?

It's serializing with the xenwatch thread, which is outside the driver
core.  I think it may be over eager, and can rely on driver core for
normal driver registration, and push it deeper to truly protect the
watch lists and registration.  I'll add this to the things to
investigate more deeply.

> > +void xenbus_suspend(void)
> > +{
> > +	DPRINTK("");
> > +
> > +	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
> > +	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev);
> > +	xs_suspend();
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_suspend);
> 
> I think the driver core will handle walking your devices and suspending
> them.  You don't have to do it by hand like this.

Hmm, that would be nice, I'll look into that.

> > +void xenbus_resume(void)
> > +{
> > +	xb_init_comms();
> > +	xs_resume();
> > +	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
> > +	bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev);
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_resume);
> 
> Same thing for resume.
> 
> > +#ifdef XEN_XENBUS_PROC_INTERFACE
> > +static struct file_operations xsd_kva_fops;
> > +static struct proc_dir_entry *xsd_kva_intf;
> > +static struct proc_dir_entry *xsd_port_intf;
> 
> #ifdef not needed if your .h file is written correctly.

*nod*

> > +static int __init xenbus_probe_init(void)
> > +{
> > +	int err = 0, dom0;
> > +
> > +	DPRINTK("");
> > +
> > +	if (xen_init() < 0) {
> > +		DPRINTK("failed");
> > +		return -ENODEV;
> > +	}
> > +
> > +	/* Register ourselves with the kernel bus & device subsystems */
> > +	bus_register(&xenbus_frontend.bus);
> > +	bus_register(&xenbus_backend.bus);
> > +	device_register(&xenbus_frontend.dev);
> > +	device_register(&xenbus_backend.dev);
> 
> No error handling?

will fix

> > +
> > +	/*
> > +	 * Domain0 doesn't have a store_evtchn or store_mfn yet.
> > +	 */
> > +	dom0 = (xen_start_info->store_evtchn == 0);
> > +
> > +#ifdef XEN_XENBUS_PROC_INTERFACE
> 
> No #ifdef needed if your .h file is written correctly.

*nod*

> > +/* xenbus_probe.c */
> > +extern char *kasprintf(const char *fmt, ...);
> 
> Should be in a .h file

yup, same as above.

> > +/*
> > + * Details of the xenwatch callback kernel thread. The thread waits on the
> > + * watch_events_waitq for work to do (queued on watch_events list). When it
> > + * wakes up it acquires the xenwatch_mutex before reading the list and
> > + * carrying out work.
> > + */
> > +static pid_t xenwatch_pid;
> > +/* static */ DEFINE_MUTEX(xenwatch_mutex);
> 
> Drop the static comment?

yes

> > +/* Create a new directory. */
> > +int xenbus_mkdir(xenbus_transaction_t t,
> > +		 const char *dir, const char *node)
> > +{
> > +	char *path;
> > +	int ret;
> > +
> > +	path = join(dir, node);
> > +	if (IS_ERR(path))
> > +		return PTR_ERR(path);
> > +
> > +	ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
> > +	kfree(path);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_mkdir);
> 
> Create a new directory in what?  sysfs?
> 
> > +/* Destroy a file or directory (directories must be empty). */
> > +int xenbus_rm(xenbus_transaction_t t, const char *dir, const char *node)
> > +{
> > +	char *path;
> > +	int ret;
> > +
> > +	path = join(dir, node);
> > +	if (IS_ERR(path))
> > +		return PTR_ERR(path);
> > +
> > +	ret = xs_error(xs_single(t, XS_RM, path, NULL));
> > +	kfree(path);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(xenbus_rm);
> 
> Remove a file or directory in what?  sysfs?

These are in xenstore.  It's pretty much internal to xen.  Deserves a
clearer (kerneldoc) comment.

> > +#define XBT_NULL 0
> 
> What is this for?  What's wrong with just "NULL"?

It's a special transaction ID.  Obfuscated by both the name, and the typedef.

> > +/* A xenbus device. */
> > +struct xenbus_device {
> > +	const char *devicetype;
> > +	const char *nodename;
> > +	const char *otherend;
> > +	int otherend_id;
> > +	struct xenbus_watch otherend_watch;
> > +	struct device dev;
> > +	XenbusState state;
> > +	void *data;
> 
> The data field can be dropped.  Use the space in the struct device for
> this.

At quick glance I agree.  Will give that a try.

> > +typedef u32 xenbus_transaction_t;
> 
> Why the typedef?  Please don't.

To see if you were reading? ;-)  The typedef elimination is ongoing,
will add that as well.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 20:42             ` Christian Limpach
@ 2006-05-09 21:56               ` Andi Kleen
  2006-05-10 10:35                 ` Christian Limpach
  2006-05-09 21:56               ` Chris Wright
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 21:56 UTC (permalink / raw)
  To: Christian Limpach
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt


> Everything[1] in line:
> -rwxr-xr-x  1 cl349 cl349  2633640 May  9 19:42 vmlinux-inline-stripped
> Everything out of line:
> -rwxr-xr-x  1 cl349 cl349  2621352 May  9 19:45 vmlinux-outline-stripped
> 
> Additionally, I changed did a build with only __sti and __restore_flags
> out of line and the others in line:
> -rwxr-xr-x  1 cl349 cl349  2617256 May  9 19:50 vmlinux-hybrid-stripped
> 
> __sti and __restore_flags are the ones which generate more code,
> so it seemed more sensible to make the out of line.
> 
> Any conlusions?

It looks like hybrid is a clear winner at least from the code size, isn't it?

I doubt you will be able to benchmark the difference for anything else
anyways so might as well aim for that.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 20:42             ` Christian Limpach
  2006-05-09 21:56               ` Andi Kleen
@ 2006-05-09 21:56               ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 21:56 UTC (permalink / raw)
  To: Christian Limpach
  Cc: Andi Kleen, virtualization, Martin J. Bligh, Chris Wright,
	xen-devel, linux-kernel, Ian Pratt

* Christian Limpach (Christian.Limpach@cl.cam.ac.uk) wrote:
> Everything[1] in line:
> -rwxr-xr-x  1 cl349 cl349  2633640 May  9 19:42 vmlinux-inline-stripped
> Everything out of line:
> -rwxr-xr-x  1 cl349 cl349  2621352 May  9 19:45 vmlinux-outline-stripped

Have the output of 'size vmlinux*' handy?  Be nice to get the extra
details.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 21:53     ` Chris Wright
@ 2006-05-09 22:01       ` Greg KH
  2006-05-09 22:50         ` Chris Wright
  2006-05-09 23:43         ` Anthony Liguori
  0 siblings, 2 replies; 185+ messages in thread
From: Greg KH @ 2006-05-09 22:01 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, May 09, 2006 at 02:53:14PM -0700, Chris Wright wrote:
> * Greg KH (greg@kroah.com) wrote:
> > And what are you doing with the different "levels"?  Is there some
> > description of how you are using sysfs for this?  Busses should not be
> > "nested", devices should.  How does sysfs look with this code in it?
> > What is the /sys/bus/ structure?  What is the /sys/devices/ structure?
> 
> e.g.
> 
> /sys/bus/xen
> |-- devices
> |   |-- vbd-51713 -> ../../../devices/xen/vbd-51713
> |   `-- vif-0 -> ../../../devices/xen/vif-0
> `-- drivers
>     `-- vbd
>         |-- bind
>         |-- unbind
>         `-- vbd-51713 -> ../../../../devices/xen/vbd-51713
> 
> /sys/devices/xen
> |-- uevent
> |-- vbd-51713
> |   |-- block:xvda1 -> ../../../block/xvda1
> |   |-- bus -> ../../../bus/xen
> |   |-- devtype
> |   |-- driver -> ../../../bus/xen/drivers/vbd
> |   |-- nodename
> |   `-- uevent
> `-- vif-0
>     |-- bus -> ../../../bus/xen
>     |-- devtype
>     |-- nodename
>     `-- uevent

<snip>

> > What is the "frontend/backend" relationship here?
> 
> do you mean in sysfs?  or more in general?

Either.  You seem to mention a lot of nested depths in sysfs or "files",
yet your above tree doesn't show that.  And I don't understand what you
mean by frontend/backend here either?  Is it a sysfs thing?  Or a Xen
thing?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 19:35     ` Hollis Blanchard
  2006-05-09 19:48       ` [Xen-devel] " Anthony Liguori
@ 2006-05-09 22:34       ` Christoph Hellwig
  1 sibling, 0 replies; 185+ messages in thread
From: Christoph Hellwig @ 2006-05-09 22:34 UTC (permalink / raw)
  To: Hollis Blanchard
  Cc: Christoph Hellwig, Chris Wright, virtualization, xen-devel,
	linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 02:35:09PM -0500, Hollis Blanchard wrote:
> These typedefs are a new hack to work around a basic interface problem:
> instead of explicitly-sized types, Xen uses longs and pointers in its
> interface. On PowerPC in particular, where we need a 32-bit userland
> communicating with a 64-bit hypervisor, those types don't work.
> 
> However, the maintainers are reluctant to switch the interface to use
> explicitly-sized types because it would break binary compatibility.
> These ugly "HANDLE" macros allow PowerPC to do what we need without
> affecting binary compatibility on x86.

this stuff needs to be fixed on x86 as well.  if the xen people don't
even fix up their code because of silly abi concerns we should better
not merge it at all.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 03/35] Add Xen interface header files
  2006-05-09 15:15   ` Christoph Hellwig
  2006-05-09 19:35     ` Hollis Blanchard
@ 2006-05-09 22:36     ` Ingo Oeser
  1 sibling, 0 replies; 185+ messages in thread
From: Ingo Oeser @ 2006-05-09 22:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

Hi Christoph,

On Tuesday, 9. May 2006 17:15, Christoph Hellwig wrote:
> > Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
> > Signed-off-by: Chris Wright <chrisw@sous-sol.org>
> > ---
> >  include/xen/interface/arch-x86_32.h   |  197 +++++++++++++++
> 
> that kind of stuff needs to go to asm/
> 
> >  include/xen/interface/event_channel.h |  205 +++++++++++++++
> 
> instead of interface please use something shorter, we'll see this
> all over the includes statements.  intf for example.

I like them and think they are quite clear.

Documentation/CodingStyle Chapter 4: Naming
seems to apply here.

And since you type the include only ONCE per file,
this looks like a good trade, doesn't it?


Regards

Ingo Oeser

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
                     ` (3 preceding siblings ...)
  2006-05-09 20:25   ` Stephen Hemminger
@ 2006-05-09 22:41   ` Herbert Xu
  2006-05-09 23:51     ` Chris Wright
  4 siblings, 1 reply; 185+ messages in thread
From: Herbert Xu @ 2006-05-09 22:41 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, Christian.Limpach, xen-devel,
	netdev, ian.pratt

Chris Wright <chrisw@sous-sol.org> wrote:
>
> +       netdev->features        = NETIF_F_IP_CSUM;

Any reason why IP_CSUM was chosen instead of HW_CSUM? Doing the latter
would seem to be in fact easier for a virtual driver, no?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 04/35] Hypervisor interface header files.
  2006-05-09  7:00 ` [RFC PATCH 04/35] Hypervisor " Chris Wright
@ 2006-05-09 22:43   ` Ingo Oeser
  2006-05-09 23:01     ` Chris Wright
  0 siblings, 1 reply; 185+ messages in thread
From: Ingo Oeser @ 2006-05-09 22:43 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi Chris,

first of all: Thanks to the Xen-Team for doing this!

On Tuesday, 9. May 2006 09:00, Chris Wright wrote:
> Define macros and inline functions for doing hypercalls into the
> hypervisor.
> +static inline int
> +HYPERVISOR_set_trap_table(
> +	struct trap_info *table)
> +{
> +	return _hypercall1(int, set_trap_table, table);
> +}

This is outright ugly and non-conformant to 
Documentation/CodingStyle Chapter 2

Fixing this also saves some code lines.

It also looks like generated code. Maybe you can fix your generator
instead?


Regards

Ingo Oeser

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 22:01       ` Greg KH
@ 2006-05-09 22:50         ` Chris Wright
  2006-05-09 23:43         ` Anthony Liguori
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 22:50 UTC (permalink / raw)
  To: Greg KH
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Greg KH (greg@kroah.com) wrote:
> Either.  You seem to mention a lot of nested depths in sysfs or "files",
> yet your above tree doesn't show that.  And I don't understand what you
> mean by frontend/backend here either?  Is it a sysfs thing?  Or a Xen
> thing?

The files are xenstore, it's part of the communication between frontend
and backend.  The frontend is the device driver in the guest domain
which is just an I/O channel to the backend driver.  The backend is in
the driver domain where the physical hardware can be driven.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 05/35] Add sync bitops
  2006-05-09  7:00 ` [RFC PATCH 05/35] Add sync bitops Chris Wright
@ 2006-05-09 22:56   ` Christoph Lameter
  2006-05-09 23:04     ` Andi Kleen
  2006-05-09 23:07     ` Chris Wright
  0 siblings, 2 replies; 185+ messages in thread
From: Christoph Lameter @ 2006-05-09 22:56 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

On Tue, 9 May 2006, Chris Wright wrote:

> Add "always lock'd" implementations of set_bit, clear_bit and
> change_bit and the corresponding test_and_ functions.  Also add
> "always lock'd" implementation of cmpxchg.  These give guaranteed
> strong synchronisation and are required for non-SMP kernels running on
> an SMP hypervisor.

Could you explain why this is done and what exactly is meant by "always
locked"? What is the performance impact?

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 04/35] Hypervisor interface header files.
  2006-05-09 22:43   ` Ingo Oeser
@ 2006-05-09 23:01     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:01 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Ingo Oeser (ioe-lkml@rameria.de) wrote:
> On Tuesday, 9. May 2006 09:00, Chris Wright wrote:
> > Define macros and inline functions for doing hypercalls into the
> > hypervisor.
> > +static inline int
> > +HYPERVISOR_set_trap_table(
> > +	struct trap_info *table)
> > +{
> > +	return _hypercall1(int, set_trap_table, table);
> > +}
> 
> This is outright ugly and non-conformant to 
> Documentation/CodingStyle Chapter 2

Yes, it's non-conformant, will fix that up.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09 21:50   ` Andi Kleen
@ 2006-05-09 23:03     ` Ingo Oeser
  2006-05-09 23:09       ` Andi Kleen
  2006-05-09 23:13       ` Chris Wright
  0 siblings, 2 replies; 185+ messages in thread
From: Ingo Oeser @ 2006-05-09 23:03 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, linux-kernel, xen-devel, Ian Pratt

Hi Andi,

On Tuesday, 9. May 2006 23:50, Andi Kleen wrote:
> On Tuesday 09 May 2006 09:00, Chris Wright wrote:
> > Add support for Xen time abstractions. To avoid expensive traps into
> > the hypervisor, the passage of time is extrapolated from the local TSC
> > and a set of timestamps and scaling factors exported to the guest via
> > shared memory. Xen also provides a periodic interrupt facility which
> > is used to drive updates of xtime and jiffies, and perform the usual
> > process accounting and profiling.
> 
> There is far too much code duplication in there. I think you need to
> refactor the main time.c a bit first and strip that down.
> 
> Also you can drop all the __x86_64__ support for now.

Isn't time and timer handling a moving target anyway?
The refactoring will be done by the timer people in a completely different
manner anyway.

Are you sure, you want to disturb these efforts by requiring another
refactoring here?


Regards

Ingo Oeser

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 05/35] Add sync bitops
  2006-05-09 22:56   ` Christoph Lameter
@ 2006-05-09 23:04     ` Andi Kleen
  2006-05-09 23:07     ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 23:04 UTC (permalink / raw)
  To: virtualization
  Cc: Christoph Lameter, Chris Wright, xen-devel, linux-kernel, Ian Pratt

On Wednesday 10 May 2006 00:56, Christoph Lameter wrote:
> On Tue, 9 May 2006, Chris Wright wrote:
> 
> > Add "always lock'd" implementations of set_bit, clear_bit and
> > change_bit and the corresponding test_and_ functions.  Also add
> > "always lock'd" implementation of cmpxchg.  These give guaranteed
> > strong synchronisation and are required for non-SMP kernels running on
> > an SMP hypervisor.
> 
> Could you explain why this is done and what is exactly meant with "always 
> looked"? Wh the performance impact?

When a UP guest runs on an SMP hypervisor, it still needs the LOCK prefix
to talk to the hypervisor through shared memory in an SMP-safe way.

Normally UP kernels don't use any LOCK prefixes.

I suggested to refactor the bitops this way earlier for this.

-Andi


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 05/35] Add sync bitops
  2006-05-09 22:56   ` Christoph Lameter
  2006-05-09 23:04     ` Andi Kleen
@ 2006-05-09 23:07     ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:07 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Christoph Lameter (clameter@sgi.com) wrote:
> On Tue, 9 May 2006, Chris Wright wrote:
> 
> > Add "always lock'd" implementations of set_bit, clear_bit and
> > change_bit and the corresponding test_and_ functions.  Also add
> > "always lock'd" implementation of cmpxchg.  These give guaranteed
> > strong synchronisation and are required for non-SMP kernels running on
> > an SMP hypervisor.
> 
> Could you explain why this is done and what is exactly meant with "always 
> looked"? Wh the performance impact?

The standard UP bitops are not atomic.  But a UP guest may be on SMP
machine, and the bitmaps here are shared between guests.  The always
locked means the lock prefix is not conditional on either UP build
(or smp alternatives patching), and memory barriers are in place.
There's no performance penalty unless you use them, as it's a new set
(somewhat similar to the bitops changes you were looking into).  Although,
this is the simplest, with no multiplexing, simply new interface, synch_*.
Open to ideas here.  Xen is another possible consumer of your bitops
changes.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09 23:03     ` Ingo Oeser
@ 2006-05-09 23:09       ` Andi Kleen
  2006-05-09 23:13       ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-09 23:09 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: virtualization, Chris Wright, linux-kernel, xen-devel, Ian Pratt


> Isn't time and timer handling a moving target anyway?
> The refactoring will be done by the timer people in a completly different
> manner anyway.
> 
> Are you sure, you want to disturb these efforts by requiring another
> refactoring here?

Yes I am.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09 23:03     ` Ingo Oeser
  2006-05-09 23:09       ` Andi Kleen
@ 2006-05-09 23:13       ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:13 UTC (permalink / raw)
  To: Ingo Oeser
  Cc: Andi Kleen, virtualization, Chris Wright, linux-kernel,
	xen-devel, Ian Pratt

* Ingo Oeser (ioe-lkml@rameria.de) wrote:
> On Tuesday, 9. May 2006 23:50, Andi Kleen wrote:
> > On Tuesday 09 May 2006 09:00, Chris Wright wrote:
> > > Add support for Xen time abstractions. To avoid expensive traps into
> > > the hypervisor, the passage of time is extrapolated from the local TSC
> > > and a set of timestamps and scaling factors exported to the guest via
> > > shared memory. Xen also provides a periodic interrupt facility which
> > > is used to drive updates of xtime and jiffies, and perform the usual
> > > process accounting and profiling.
> > 
> > There is far too much code duplication in there. I think you need to
> > refactor the main time.c a bit first and strip that down.
> > 
> > Also you can drop all the __x86_64__ support for now.
> 
> Isn't time and timer handling a moving target anyway?
> The refactoring will be done by the timer people in a completly different
> manner anyway.
> 
> Are you sure, you want to disturb these efforts by requiring another
> refactoring here?

Yes.  Otherwise we end up with either duplicated code if the moving
target winds up not moving, or outdated code if it does.  I agree with
Andi.  It's on the todo list to refactor, but I wanted to get the
patches out even though it's a work in progress.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 10:05   ` Adrian Bunk
  2006-05-09 11:06     ` Ed Tomlinson
  2006-05-09 12:45     ` Christian Limpach
@ 2006-05-09 23:23     ` Chris Wright
  2 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:23 UTC (permalink / raw)
  To: Adrian Bunk
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Adrian Bunk (bunk@stusta.de) wrote:
> On Tue, May 09, 2006 at 12:00:01AM -0700, Chris Wright wrote:
> >...
> > --- linus-2.6.orig/arch/i386/Kconfig
> > +++ linus-2.6/arch/i386/Kconfig
> >...
> >  config X86_IO_APIC
> >  	bool
> > -	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
> > +	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN))
> >  	default y
> >...
> 
> <nitpick>not required</nitpick>

True, although SMP is just disabled in this patchset which is a subset
of full Xen support.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09 16:00       ` Daniel Walker
@ 2006-05-09 23:25         ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:25 UTC (permalink / raw)
  To: Daniel Walker
  Cc: Christian Limpach, Chris Wright, linux-kernel, virtualization,
	xen-devel, Ian Pratt

* Daniel Walker (dwalker@mvista.com) wrote:
> I guess that true .. Might be better just to support SMP then ..

Yes, and of course Xen does.  This is just the smallest functional set of
patches to get discussion, so the SMP bits were dropped for now.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 14:30               ` David Boutcher
@ 2006-05-09 23:35                 ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:35 UTC (permalink / raw)
  To: David Boutcher
  Cc: Christian Limpach, chrisw, Herbert Xu, ian.pratt, linux-kernel,
	netdev, virtualization, xen-devel

* David Boutcher (boutcher@us.ibm.com) wrote:
> Then make a generic solution.  VMWare supports migration, the Power 
> virtualization will get around to it eventually.  All will need something
> similar.  So either make a common user-land tool, or (if you insist on
> incorrectly driving this into the kernel) add some kind of common hook to
> the TCP/IP stack.

I'm not that fond of the in-kernel solution either.  HA failover does
this stuff in userspace, and has the same gratuitous arp requirements.
Perhaps we should see some numbers showing the migration latency
introduced.  At the very least, it's easy to factor out as suggested.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 11:58   ` Christoph Hellwig
@ 2006-05-09 23:37     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:37 UTC (permalink / raw)
  To: Christoph Hellwig, Chris Wright, linux-kernel, virtualization,
	xen-devel, Ian Pratt, Christian Limpach, netdev

* Christoph Hellwig (hch@infradead.org) wrote:
> On Tue, May 09, 2006 at 12:00:34AM -0700, Chris Wright wrote:
> > The network device frontend driver allows the kernel to access network
> > devices exported by a virtual machine containing a physical
> > network device driver.
> 
> Please don't add procfs code to new network drivers.  Especially if it's oopsable
> like the code in this driver by simple device renaming.

Agreed, no reason to keep the cruft around.  I thought I had a comment
of the sort in there.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 18:56   ` Stephen Hemminger
@ 2006-05-09 23:39     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:39 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach, netdev

* Stephen Hemminger (shemminger@osdl.org) wrote:
> The stuff in /proc could easily just be added attributes to the class_device kobject
> of the net device (and then show up in sysfs).

Agreed, it's on the todo list to drop proc support there.  Thought that
was marked in the patch.

> > +#define GRANT_INVALID_REF	0
> > +
> > +#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
> > +#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
> > +
> > +static inline void init_skb_shinfo(struct sk_buff *skb)
> > +{
> > +	atomic_set(&(skb_shinfo(skb)->dataref), 1);
> > +	skb_shinfo(skb)->nr_frags = 0;
> > +	skb_shinfo(skb)->frag_list = NULL;
> > +}
> 
> Could you use existing sk_buff_head instead of inventing your
> own skb queue?

Hmm, there is some standard skb_queue_tail happening.  I don't have a
clear idea what you mean.

> > +	u8 mac[ETH_ALEN];
> 
> Isn't mac address already stored in dev->dev_addr and/or dev->perm_addr?

Yes, I don't see the reason to keep in twice.  It's basically a temp
buffer, but it certainly appears we can eliminate it.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09 22:01       ` Greg KH
  2006-05-09 22:50         ` Chris Wright
@ 2006-05-09 23:43         ` Anthony Liguori
  1 sibling, 0 replies; 185+ messages in thread
From: Anthony Liguori @ 2006-05-09 23:43 UTC (permalink / raw)
  To: Greg KH; +Cc: Chris Wright, virtualization, xen-devel, linux-kernel, Ian Pratt

Greg KH wrote:
> On Tue, May 09, 2006 at 02:53:14PM -0700, Chris Wright wrote:
>   
>>> What is the "frontend/backend" relationship here?
>>>       
>> do you mean in sysfs?  or more in general?
>>     
>
> Either.  You seem to mention a lot of nested depths in sysfs or "files",
> yet your above tree doesn't show that.  And I don't understand what you
> mean by frontend/backend here either?  Is it a sysfs thing?  Or a Xen
> thing?
>   

Hi Greg,

XenStore is a shared namespace (similar to sysfs or open firmware) 
between domains.  The interdomain communication primitives exposed by 
Xen are very lowlevel (virtual IRQ and shared memory).  XenStore is 
implemented on top of these primitives and provides some higher level 
operations (read a key, write a key, enumerate a directory, notify when 
a key changes value).

We use XenStore to implement our virtual drivers (this infrastructure is 
called XenBus).  The drivers are split between a backend and frontend.  
The frontend is the portion of the driver that runs in the guest and the 
backend is the portion of the driver that runs in the host (and actually 
virtualizes the underlying device).

The xenbus_mkdir, etc. functions you see operate on XenStore.

Regards,

Anthony Liguori

> thanks,
>
> greg k-h
>   
> ------------------------------------------------------------------------
>
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.osdl.org
> https://lists.osdl.org/mailman/listinfo/virtualization
>   


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 22:41   ` [Xen-devel] " Herbert Xu
@ 2006-05-09 23:51     ` Chris Wright
  2006-05-10  6:36       ` Keir Fraser
  0 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-09 23:51 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Chris Wright, linux-kernel, virtualization, Christian.Limpach,
	xen-devel, netdev, ian.pratt

* Herbert Xu (herbert@gondor.apana.org.au) wrote:
> Chris Wright <chrisw@sous-sol.org> wrote:
> >
> > +       netdev->features        = NETIF_F_IP_CSUM;
> 
> Any reason why IP_CSUM was chosen instead of HW_CSUM? Doing the latter
> would seem to be in fact easier for a virtual driver, no?

That, I really don't know.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 23:51     ` Chris Wright
@ 2006-05-10  6:36       ` Keir Fraser
  0 siblings, 0 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-10  6:36 UTC (permalink / raw)
  To: Chris Wright
  Cc: virtualization, linux-kernel, xen-devel, Herbert Xu, ian.pratt, netdev


On 10 May 2006, at 00:51, Chris Wright wrote:

> * Herbert Xu (herbert@gondor.apana.org.au) wrote:
>> Chris Wright <chrisw@sous-sol.org> wrote:
>>>
>>> +       netdev->features        = NETIF_F_IP_CSUM;
>>
>> Any reason why IP_CSUM was chosen instead of HW_CSUM? Doing the latter
>> would seem to be in fact easier for a virtual driver, no?
>
> That, I really don't know.

Checksum offload was added late to the virtual transport and currently 
not enough info is carried to identify protocol checksum fields in 
arbitrary locations. When we rev the virtual interface, and include a 
proper checksum-offset field, we'll be able to switch to 
NETIF_F_HW_CSUM.

  -- Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-09 21:56               ` Andi Kleen
@ 2006-05-10 10:35                 ` Christian Limpach
  2006-05-10 10:54                   ` Andi Kleen
  0 siblings, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-10 10:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

On Tue, May 09, 2006 at 11:56:28PM +0200, Andi Kleen wrote:
> 
> > Everything[1] in line:
> > -rwxr-xr-x  1 cl349 cl349  2633640 May  9 19:42 vmlinux-inline-stripped
> > Everything out of line:
> > -rwxr-xr-x  1 cl349 cl349  2621352 May  9 19:45 vmlinux-outline-stripped
> > 
> > Additionally, I changed did a build with only __sti and __restore_flags
> > out of line and the others in line:
> > -rwxr-xr-x  1 cl349 cl349  2617256 May  9 19:50 vmlinux-hybrid-stripped
> > 
> > __sti and __restore_flags are the ones which generate more code,
> > so it seemed more sensible to make the out of line.
> > 
> > Any conlusions?
> 
> It looks like hybrid is a clear winner at least from the code size, isn't it?

Yes, which is why I measured that one as well.

Now, the original concern was that we have the five operations implemented
as multi-line macros and doing a hybrid solution doesn't really address
that.

Also, it's not quite clear to me what's the best way to turn three of
the five into functions, whether inline or not.

For measuring the sizes, I did the following:
add void ___restore_flags(unsigned long *x) with the implementation
and then:
#define __restore_flags(x) ___restore_flags(&(x))

Alternatively, would it make sense to change __restore_flags to take
a pointer to flags instead?  That would be quite an invasive change...

Any thoughts?

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 15/35] subarch support for controlling interrupt delivery
  2006-05-10 10:35                 ` Christian Limpach
@ 2006-05-10 10:54                   ` Andi Kleen
  0 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-10 10:54 UTC (permalink / raw)
  To: Christian Limpach
  Cc: virtualization, Martin J. Bligh, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

> Yes, which is why I measured that one as well.
> 
> Now, the original concern was that we have the five operations implemented
> as multi-line macros and doing a hybrid solution doesn't really address
> that.

If it's straight-forward to convert to an inline do it. If not keep
it as a macro. After all code style is just a tool, not something
self serving.

> 
> Also, it's not quite clear to me what's the best way to turn three of
> the five into functions, whether inline or not.
> 
> For measuring the sizes, I did the following:
> add void ___restore_flags(unsigned long *x) with the implementation
> and then:
> #define __restore_flags(x) ___restore_flags(&(x))

Yes that is the standard way to do it 

> Alternatively, would it make sense to change __restore_flags to take
> a pointer to flags instead?  That would be quite an invasive change...

No.

-Andi


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
                     ` (2 preceding siblings ...)
  2006-05-09 16:42   ` Andi Kleen
@ 2006-05-10 15:36   ` Alan Cox
  2006-05-10 15:48     ` Christian Limpach
  3 siblings, 1 reply; 185+ messages in thread
From: Alan Cox @ 2006-05-10 15:36 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, Christian Limpach, xen-devel, Ian Pratt

On Maw, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> plain text document atodiad (config-xen)
> The XEN config option is selected from the i386 subarch menu by
> choosing the X86_XEN "Xen-compatible" subarch.

You need this as well. At least if I read the logic right with regards
to Xen and traps it is safe to do the following (although probably not
safe to run Xen on such a physical system anyway)

Signed-off-by: Alan Cox <alan@redhat.com>

--- arch/i386/Kconfig.cpu~	2006-05-10 15:51:44.956941304 +0100
+++ arch/i386/Kconfig.cpu	2006-05-10 15:51:44.956941304 +0100
@@ -251,7 +251,7 @@
 
 config X86_F00F_BUG
 	bool
-	depends on M586MMX || M586TSC || M586 || M486 || M386
+	depends on ( M586MMX || M586TSC || M586 || M486 || M386 ) && !XEN
 	default y
 
 config X86_WP_WORKS_OK


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] [RFC PATCH 01/35] Add XEN config options and disable unsupported config options.
  2006-05-10 15:36   ` [Xen-devel] " Alan Cox
@ 2006-05-10 15:48     ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-10 15:48 UTC (permalink / raw)
  To: Alan Cox; +Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Wed, May 10, 2006 at 04:36:58PM +0100, Alan Cox wrote:
> On Maw, 2006-05-09 at 00:00 -0700, Chris Wright wrote:
> > plain text document atodiad (config-xen)
> > The XEN config option is selected from the i386 subarch menu by
> > choosing the X86_XEN "Xen-compatible" subarch.
> 
> You need this as well. At least if I read the logic right with regards
> to Xen and traps it is safe to do the following (although probably not
> safe to run Xen on such a physical system anyway)

Yes.  In our tree, we have a config option which completely removes
all the hardware idt table code (X86_NO_IDT) and stores the trap
table as a table suitable to pass directly to the hypervisor.

That's not so useful if you want to build a kernel which can run
both on a hypervisor and on native.  I guess you would need to
disable the X86_F00F_BUG code at runtime in such a kernel.

For the non-runtime case, I wonder if it's preferable to disable
X86_F00F_BUG like you suggest or if it would be better to disable
the cpu types listed?

    christian

> 
> Signed-off-by: Alan Cox <alan@redhat.com>
> 
> --- arch/i386/Kconfig.cpu~	2006-05-10 15:51:44.956941304 +0100
> +++ arch/i386/Kconfig.cpu	2006-05-10 15:51:44.956941304 +0100
> @@ -251,7 +251,7 @@
>  
>  config X86_F00F_BUG
>  	bool
> -	depends on M586MMX || M586TSC || M586 || M486 || M386
> +	depends on ( M586MMX || M586TSC || M586 || M486 || M386 ) && !XEN
>  	default y
>  
>  config X86_WP_WORKS_OK
> 
> 

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-09 20:46       ` Roland Dreier
@ 2006-05-10 18:28         ` Andi Kleen
  2006-05-11  0:33           ` Herbert Xu
  0 siblings, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-10 18:28 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Keir Fraser, Stephen Hemminger, virtualization, Ian Pratt,
	xen-devel, linux-kernel, Chris Wright, netdev

On Tuesday 09 May 2006 22:46, Roland Dreier wrote:
>     Keir> Where should we get our entropy from in a VM environment?
>     Keir> Leaving the pool empty can cause processes to hang.
>
> You could have something like a virtual HW RNG driver (with a frontend
> and backend), which steals from the dom0 /dev/random pool.

They already have a vTPM - iirc TPMs support random numbers so
that could be used. But it's probably complicated to use.

But if sampling virtual events for randomness is really unsafe (is it 
really?) then native guests in Xen would also get bad random numbers
and this would need to be somehow addressed.

I haven't seen real evidence yet why the virtual events should 
provide less randomness than the hardware.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  7:16   ` Pavel Machek
@ 2006-05-10 20:09     ` Andi Kleen
  2006-05-10 20:30       ` Pavel Machek
  2006-05-12  0:28     ` [Xen-devel] " Rusty Russell
  1 sibling, 1 reply; 185+ messages in thread
From: Andi Kleen @ 2006-05-10 20:09 UTC (permalink / raw)
  To: virtualization
  Cc: Pavel Machek, Chris Wright, xen-devel, linux-kernel, Ian Pratt

On Tuesday 09 May 2006 09:16, Pavel Machek wrote:
> Hi!
> 
> > --- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
> > +++ linus-2.6/include/asm-i386/mach-default/mach_system.h
> > @@ -1,6 +1,8 @@
> >  #ifndef __ASM_MACH_SYSTEM_H
> >  #define __ASM_MACH_SYSTEM_H
> >  
> > +#define clearsegment(seg)
> 
> do {} while (0), please.

It's not needed. Think about it.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen.
  2006-05-09  7:21   ` Pavel Machek
@ 2006-05-10 20:23     ` Andi Kleen
  0 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-10 20:23 UTC (permalink / raw)
  To: virtualization
  Cc: Pavel Machek, Chris Wright, xen-devel, linux-kernel, Ian Pratt

On Tuesday 09 May 2006 09:21, Pavel Machek wrote:
> Hi!
> 
> > +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
> > +{
> > +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
> > +	C(0); C(1); C(2);
> > +#undef C
> > +}
> 
> Why not use for loop here? gcc should be able to optimize it...

I don't think you can really blame the Xen people for that code - it's 
just the old code moved.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-10 20:09     ` Andi Kleen
@ 2006-05-10 20:30       ` Pavel Machek
  2006-05-11 10:34         ` Avi Kivity
  0 siblings, 1 reply; 185+ messages in thread
From: Pavel Machek @ 2006-05-10 20:30 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, xen-devel, linux-kernel, Ian Pratt

On St 10-05-06 22:09:04, Andi Kleen wrote:
> On Tuesday 09 May 2006 09:16, Pavel Machek wrote:
> > Hi!
> > 
> > > --- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
> > > +++ linus-2.6/include/asm-i386/mach-default/mach_system.h
> > > @@ -1,6 +1,8 @@
> > >  #ifndef __ASM_MACH_SYSTEM_H
> > >  #define __ASM_MACH_SYSTEM_H
> > >  
> > > +#define clearsegment(seg)
> > 
> > do {} while (0), please.
> 
> It's not needed. Think about it.

Really? If someone does 

	if (something)
		clearsegment(seg)
	somethingelse();

... he'll get very confusing behaviour instead of compile error. 

Okay, that's weaker argument than expected...

Also clearsegment(x) clearsegment(y); will compile when it should not.

Also clearsegment(i++) will behave strangely. So perhaps 

#define clearsegment(seg) do { seg; } while (0)

is best variant?
								Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-09  7:00 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
@ 2006-05-10 23:28   ` Zachary Amsden
  2006-05-11  7:47     ` [Xen-devel] " Gerd Hoffmann
  2006-05-11 16:43     ` Christian Limpach
  0 siblings, 2 replies; 185+ messages in thread
From: Zachary Amsden @ 2006-05-10 23:28 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Chris Wright wrote:
> Change LOAD_OFFSET so that the kernel has virtual addresses in the elf header fields.
>
> Unlike bare metal kernels, Xen kernels start with virtual address
> management turned on and thus the addresses to load to should be
> virtual addresses.

This patch interferes with using a traditional bootloader.  The loader 
for Xen should be smarter - it already has VIRT_BASE from the xen_guest 
section, and can simply add the relocation to these header fields.  This 
is unnecessary, and one of the many reasons a Xen kernel can't run in a 
normal environment.

Zach

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-10 18:28         ` Andi Kleen
@ 2006-05-11  0:33           ` Herbert Xu
  2006-05-11  7:49             ` Keir Fraser
  0 siblings, 1 reply; 185+ messages in thread
From: Herbert Xu @ 2006-05-11  0:33 UTC (permalink / raw)
  To: Andi Kleen
  Cc: rdreier, Keir.Fraser, shemminger, virtualization, ian.pratt,
	xen-devel, linux-kernel, chrisw, netdev

Andi Kleen <ak@suse.de> wrote:
> 
> But if sampling virtual events for randomness is really unsafe (is it 
> really?) then native guests in Xen would also get bad random numbers
> and this would need to be somehow addressed.

Good point.  I wonder what VMWare does in this situation.
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-10 23:28   ` Zachary Amsden
@ 2006-05-11  7:47     ` Gerd Hoffmann
  2006-05-11  8:51       ` Chris Wright
  2006-05-11 16:43     ` Christian Limpach
  1 sibling, 1 reply; 185+ messages in thread
From: Gerd Hoffmann @ 2006-05-11  7:47 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Chris Wright, virtualization, Christian Limpach, xen-devel,
	linux-kernel, Ian Pratt

[-- Attachment #1: Type: text/plain, Size: 938 bytes --]

Zachary Amsden wrote:
> Chris Wright wrote:
>> Change LOAD_OFFSET so that the kernel has virtual addresses in the elf
>> header fields.
>>
>> Unlike bare metal kernels, Xen kernels start with virtual address
>> management turned on and thus the addresses to load to should be
>> virtual addresses.
> 
> This patch interferes with using a traditional bootloader.  The loader
> for Xen should be smarter - it already has VIRT_BASE from the xen_guest
> section, and can simply add the relocation to these header fields.  This
> is unnecessary, and one of the many reasons a Xen kernel can't run in a
> normal environment.

I fully agree.  Attached below is a patch (against xen unstable
mercurial tree) which does exactly that ;)

cheers,

  Gerd

-- 
Gerd Hoffmann <kraxel@suse.de>
Erst mal heiraten, ein, zwei Kinder, und wenn alles läuft
geh' ich nach drei Jahren mit der Familie an die Börse.
http://www.suse.de/~kraxel/julika-dora.jpeg

[-- Attachment #2: load-offset-submit.diff --]
[-- Type: text/x-patch, Size: 8349 bytes --]

diff -r 1e3977e029fd linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h	Mon May  8 18:21:41 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h	Thu May 11 09:10:41 2006
@@ -289,10 +289,6 @@
 #endif
 #define __KERNEL_START		(__PAGE_OFFSET + __PHYSICAL_START)
 
-#undef LOAD_OFFSET
-#define LOAD_OFFSET		0
-
-
 #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
 #define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
 #define MAXMEM			(__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
diff -r 1e3977e029fd linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h	Mon May  8 18:21:41 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h	Thu May 11 09:10:41 2006
@@ -260,9 +260,6 @@
 #define __PAGE_OFFSET           0xffff880000000000
 #endif /* !__ASSEMBLY__ */
 
-#undef LOAD_OFFSET
-#define LOAD_OFFSET		0
-
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
diff -r 1e3977e029fd tools/libxc/xc_load_elf.c
--- a/tools/libxc/xc_load_elf.c	Mon May  8 18:21:41 2006
+++ b/tools/libxc/xc_load_elf.c	Thu May 11 09:10:41 2006
@@ -59,6 +59,7 @@
     Elf_Phdr *phdr;
     Elf_Shdr *shdr;
     unsigned long kernstart = ~0UL, kernend=0UL;
+    unsigned long sstart, send;
     const char *shstrtab;
     char *guestinfo=NULL, *p;
     int h;
@@ -132,6 +133,8 @@
         }
         if ( (strstr(guestinfo, "PAE=yes") != NULL) )
             dsi->pae_kernel = 1;
+        if ( (p = strstr(guestinfo, "VIRT_BASE=")) != NULL )
+            dsi->virt_base = strtoul(p+10, &p, 0);
 
         break;
     }
@@ -153,11 +156,30 @@
         phdr = (Elf_Phdr *)(image + ehdr->e_phoff + (h*ehdr->e_phentsize));
         if ( !is_loadable_phdr(phdr) )
             continue;
-        if ( phdr->p_paddr < kernstart )
-            kernstart = phdr->p_paddr;
-        if ( (phdr->p_paddr + phdr->p_memsz) > kernend )
-            kernend = phdr->p_paddr + phdr->p_memsz;
-    }
+        sstart = phdr->p_paddr;
+        send   = phdr->p_paddr + phdr->p_memsz;
+        /*
+         * bug comparibility alert: old linux kernels used to have
+         * virtual addresses in the paddr headers, whereas newer ones
+         * (since kexec merge, around 2.6.14) correctly use physical
+         * addresses.
+         *
+         * As we want to be able to boot both kinds of kernels we'll
+         * do some guesswork here: If paddr is greater than virt_base
+         * we assume it is a old kernel and use it as-is.  Otherwise
+         * we'll add virt_base to get the correct address.
+         */
+        if (sstart < dsi->virt_base) {
+            sstart += dsi->virt_base;
+            send   += dsi->virt_base;
+        }
+        if ( sstart < kernstart )
+            kernstart = sstart;
+        if ( send > kernend )
+            kernend = send;
+    }
+    if (dsi->virt_base > 0 && ehdr->e_entry < dsi->virt_base)
+	ehdr->e_entry += dsi->virt_base;
 
     if ( (kernstart > kernend) ||
          (ehdr->e_entry < kernstart) ||
@@ -204,7 +226,11 @@
 
         for ( done = 0; done < phdr->p_filesz; done += chunksz )
         {
-            pa = (phdr->p_paddr + done) - dsi->v_start;
+            /* bug compatibility alert, see above */
+            pa = phdr->p_paddr + done;
+            if (pa > dsi->virt_base)
+                pa -= dsi->virt_base;
+
             va = xc_map_foreign_range(
                 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
             chunksz = phdr->p_filesz - done;
@@ -217,7 +243,11 @@
 
         for ( ; done < phdr->p_memsz; done += chunksz )
         {
-            pa = (phdr->p_paddr + done) - dsi->v_start;
+            /* bug compatibility alert, see above */
+            pa = phdr->p_paddr + done;
+            if (pa > dsi->virt_base)
+                pa -= dsi->virt_base;
+
             va = xc_map_foreign_range(
                 xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
             chunksz = phdr->p_memsz - done;
diff -r 1e3977e029fd tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h	Mon May  8 18:21:41 2006
+++ b/tools/libxc/xg_private.h	Thu May 11 09:10:41 2006
@@ -135,6 +135,7 @@
     unsigned long v_kernstart;
     unsigned long v_kernend;
     unsigned long v_kernentry;
+    unsigned long virt_base;
 
     unsigned int  load_symtab;
     unsigned int  pae_kernel;
diff -r 1e3977e029fd xen/common/elf.c
--- a/xen/common/elf.c	Mon May  8 18:21:41 2006
+++ b/xen/common/elf.c	Thu May 11 09:10:41 2006
@@ -24,6 +24,7 @@
     Elf_Phdr *phdr;
     Elf_Shdr *shdr;
     unsigned long kernstart = ~0UL, kernend=0UL;
+    unsigned long sstart, send;
     char *shstrtab, *guestinfo=NULL, *p;
     char *elfbase = (char *)dsi->image_addr;
     int h;
@@ -76,6 +77,8 @@
             return -EINVAL;
         }
 
+        if ( (p = strstr(guestinfo, "VIRT_BASE=")) != NULL )
+            dsi->virt_base = simple_strtoul(p+10, &p, 0);
         break;
     }
 
@@ -86,11 +89,40 @@
         phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
         if ( !is_loadable_phdr(phdr) )
             continue;
-        if ( phdr->p_paddr < kernstart )
-            kernstart = phdr->p_paddr;
-        if ( (phdr->p_paddr + phdr->p_memsz) > kernend )
-            kernend = phdr->p_paddr + phdr->p_memsz;
-    }
+        sstart = phdr->p_paddr;
+        send   = phdr->p_paddr + phdr->p_memsz;
+        /*
+         * bug comparibility alert: old linux kernels used to have
+         * virtual addresses in the paddr headers, whereas newer ones
+         * (since kexec merge, around 2.6.14) correctly use physical
+         * addresses.
+         *
+         * As we want to be able to boot both kinds of kernels we'll
+         * do some guesswork here: If paddr is greater than virt_base
+         * we assume it is a old kernel and use it as-is.  Otherwise
+         * we'll add virt_base to get the correct address.
+         */
+        if (sstart < dsi->virt_base) {
+            sstart += dsi->virt_base;
+            send   += dsi->virt_base;
+        }
+        printk("%s: program hdr: %08lx (=vaddr)  "
+               "paddr: %08lx  filesz: %08lx  memsz: %08lx  =>  %08lx-%08lx\n",
+               __FUNCTION__,
+               (unsigned long)phdr->p_vaddr,
+               (unsigned long)phdr->p_paddr,
+               (unsigned long)phdr->p_filesz,
+               (unsigned long)phdr->p_memsz,
+               sstart, send);
+        if ( sstart < kernstart )
+            kernstart = sstart;
+        if ( send > kernend )
+            kernend = send;
+    }
+    if (dsi->virt_base > 0 && ehdr->e_entry < dsi->virt_base)
+	ehdr->e_entry += dsi->virt_base;
+    printk("%s: entry point: %08lx\n", __FUNCTION__,
+           (unsigned long)ehdr->e_entry);
 
     if ( (kernstart > kernend) || 
          (ehdr->e_entry < kernstart) || 
@@ -126,6 +158,7 @@
     char *elfbase = (char *)dsi->image_addr;
     Elf_Ehdr *ehdr = (Elf_Ehdr *)dsi->image_addr;
     Elf_Phdr *phdr;
+    unsigned long vaddr;
     int h;
   
     for ( h = 0; h < ehdr->e_phnum; h++ ) 
@@ -133,11 +166,15 @@
         phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
         if ( !is_loadable_phdr(phdr) )
             continue;
+        vaddr = phdr->p_paddr;
+        if (vaddr < dsi->virt_base)
+            vaddr += dsi->virt_base;
         if ( phdr->p_filesz != 0 )
-            memcpy((char *)phdr->p_paddr, elfbase + phdr->p_offset, 
+            memcpy((char *)vaddr,
+                   elfbase + phdr->p_offset, 
                    phdr->p_filesz);
         if ( phdr->p_memsz > phdr->p_filesz )
-            memset((char *)phdr->p_paddr + phdr->p_filesz, 0, 
+            memset((char *)vaddr + phdr->p_filesz, 0, 
                    phdr->p_memsz - phdr->p_filesz);
     }
 
diff -r 1e3977e029fd xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Mon May  8 18:21:41 2006
+++ b/xen/include/xen/sched.h	Thu May 11 09:10:41 2006
@@ -172,6 +172,7 @@
     unsigned long v_kernstart;
     unsigned long v_kernend;
     unsigned long v_kernentry;
+    unsigned long virt_base;
     /* Initialised by loader: Private. */
     unsigned int  load_symtab;
     unsigned long symtab_addr;

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11  0:33           ` Herbert Xu
@ 2006-05-11  7:49             ` Keir Fraser
  2006-05-11  8:04               ` Herbert Xu
  2006-05-11  9:47               ` Andi Kleen
  0 siblings, 2 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-11  7:49 UTC (permalink / raw)
  To: Herbert Xu
  Cc: xen-devel, ian.pratt, rdreier, linux-kernel, netdev, Andi Kleen,
	virtualization, chrisw, shemminger


On 11 May 2006, at 01:33, Herbert Xu wrote:

>> But if sampling virtual events for randomness is really unsafe (is it
>> really?) then native guests in Xen would also get bad random numbers
>> and this would need to be somehow addressed.
>
> Good point.  I wonder what VMWare does in this situation.

Well, there's not much they can do except maybe jitter interrupt 
delivery. I doubt they do that though.

The original complaint in our case was that we take entropy from 
interrupts caused by other local VMs, as well as external sources. 
There was a feeling that the former was more predictable and could form 
the basis of an attack. I have to say I'm unconvinced: I don't really 
see that it's significantly easier to inject precisely-timed interrupts 
into a local VM. Certainly not to better than +/- a few microseconds. 
As long as you add cycle-counter info to the entropy pool, the least 
significant bits of that will always be noise.

The alternatives are unattractive:
  1. We have no good way to distinguish interrupts caused by packets 
from local VMs versus packets from remote hosts. Both get muxed on the 
same virtual interface.
  2. An entropy front/back is tricky -- how do we decide how much 
entropy to pull from domain0? How much should domain0 be prepared to 
give other domains? How easy is it to DoS domain0 by draining its 
entropy pool? Yuk.

  -- Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11  7:49             ` Keir Fraser
@ 2006-05-11  8:04               ` Herbert Xu
  2006-05-11  9:47               ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Herbert Xu @ 2006-05-11  8:04 UTC (permalink / raw)
  To: Keir Fraser
  Cc: xen-devel, ian.pratt, rdreier, linux-kernel, netdev, Andi Kleen,
	virtualization, chrisw, shemminger

On Thu, May 11, 2006 at 08:49:04AM +0100, Keir Fraser wrote:
> 
> The alternatives are unattractive:
>  1. We have no good way to distinguish interrupts caused by packets 
> from local VMs versus packets from remote hosts. Both get muxed on the 
> same virtual interface.
>  2. An entropy front/back is tricky -- how do we decide how much 
> entropy to pull from domain0? How much should domain0 be prepared to 
> give other domains? How easy is it to DoS domain0 by draining its 
> entropy pool? Yuk.

IMHO there just isn't enough real entropy to go around in one physical
machine without a proper HRNG.  So either use urandom in all the guests
or for those that really have to use /dev/random, install a hardware
RNG (or wait for it :).

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-11  7:47     ` [Xen-devel] " Gerd Hoffmann
@ 2006-05-11  8:51       ` Chris Wright
  2006-05-11  9:06         ` Gerd Hoffmann
  0 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-05-11  8:51 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: Zachary Amsden, Chris Wright, virtualization, Christian Limpach,
	xen-devel, linux-kernel, Ian Pratt

* Gerd Hoffmann (kraxel@suse.de) wrote:
> I fully agree.  Attached below is a patch (against xen unstable
> mercurial tree) which does exactly that ;)

Thanks Gerd, I thought you had been working on that.  Was the concern
with vaddr vs. paddr worked out?

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-11  8:51       ` Chris Wright
@ 2006-05-11  9:06         ` Gerd Hoffmann
  0 siblings, 0 replies; 185+ messages in thread
From: Gerd Hoffmann @ 2006-05-11  9:06 UTC (permalink / raw)
  To: Chris Wright
  Cc: Zachary Amsden, virtualization, Christian Limpach, xen-devel,
	linux-kernel, Ian Pratt

Chris Wright wrote:
> * Gerd Hoffmann (kraxel@suse.de) wrote:
>> I fully agree.  Attached below is a patch (against xen unstable
>> mercurial tree) which does exactly that ;)
> 
> Thanks Gerd, I thought you had been working on that.  Was the concern
> with vaddr vs. paddr worked out?

Not yet, and I didn't feel comfortable pushing it just before the 3.0.2
release, but I think _now_ would be a good time to finally merge it.
Having physical addresses in paddr seems to be common practice, and IMO
xen should follow that as it makes life easier for everybody.

It's not a big problem that xen guests boot with paging enabled, and as
Zachary already pointed out it's trivial to use virt_base from the
xen_guest elf section to create correct initial page tables.

Even maintaining backward compatibility with some guesswork is possible
as it is _very_ unlikely that the paddr field holds physical addresses
larger than virt_base ;)

cheers,

  Gerd

-- 
Gerd Hoffmann <kraxel@suse.de>
Erst mal heiraten, ein, zwei Kinder, und wenn alles läuft
geh' ich nach drei Jahren mit der Familie an die Börse.
http://www.suse.de/~kraxel/julika-dora.jpeg

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11  7:49             ` Keir Fraser
  2006-05-11  8:04               ` Herbert Xu
@ 2006-05-11  9:47               ` Andi Kleen
  2006-05-11 16:18                 ` Stephen Hemminger
  2006-05-11 16:48                 ` Rick Jones
  1 sibling, 2 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-11  9:47 UTC (permalink / raw)
  To: Keir Fraser, tytso
  Cc: Herbert Xu, xen-devel, ian.pratt, rdreier, linux-kernel, netdev,
	virtualization, chrisw, shemminger

On Thursday 11 May 2006 09:49, Keir Fraser wrote:
> On 11 May 2006, at 01:33, Herbert Xu wrote:
> >> But if sampling virtual events for randomness is really unsafe (is it
> >> really?) then native guests in Xen would also get bad random numbers
> >> and this would need to be somehow addressed.
> >
> > Good point.  I wonder what VMWare does in this situation.
>
> Well, there's not much they can do except maybe jitter interrupt
> delivery. I doubt they do that though.
>
> The original complaint in our case was that we take entropy from
> interrupts caused by other local VMs, as well as external sources.
> There was a feeling that the former was more predictable and could form
> the basis of an attack. I have to say I'm unconvinced: I don't really
> see that it's significantly easier to inject precisely-timed interrupts
> into a local VM. Certainly not to better than +/- a few microseconds.
> As long as you add cycle-counter info to the entropy pool, the least
> significant bits of that will always be noise.

I think I agree - e.g. i would expect the virtual interrupts to have
enough jitter too. Maybe it would be good if someone could
run a few statistics on the resulting numbers?

Ok the randomness added doesn't consist only of the least significant
bits. Currently it adds jiffies+full 32bit cycle count.  I guess if it was
a real problem the code could be changed to leave out the jiffies and 
only add maybe a 8 bit word from the low bits. But that would only
help for the para case because the algorithm for native guests
cannot be changed.

>   2. An entropy front/back is tricky -- how do we decide how much
> entropy to pull from domain0? How much should domain0 be prepared to
> give other domains? How easy is it to DoS domain0 by draining its
> entropy pool? Yuk.

I claim (without having read any code) that in theory you need to have solved 
that problem already in the vTPM @)

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-10 20:30       ` Pavel Machek
@ 2006-05-11 10:34         ` Avi Kivity
  2006-05-11 10:41           ` Andi Kleen
  0 siblings, 1 reply; 185+ messages in thread
From: Avi Kivity @ 2006-05-11 10:34 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Andi Kleen, virtualization, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

Pavel Machek wrote:
> Really? If someone does 
>
> 	if (something)
> 		clearsegment(seg)
> 	somethingelse();
>
> ... he'll get very confusing behaviour instead of compile error. 
>
> Okay, that's weaker argument than expected...
>
> Also clearsegment(x) clearsegment(y); will compile when it should not.
>
> Also clearsegment(i++) will behave strangely. So perhaps 
>
> #define clearsegment(seg) do { seg; } while (0)
>   

static inline void clearsegment(int seg) {}

?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-11 10:34         ` Avi Kivity
@ 2006-05-11 10:41           ` Andi Kleen
  0 siblings, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-11 10:41 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Pavel Machek, virtualization, Chris Wright, xen-devel,
	linux-kernel, Ian Pratt

On Thursday 11 May 2006 12:34, Avi Kivity wrote:
> Pavel Machek wrote:
> > Really? If someone does 
> >
> > 	if (something)
> > 		clearsegment(seg)
> > 	somethingelse();
> >
> > ... he'll get very confusing behaviour instead of compile error. 
> >
> > Okay, that's weaker argument than expected...
> >
> > Also clearsegment(x) clearsegment(y); will compile when it should not.
> >
> > Also clearsegment(i++) will behave strangely. So perhaps 
> >
> > #define clearsegment(seg) do { seg; } while (0)
> >   
> 
> static inline void clearsegment(int seg) {}


It's all moot because the complete function is wrongly named
and probably should just go.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11  9:47               ` Andi Kleen
@ 2006-05-11 16:18                 ` Stephen Hemminger
  2006-05-11 16:48                 ` Rick Jones
  1 sibling, 0 replies; 185+ messages in thread
From: Stephen Hemminger @ 2006-05-11 16:18 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Keir Fraser, tytso, Herbert Xu, xen-devel, ian.pratt, rdreier,
	linux-kernel, netdev, virtualization, chrisw

On Thu, 11 May 2006 11:47:52 +0200
Andi Kleen <ak@suse.de> wrote:

> On Thursday 11 May 2006 09:49, Keir Fraser wrote:
> > On 11 May 2006, at 01:33, Herbert Xu wrote:
> > >> But if sampling virtual events for randomness is really unsafe (is it
> > >> really?) then native guests in Xen would also get bad random numbers
> > >> and this would need to be somehow addressed.
> > >
> > > Good point.  I wonder what VMWare does in this situation.
> >
> > Well, there's not much they can do except maybe jitter interrupt
> > delivery. I doubt they do that though.
> >
> > The original complaint in our case was that we take entropy from
> > interrupts caused by other local VMs, as well as external sources.
> > There was a feeling that the former was more predictable and could form
> > the basis of an attack. I have to say I'm unconvinced: I don't really
> > see that it's significantly easier to inject precisely-timed interrupts
> > into a local VM. Certainly not to better than +/- a few microseconds.
> > As long as you add cycle-counter info to the entropy pool, the least
> > significant bits of that will always be noise.
> 
> I think I agree - e.g. i would expect the virtual interrupts to have
> enough jitter too. Maybe it would be good if someone could
> run a few statistics on the resulting numbers?
> 
> Ok the randomness added doesn't consist only of the least significant
> bits. Currently it adds jiffies+full 32bit cycle count.  I guess if it was
> a real problem the code could be changed to leave out the jiffies and 
> only add maybe a 8 bit word from the low bits. But that would only
> help for the para case because the algorithm for native guests
> cannot be changed.
> 
> >   2. An entropy front/back is tricky -- how do we decide how much
> > entropy to pull from domain0? How much should domain0 be prepared to
> > give other domains? How easy is it to DoS domain0 by draining its
> > entropy pool? Yuk.
> 
> I claim (without having read any code) that in theory you need to have solved 
> that problem already in the vTPM @)
> 

The base question under all this is "how good does an entropy source have
to be?" and then "what guarantees do we make about the entropy inputs used
by /dev/random?".  If we can resolve those, then the virtual environment
answer should fall out.

This is an area where the security tin-foil hat types take over, and it
gets real hard to make a "good enough" argument. People have built an expectation
that /dev/random has really strong entropy, good enough to generate long term
keys etc.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-10 23:28   ` Zachary Amsden
  2006-05-11  7:47     ` [Xen-devel] " Gerd Hoffmann
@ 2006-05-11 16:43     ` Christian Limpach
  2006-05-12  6:47       ` [Xen-devel] " Jan Beulich
  1 sibling, 1 reply; 185+ messages in thread
From: Christian Limpach @ 2006-05-11 16:43 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt

On Wed, May 10, 2006 at 04:28:51PM -0700, Zachary Amsden wrote:
> Chris Wright wrote:
> >Change LOAD_OFFSET so that the kernel has virtual addresses in the elf 
> >header fields.
> >
> >Unlike bare metal kernels, Xen kernels start with virtual address
> >management turned on and thus the addresses to load to should be
> >virtual addresses.
> 
> This patch interferes with using a traditional bootloader.  The loader 
> for Xen should be smarter - it already has VIRT_BASE from the xen_guest 
> section, and can simply add the relocation to these header fields.  This 
> is unnecessary, and one of the many reasons a Xen kernel can't run in a 
> normal environment.

It's certainly not as simple as you make it sound, if you want to
support existing kernels without having to guess how the kernel image
was built.

I've updated our loader to support this now, so that this patch is
no longer necessary.  I have at the same time added a new field to
xen_guest which allows specifying the entry point, allowing us to have
a different entry point when running the kernel image on Xen.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11  9:47               ` Andi Kleen
  2006-05-11 16:18                 ` Stephen Hemminger
@ 2006-05-11 16:48                 ` Rick Jones
  2006-05-11 16:55                   ` Stephen Hemminger
  2006-05-11 17:30                   ` Andi Kleen
  1 sibling, 2 replies; 185+ messages in thread
From: Rick Jones @ 2006-05-11 16:48 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Keir Fraser, tytso, Herbert Xu, xen-devel, ian.pratt, rdreier,
	linux-kernel, netdev, virtualization, chrisw, shemminger

 From the peanut gallery...

Can remote TCP ISN's be considered a source of entropy these days?  How 
about checksums?

rick

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11 16:48                 ` Rick Jones
@ 2006-05-11 16:55                   ` Stephen Hemminger
  2006-05-11 17:30                   ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Stephen Hemminger @ 2006-05-11 16:55 UTC (permalink / raw)
  To: linux-kernel

On Thu, 11 May 2006 09:48:11 -0700
Rick Jones <rick.jones2@hp.com> wrote:

>  From the peanut gallery...
> 
> Can remote TCP ISN's be considered a source of entropy these days?  How 
> about checksums?
> 
> rick

No, they are spoofable.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 34/35] Add the Xen virtual network device driver.
  2006-05-11 16:48                 ` Rick Jones
  2006-05-11 16:55                   ` Stephen Hemminger
@ 2006-05-11 17:30                   ` Andi Kleen
  1 sibling, 0 replies; 185+ messages in thread
From: Andi Kleen @ 2006-05-11 17:30 UTC (permalink / raw)
  To: Rick Jones
  Cc: Keir Fraser, tytso, Herbert Xu, xen-devel, ian.pratt, rdreier,
	linux-kernel, netdev, virtualization, chrisw, shemminger

On Thursday 11 May 2006 18:48, Rick Jones wrote:
>  From the peanut gallery...
> 
> Can remote TCP ISN's be considered a source of entropy these days?  How 
> about checksums?

Indirectly - we measure how long it takes to compute them.

-Andi

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  7:16   ` Pavel Machek
  2006-05-10 20:09     ` Andi Kleen
@ 2006-05-12  0:28     ` Rusty Russell
  1 sibling, 0 replies; 185+ messages in thread
From: Rusty Russell @ 2006-05-12  0:28 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Chris Wright, virtualization, Christian Limpach, xen-devel,
	linux-kernel, Ian Pratt

On Tue, 2006-05-09 at 07:16 +0000, Pavel Machek wrote:
> Hi!
> 
> > --- linus-2.6.orig/include/asm-i386/mach-default/mach_system.h
> > +++ linus-2.6/include/asm-i386/mach-default/mach_system.h
> > @@ -1,6 +1,8 @@
> >  #ifndef __ASM_MACH_SYSTEM_H
> >  #define __ASM_MACH_SYSTEM_H
> >  
> > +#define clearsegment(seg)
> 
> do {} while (0), please.

It's off-topic, but: why?

Rusty.
-- 
 ccontrol: http://ccontrol.ozlabs.org


^ permalink raw reply	[flat|nested] 185+ messages in thread

* [Xen-devel] Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-11 16:43     ` Christian Limpach
@ 2006-05-12  6:47       ` Jan Beulich
  2006-05-12  8:38         ` Christian Limpach
  0 siblings, 1 reply; 185+ messages in thread
From: Jan Beulich @ 2006-05-12  6:47 UTC (permalink / raw)
  To: Christian Limpach
  Cc: virtualization, xen-devel, Chris Wright, linux-kernel,
	Zachary Amsden, Ian Pratt

>I've updated our loader to support this now, so that this patch is
>no longer necessary.  I have at the same time added a new field to
>xen_guest which allows specifying the entry point, allowing us to have
>a different entry point when running the kernel image on Xen.

Why do you need a separate entry point here? The code should be able to figure out which mode it is run in without
problems...

Jan

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [Xen-devel] Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-05-12  6:47       ` [Xen-devel] " Jan Beulich
@ 2006-05-12  8:38         ` Christian Limpach
  0 siblings, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-12  8:38 UTC (permalink / raw)
  To: Jan Beulich
  Cc: virtualization, xen-devel, Chris Wright, linux-kernel,
	Zachary Amsden, Ian Pratt

On Fri, May 12, 2006 at 08:47:13AM +0200, Jan Beulich wrote:
> >I've updated our loader to support this now, so that this patch is
> >no longer necessary.  I have at the same time added a new field to
> >xen_guest which allows specifying the entry point, allowing us to have
> >a different entry point when running the kernel image on Xen.
> 
> Why do you need a separate entry point here? The code should be able to figure out which mode it is run in without
> problems...

I think it's the cleanest way to have different startup code for
native and non-native in the same kernel.  But even if that's not
needed (for Linux), then you can have it point at the same address.
It is also always pointing to a virtual address, while the elf header
one now points to a physical address which doesn't make much sense
in the environment we start the kernel.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 24/35] Add support for Xen event channels.
  2006-05-09  7:00 ` [RFC PATCH 24/35] Add support for Xen event channels Chris Wright
@ 2006-05-12 21:41   ` Pavel Machek
  2006-05-13 12:27   ` Andrew Morton
  1 sibling, 0 replies; 185+ messages in thread
From: Pavel Machek @ 2006-05-12 21:41 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi!

> +++ linus-2.6/drivers/xen/core/evtchn.c
> @@ -0,0 +1,887 @@

msssng wwls?

> +/* NB. Interrupts are disabled on entry. */
> +asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
> +{
> +	unsigned long  l1, l2;
> +	unsigned int   l1i, l2i, port;

Better variable names would be nice.


						Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 25/35] Add Xen time abstractions
  2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
  2006-05-09 16:23   ` Daniel Walker
  2006-05-09 21:50   ` Andi Kleen
@ 2006-05-12 21:44   ` Pavel Machek
  2 siblings, 0 replies; 185+ messages in thread
From: Pavel Machek @ 2006-05-12 21:44 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi!

> --- /dev/null
> +++ linus-2.6/drivers/xen/core/time.c
> @@ -0,0 +1,1045 @@
> +/*
> + *  time.c
> + *
> + *  Copyright (C) 1991, 1992, 1995  Linus Torvalds

Really?

> +void init_cpu_khz(void)
> +{
> +	u64 __cpu_khz = 1000000ULL << 32;
> +	struct vcpu_time_info *info;
> +	info = &HYPERVISOR_shared_info->vcpu_info[0].time;

No, I do not think linus wrote that. You probably want to add your
copyright there, and remove obsolete changelog.

-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 26/35] Add Xen subarch reboot support
  2006-05-09 17:02   ` Andi Kleen
@ 2006-05-12 21:46     ` Pavel Machek
  2006-05-12 21:57       ` Chris Wright
  0 siblings, 1 reply; 185+ messages in thread
From: Pavel Machek @ 2006-05-12 21:46 UTC (permalink / raw)
  To: Andi Kleen
  Cc: virtualization, Chris Wright, linux-kernel, xen-devel, Ian Pratt

Hi!

> > +
> > +/* Ignore multiple shutdown requests. */
> > +static int shutting_down = SHUTDOWN_INVALID;
> > +static void __shutdown_handler(void *unused);
> > +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
> > +
> > +static int shutdown_process(void *__unused)
> > +{
> > +	static char *envp[] = { "HOME=/", "TERM=linux",
> > +				"PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
> > +	static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
> 
> This should be configurable, probably in a sysctl

Actually we have similar code in sparc and acpi parts, IIRC. We
probably want to have one, common, shut-me-off routine.

-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 28/35] add support for Xen feature queries
  2006-05-09  7:00 ` [RFC PATCH 28/35] add support for Xen feature queries Chris Wright
@ 2006-05-12 21:56   ` Pavel Machek
  0 siblings, 0 replies; 185+ messages in thread
From: Pavel Machek @ 2006-05-12 21:56 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Hi!

> Add support for parsing and interpreting hypervisor feature
> flags. These allow the kernel to determine what features are provided
> by the underlying hypervisor. For example, whether page tables need to
> be write protected explicitly by the kernel, and whether the kernel
> (appears to) run in ring 0 rather than ring 1. This information allows
> the kernel to improve performance by avoiding unnecessary actions.


> --- /dev/null
> +++ linus-2.6/include/xen/features.h
> @@ -0,0 +1,20 @@
> +/******************************************************************************
> + * features.h
> + *
> + * Query the features reported by Xen.
> + *
> + * Copyright (c) 2006, Ian Campbell
> + */
> +
> +#ifndef __ASM_XEN_FEATURES_H__
> +#define __ASM_XEN_FEATURES_H__
> +
> +#include <xen/interface/version.h>
> +
> +extern void setup_xen_features(void);
> +
> +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];

32 bytes per submap? Why not use __test_bit & friends and make the
bitmap compact?

> +#define xen_feature(flag)	(xen_features[flag])

Perhaps this kind of indirection is not necessary?

> --- /dev/null
> +++ linus-2.6/drivers/xen/core/features.c
> @@ -0,0 +1,29 @@
> +/******************************************************************************
> + * features.c
> + *
> + * Xen feature flags.
> + *
> + * Copyright (c) 2006, Ian Campbell, XenSource Inc.

GPL?
						Pavel
-- 
Thanks for all the (sleeping) penguins.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 26/35] Add Xen subarch reboot support
  2006-05-12 21:46     ` Pavel Machek
@ 2006-05-12 21:57       ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-12 21:57 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Andi Kleen, virtualization, Chris Wright, linux-kernel,
	xen-devel, Ian Pratt

* Pavel Machek (pavel@suse.cz) wrote:
> Hi!
> 
> > > +
> > > +/* Ignore multiple shutdown requests. */
> > > +static int shutting_down = SHUTDOWN_INVALID;
> > > +static void __shutdown_handler(void *unused);
> > > +static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
> > > +
> > > +static int shutdown_process(void *__unused)
> > > +{
> > > +	static char *envp[] = { "HOME=/", "TERM=linux",
> > > +				"PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
> > > +	static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
> > 
> > This should be configurable, probably in a sysctl
> 
> Actually we have similar code in sparc and acpi parts, IIRC. We
> probably want to have one, common, shut-me-off routine.

Yep, I had that cleanup in mind, the patch said:

TODO:
 - move poweroff and halt to generic similar to c_a_d

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 16/35] subarch support for interrupt and exception gates
  2006-05-09  7:00 ` [RFC PATCH 16/35] subarch support for interrupt and exception gates Chris Wright
  2006-05-09 11:09   ` Andi Kleen
@ 2006-05-13 12:27   ` Andrew Morton
  2006-05-15 18:30     ` Chris Wright
  1 sibling, 1 reply; 185+ messages in thread
From: Andrew Morton @ 2006-05-13 12:27 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, ian.pratt, Christian.Limpach

On Tue, 09 May 2006 00:00:16 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> --- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_pre.h
> +++ linus-2.6/include/asm-i386/mach-xen/setup_arch_pre.h
> @@ -5,6 +5,8 @@
>  struct start_info *xen_start_info;
>  EXPORT_SYMBOL(xen_start_info);
>  
> +struct trap_info xen_trap_table[257];
> +
>  /*
>   * Point at the empty zero page to start with. We map the real shared_info
>   * page as soon as fixmap is up and running.

Is there any particular reason why things-which-should-be-in-a-C-file are
present in a .h file?

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 24/35] Add support for Xen event channels.
  2006-05-09  7:00 ` [RFC PATCH 24/35] Add support for Xen event channels Chris Wright
  2006-05-12 21:41   ` Pavel Machek
@ 2006-05-13 12:27   ` Andrew Morton
  2006-05-13 13:02     ` Keir Fraser
  1 sibling, 1 reply; 185+ messages in thread
From: Andrew Morton @ 2006-05-13 12:27 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, ian.pratt, Christian.Limpach

On Tue, 09 May 2006 00:00:24 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> +void __init init_IRQ(void)
> +{
> +	int i;
> +	int cpu;
> +
> +	irq_ctx_init(0);
> +
> +	spin_lock_init(&irq_mapping_update_lock);

May as well initialise this at compile time.

> +	init_evtchn_cpu_bindings();
> +
> +	/* No VIRQ or IPI bindings. */
> +	for (cpu = 0; cpu < NR_CPUS; cpu++) {

Using NR_CPUS is a little...  old-fashioned.  I'd suggest a sweep through
all the Xen code, look for places where it should be using
for_each_foo_cpu().




^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-09  7:00 ` [RFC PATCH 29/35] Add the Xen virtual console driver Chris Wright
  2006-05-09 13:26   ` Andi Kleen
@ 2006-05-13 12:27   ` Andrew Morton
  2006-05-13 12:51     ` Nick Piggin
  1 sibling, 1 reply; 185+ messages in thread
From: Andrew Morton @ 2006-05-13 12:27 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, ian.pratt, Christian.Limpach

On Tue, 09 May 2006 00:00:29 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> This provides a bootstrap and ongoing emergency console which is
> intended to be available from very early during boot and at all times
> thereafter, in contrast with alternatives such as UDP-based syslogd,
> or logging in via ssh. The protocol is based on a simple shared-memory
> ring buffer.
>
> ...
>
> +/* The kernel and user-land drivers share a common transmit buffer. */
> +static unsigned int wbuf_size = 4096;
> +#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
> +static char *wbuf;
> +static unsigned int wc, wp; /* write_cons, write_prod */
> +
> +static int __init xencons_bufsz_setup(char *str)
> +{
> +	unsigned int goal;
> +	goal = simple_strtoul(str, NULL, 0);
> +	while (wbuf_size < goal)
> +		wbuf_size <<= 1;

roundup_pow_of_two()

> +/* This lock protects accesses to the common transmit buffer. */
> +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED;

DEFINE_SPINLOCK()  (entire patchset)

> +
> +static void kcons_write(
> +	struct console *c, const char *s, unsigned int count)
> +{
> +	int           i = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&xencons_lock, flags);
> +
> +	while (i < count) {
> +		for (; i < count; i++) {
> +			if ((wp - wc) >= (wbuf_size - 1))
> +				break;
> +			if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
> +				wbuf[WBUF_MASK(wp++)] = '\r';
> +		}
> +
> +		__xencons_tx_flush();
> +	}
> +
> +	spin_unlock_irqrestore(&xencons_lock, flags);
> +}

hm.  You have all that elaborate generate-ringbuffer-code-with-C-macros
stuff in the header file patch, yet this code (blessedly) doesn't use it.

> +static void kcons_write_dom0(
> +	struct console *c, const char *s, unsigned int count)
> +{
> +	int rc;
> +
> +	while ((count > 0) &&
> +	       ((rc = HYPERVISOR_console_io(
> +			CONSOLEIO_write, count, (char *)s)) > 0)) {
> +		count -= rc;
> +		s += rc;
> +	}
> +}

must.. not.. mention.. coding.. style..



^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver.
  2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
                     ` (2 preceding siblings ...)
  2006-05-09 19:49   ` Greg KH
@ 2006-05-13 12:28   ` Andrew Morton
  3 siblings, 0 replies; 185+ messages in thread
From: Andrew Morton @ 2006-05-13 12:28 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, ian.pratt, Christian.Limpach

On Tue, 09 May 2006 00:00:33 -0700
Chris Wright <chrisw@sous-sol.org> wrote:

> This communicates with the machine control software via a registry
> residing in a controlling virtual machine. This allows dynamic
> creation, destruction and modification of virtual device
> configurations (network devices, block devices and CPUS, to name some
> examples).
> 
>
> ...
>
> +void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
> +		va_list ap)

I don't think this needs global scope?  (hopefully not, with that name..)

> +	int ret;
> +	unsigned int len;
> +	char *printf_buffer = NULL, *path_buffer = NULL;

	char *printf_buffer;
	char *path_buffer = NULL;

> +#define PRINTF_BUFFER_SIZE 4096
> +	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);

Assuming that GFP_KERNEL is legal in this context seems like a bad idea.

<looks>

hm, it does that all over the place, so I guess it works.

> +/* Based on Rusty Russell's skeleton driver's unmap_page */
> +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
> +{
> +	struct vm_struct *area;
> +	struct gnttab_unmap_grant_ref op = {
> +		.host_addr = (unsigned long)vaddr,
> +	};
> +
> +	/* It'd be nice if linux/vmalloc.h provided a find_vm_area(void *addr)
> +	 * method so that we don't have to muck with vmalloc internals here.

We take patches ;)

But then, perhaps the requirement doesn't make a lot of sense in a
multithreaded environment.  We don't refcount the entries on vmlist, so
there's no point in being able to look them up.  Instead, the calling code
is supposed to be able to keep track of its own state.

Which begs the question: why isn't this code able to do that thing?

> +	 * We could force the user to hang on to their struct vm_struct from
> +	 * xenbus_map_ring_valloc, but these 6 lines considerably simplify
> +	 * this API.
> +	 */
> +	read_lock(&vmlist_lock);
> +	for (area = vmlist; area != NULL; area = area->next) {
> +		if (area->addr == vaddr)
> +			break;
> +	}
> +	read_unlock(&vmlist_lock);
> +
> +	if (!area) {
> +		xenbus_dev_error(dev, -ENOENT,
> +				 "can't find mapped virtual address %p", vaddr);
> +		return GNTST_bad_virt_addr;
> +	}

One assumes there's some locking hereabouts which ensures that `area' is
still on that list after vmlist_lock got dropped?

> +
> +static void *get_output_chunk(XENSTORE_RING_IDX cons,
> +			      XENSTORE_RING_IDX prod,
> +			      char *buf, uint32_t *len)
> +{
> +	*len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod);
> +	if ((XENSTORE_RING_SIZE - (prod - cons)) < *len)
> +		*len = XENSTORE_RING_SIZE - (prod - cons);
> +	return buf + MASK_XENSTORE_IDX(prod);
> +}

Another open-coded ringbuffer?  Am still seeking the user of the
interesting ring.h.



^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-13 12:27   ` Andrew Morton
@ 2006-05-13 12:51     ` Nick Piggin
  2006-05-13 14:29       ` Andrew Morton
  0 siblings, 1 reply; 185+ messages in thread
From: Nick Piggin @ 2006-05-13 12:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, ian.pratt,
	Christian.Limpach

Andrew Morton wrote:

>>+static void kcons_write_dom0(
>>+	struct console *c, const char *s, unsigned int count)
>>+{
>>+	int rc;
>>+
>>+	while ((count > 0) &&
>>+	       ((rc = HYPERVISOR_console_io(
>>+			CONSOLEIO_write, count, (char *)s)) > 0)) {
>>+		count -= rc;
>>+		s += rc;
>>+	}
>>+}
> 
> 
> must.. not.. mention.. coding.. style..

Someone should write you a script to go through a patch and flag the
most common style mistakes. Have the output formatted to look like
you're replying to the mail, and wire it up to your inbox ;)

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 24/35] Add support for Xen event channels.
  2006-05-13 12:27   ` Andrew Morton
@ 2006-05-13 13:02     ` Keir Fraser
  0 siblings, 0 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-13 13:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: virtualization, linux-kernel, xen-devel, ian.pratt, Chris Wright


On 13 May 2006, at 13:27, Andrew Morton wrote:

>
>> +	init_evtchn_cpu_bindings();
>> +
>> +	/* No VIRQ or IPI bindings. */
>> +	for (cpu = 0; cpu < NR_CPUS; cpu++) {
>
> Using NR_CPUS is a little...  old-fashioned.  I'd suggest a sweep through
> all the Xen code, look for places where it should be using
> for_each_foo_cpu().

Actually that's a particularly good catch in this case, since we use 
per_cpu() inside the loop and that's only well defined for 
cpu_possible_map. Oops.

The elusive users of ring.h are our split device drivers. It hides a 
bunch of details about muxing requests and responses on the same ring, 
and notification thresholds. There are a few other places we have ring 
buffers but they are sufficiently simple that implementing in place is 
clearer.

  Thanks,
  Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-13 12:51     ` Nick Piggin
@ 2006-05-13 14:29       ` Andrew Morton
  2006-05-13 14:43         ` Nick Piggin
  0 siblings, 1 reply; 185+ messages in thread
From: Andrew Morton @ 2006-05-13 14:29 UTC (permalink / raw)
  To: Nick Piggin
  Cc: chrisw, linux-kernel, virtualization, xen-devel, ian.pratt,
	Christian.Limpach

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Andrew Morton wrote:
> 
> >>+static void kcons_write_dom0(
> >>+	struct console *c, const char *s, unsigned int count)
> >>+{
> >>+	int rc;
> >>+
> >>+	while ((count > 0) &&
> >>+	       ((rc = HYPERVISOR_console_io(
> >>+			CONSOLEIO_write, count, (char *)s)) > 0)) {
> >>+		count -= rc;
> >>+		s += rc;
> >>+	}
> >>+}
> > 
> > 
> > must.. not.. mention.. coding.. style..
> 
> Someone should write you a script to go through a patch and flag the
> most common style mistakes. Have the output formatted to look like
> you're replying to the mail, and wire it up to your inbox ;)
> 

Even better, someone should write a coding style document, so people get it
right from the outset.

Clever, aren't I?

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 29/35] Add the Xen virtual console driver.
  2006-05-13 14:29       ` Andrew Morton
@ 2006-05-13 14:43         ` Nick Piggin
  0 siblings, 0 replies; 185+ messages in thread
From: Nick Piggin @ 2006-05-13 14:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: chrisw, linux-kernel, virtualization, xen-devel, ian.pratt,
	Christian.Limpach

Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 

>>Someone should write you a script to go through a patch and flag the
>>most common style mistakes. Have the output formatted to look like
>>you're replying to the mail, and wire it up to your inbox ;)
>>
> 
> 
> Even better, someone should write a coding style document, so people get it
> right from the outset.

I thought that was tried several years back -- I noticed you still
do it manually, so I just assumed that the style document scheme
hadn't worked.

> 
> Clever, aren't I?
> 

Yes... but I don't think it's your cleverness that's the problem ;)

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-09 15:22       ` Christoph Hellwig
  2006-05-09 15:45         ` Pekka Enberg
@ 2006-05-14  1:35         ` Andrew Morton
  2006-05-15 21:01           ` Chris Wright
  1 sibling, 1 reply; 185+ messages in thread
From: Andrew Morton @ 2006-05-14  1:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: ak, hch, chrisw, linux-kernel, virtualization, xen-devel

Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, May 09, 2006 at 05:20:11PM +0200, Andi Kleen wrote:
> > > It's also wrong.  There's more than one hypervisor and Xen shouldn't just
> > > grab this namespace.  make it xen_ or xenhv_.
> > 
> > You should reject the recent "hypervisor file system" with the same
> > argument then.
> 
> I prefer it would become lparfs or something like that indeed.

Yes, it did get renamed to something s390-specific.

Also, note

	http://www.kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/gregkh-01-driver/driver-core-add-sys-hypervisor-when-needed.patch

which creates the /sys/hypervisor directory.  With the expectation that
_all_ hypervisorish subsystems will base their sysfs trees in there.


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
@ 2006-05-15  6:44   ` Pete Zaitcev
  2006-05-15  7:04     ` Keir Fraser
  2006-05-15  8:19     ` Christian Limpach
  2006-05-17 16:06   ` Pete Zaitcev
  2 siblings, 2 replies; 185+ messages in thread
From: Pete Zaitcev @ 2006-05-15  6:44 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, Christian.Limpach, xen-devel,
	ian.pratt, zaitcev

On Tue, 09 May 2006 00:00:08 -0700, Chris Wright <chrisw@sous-sol.org> wrote:

I'm a little concerned with the code below being entirely too smart:

> +static inline unsigned long mfn_to_pfn(unsigned long mfn)
> +{
> +#ifndef CONFIG_XEN_SHADOW_MODE
> +	unsigned long pfn;
> +
> +	if (xen_feature(XENFEAT_auto_translated_physmap))
> +		return mfn;
> +
> +	/*
> +	 * The array access can fail (e.g., device space beyond end of RAM).
> +	 * In such cases it doesn't matter what we return (we return garbage),
> +	 * but we must handle the fault without crashing!
> +	 */
> +	asm (
> +		"1:	movl %1,%0\n"
> +		"2:\n"
> +		".section __ex_table,\"a\"\n"
> +		"	.align 4\n"
> +		"	.long 1b,2b\n"
> +		".previous"
> +		: "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
> +
> +	return pfn;
> +#else
> +	return mfn;
> +#endif
> +}

I found that if someone tries to use this too early, the hypervisor terminates
the domain with a message like this:

(XEN) DOM0: (file=mm.c, line=486) Non-privileged attempt to map I/O space 000fec00

Why can't we use something like this, only a proper number of machine
pages:

diff -urp -X dontdiff linux-2.6.15-1.2054_FC5/include/asm-x86_64/mach-xen/asm/page.h linux-2.6.15-1.2054_FC5.z3/include/asm-x86_64/mach-xen/asm/page.h
--- linux-2.6.15-1.2054_FC5/include/asm-x86_64/mach-xen/asm/page.h	2006-03-17 17:52:38.000000000 -0800
+++ linux-2.6.15-1.2054_FC5.z3/include/asm-x86_64/mach-xen/asm/page.h	2006-05-11 20:36:16.000000000 -0700
@@ -101,26 +101,11 @@ static inline int phys_to_machine_mappin
 
 static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
-	unsigned long pfn;
-
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return mfn;
-
-	/*
-	 * The array access can fail (e.g., device space beyond end of RAM).
-	 * In such cases it doesn't matter what we return (we return garbage),
-	 * but we must handle the fault without crashing!
-	 */
-	asm (
-		"1:	movq %1,%0\n"
-		"2:\n"
-		".section __ex_table,\"a\"\n"
-		"	.align 8\n"
-		"	.quad 1b,2b\n"
-		".previous"
-		: "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) );
-
-	return pfn;
+	if (mfn >= 1048576)	/* 4GB _machine_ RAM. XXX How to find out? */
+		return ~0;
+	return machine_to_phys_mapping[mfn];
 }
 
 /*

I'm sure you considered this, but decided to be tricky. Why?
No way to find the safe number of machine pages in a guest?

-- Pete

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-15  6:44   ` Pete Zaitcev
@ 2006-05-15  7:04     ` Keir Fraser
  2006-05-15  8:19     ` Christian Limpach
  1 sibling, 0 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-15  7:04 UTC (permalink / raw)
  To: Pete Zaitcev
  Cc: virtualization, linux-kernel, xen-devel, ian.pratt, Chris Wright


On 15 May 2006, at 07:44, Pete Zaitcev wrote:

> I'm sure you considered this, but decided to be tricky. Why?
> No way to find the safe number of machine pages in a guest?

We want to allow holes in the table if RAM is sparse. That code 
shouldn't ever fail after the guest has installed a page-fault handler. 
If you can make it do so (was it i386 or x86/64?) we're interested in 
seeing the full crash output.

  -- Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-15  6:44   ` Pete Zaitcev
  2006-05-15  7:04     ` Keir Fraser
@ 2006-05-15  8:19     ` Christian Limpach
  1 sibling, 0 replies; 185+ messages in thread
From: Christian Limpach @ 2006-05-15  8:19 UTC (permalink / raw)
  To: Pete Zaitcev
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, ian.pratt

On Sun, May 14, 2006 at 11:44:18PM -0700, Pete Zaitcev wrote:
> I'm sure you considered this, but decided to be tricky. Why?
> No way to find the safe number of machine pages in a guest?

In addition to wanting to support holes, the number of machine
pages will usually change when you move it to another machine.

    christian


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 16/35] subarch support for interrupt and exception gates
  2006-05-13 12:27   ` Andrew Morton
@ 2006-05-15 18:30     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-15 18:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, ian.pratt,
	Christian.Limpach

* Andrew Morton (akpm@osdl.org) wrote:
> On Tue, 09 May 2006 00:00:16 -0700
> Chris Wright <chrisw@sous-sol.org> wrote:
> 
> > --- linus-2.6.orig/include/asm-i386/mach-xen/setup_arch_pre.h
> > +++ linus-2.6/include/asm-i386/mach-xen/setup_arch_pre.h
> > @@ -5,6 +5,8 @@
> >  struct start_info *xen_start_info;
> >  EXPORT_SYMBOL(xen_start_info);
> >  
> > +struct trap_info xen_trap_table[257];
> > +
> >  /*
> >   * Point at the empty zero page to start with. We map the real shared_info
> >   * page as soon as fixmap is up and running.
> 
> Is there any particular reason why things-which-should-be-in-a-C-file are
> present in a .h file?

It's following direction of current subarch interaction with setup.c
(namely the setup_arch_post.h).  It's definitely not so nice.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 00/35] Xen i386 paravirtualization support
  2006-05-14  1:35         ` Andrew Morton
@ 2006-05-15 21:01           ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-15 21:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, ak, chrisw, linux-kernel, virtualization, xen-devel

* Andrew Morton (akpm@osdl.org) wrote:
> Also, note
> 
> 	http://www.kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/gregkh-01-driver/driver-core-add-sys-hypervisor-when-needed.patch
> 
> which creates the /sys/hypervisor directory.  With the expectation that
> _all_ hypervisorish subsystems will base their sysfs trees in there.

That's fine, Xen is currently using the same dir, so we just need to
adjust and select Kconfig.  There's some fallout as a result (using
attrs directly vs spinning your own fs, for example), but it makes sense
to have common anchor in sysfs.

thanks,
-chris

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
  2006-05-09 14:49   ` Martin J. Bligh
  2006-05-15  6:44   ` Pete Zaitcev
@ 2006-05-17 16:06   ` Pete Zaitcev
  2006-05-18  7:42     ` Chris Wright
  2 siblings, 1 reply; 185+ messages in thread
From: Pete Zaitcev @ 2006-05-17 16:06 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, Christian.Limpach, xen-devel,
	ian.pratt, zaitcev

On Tue, 09 May 2006 00:00:08 -0700, Chris Wright <chrisw@sous-sol.org> wrote:

> +static inline unsigned long pfn_to_mfn(unsigned long pfn)
> +{
> +#ifndef CONFIG_XEN_SHADOW_MODE
> +	if (xen_feature(XENFEAT_auto_translated_physmap))
> +		return pfn;
> +	return phys_to_machine_mapping[(unsigned int)(pfn)] &
> +		~FOREIGN_FRAME_BIT;
> +#else
> +	return pfn;
> +#endif
> +}

Why do we need several modes in Linux guests?

If a significant tradeoff exists (for example, between performance
and maximum addressable memory), then we need to think about the
real issue instead of throwing config options into the pot.

-- Pete

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 08/35] Add Xen-specific memory management definitions
  2006-05-17 16:06   ` Pete Zaitcev
@ 2006-05-18  7:42     ` Chris Wright
  0 siblings, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-18  7:42 UTC (permalink / raw)
  To: Pete Zaitcev
  Cc: Chris Wright, linux-kernel, virtualization, Christian.Limpach,
	xen-devel, ian.pratt

* Pete Zaitcev (zaitcev@redhat.com) wrote:
> On Tue, 09 May 2006 00:00:08 -0700, Chris Wright <chrisw@sous-sol.org> wrote:
> 
> > +static inline unsigned long pfn_to_mfn(unsigned long pfn)
> > +{
> > +#ifndef CONFIG_XEN_SHADOW_MODE
> > +	if (xen_feature(XENFEAT_auto_translated_physmap))
> > +		return pfn;
> > +	return phys_to_machine_mapping[(unsigned int)(pfn)] &
> > +		~FOREIGN_FRAME_BIT;
> > +#else
> > +	return pfn;
> > +#endif
> > +}
> 
> Why do we need several modes in Linux guests?

This patchset only supports shadow translated mode, so the extra CONFIG is
just an artifact of this simplified patchset.  Ultimately, this is not the
preferred mode (performance-wise), but shadow mode is simpler to port to.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
  2006-05-09  7:16   ` Pavel Machek
  2006-05-09 16:44   ` Andi Kleen
@ 2006-05-18 20:20   ` Zachary Amsden
  2006-05-18 20:41     ` Keir Fraser
  2006-05-18 21:26     ` Chris Wright
  2 siblings, 2 replies; 185+ messages in thread
From: Zachary Amsden @ 2006-05-18 20:20 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, virtualization, xen-devel, Ian Pratt, Christian Limpach

Chris Wright wrote:
> 1. We clear FS/GS before changing TLS entries and switching LDT, as
> otherwise the hypervisor will fail to restore thread-local values on
> return to the guest kernel and we take a slow exception path.

> @@ -647,6 +647,8 @@ struct task_struct fastcall * __switch_t
>  	 */
>  	savesegment(fs, prev->fs);
>  	savesegment(gs, prev->gs);
> +	clearsegment(fs);
> +	clearsegment(gs);
>   

Really not needed.  Think about it.  You can even speed up Xen.  I'm 
glad the native operation here is a nop, but it should be 
hypervisor_clearsegment or xen_clearsegment if you really want to keep it.

Zach

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-18 20:20   ` Zachary Amsden
@ 2006-05-18 20:41     ` Keir Fraser
  2006-05-18 21:26     ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Keir Fraser @ 2006-05-18 20:41 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: virtualization, Ian Pratt, xen-devel, linux-kernel, Chris Wright


On 18 May 2006, at 21:20, Zachary Amsden wrote:

>> 1. We clear FS/GS before changing TLS entries and switching LDT, as
>> otherwise the hypervisor will fail to restore thread-local values on
>> return to the guest kernel and we take a slow exception path.
>
>> @@ -647,6 +647,8 @@ struct task_struct fastcall * __switch_t
>>  	 */
>>  	savesegment(fs, prev->fs);
>>  	savesegment(gs, prev->gs);
>> +	clearsegment(fs);
>> +	clearsegment(gs);
>>
>
> Really not needed.  Think about it.  You can even speed up Xen.  I'm 
> glad the native operation here is a nop, but it should be 
> hypervisor_clearsegment or xen_clearsegment if you really want to keep it.

Maybe you could explain in more detail? It's not needed for 
correctness, but it is faster for us to clear at that point.

  -- Keir


^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 17/35] Segment register changes for Xen
  2006-05-18 20:20   ` Zachary Amsden
  2006-05-18 20:41     ` Keir Fraser
@ 2006-05-18 21:26     ` Chris Wright
  1 sibling, 0 replies; 185+ messages in thread
From: Chris Wright @ 2006-05-18 21:26 UTC (permalink / raw)
  To: Zachary Amsden
  Cc: Chris Wright, linux-kernel, virtualization, xen-devel, Ian Pratt,
	Christian Limpach

* Zachary Amsden (zach@vmware.com) wrote:
> Chris Wright wrote:
> >1. We clear FS/GS before changing TLS entries and switching LDT, as
> >otherwise the hypervisor will fail to restore thread-local values on
> >return to the guest kernel and we take a slow exception path.
> 
> >@@ -647,6 +647,8 @@ struct task_struct fastcall * __switch_t
> > 	 */
> > 	savesegment(fs, prev->fs);
> > 	savesegment(gs, prev->gs);
> >+	clearsegment(fs);
> >+	clearsegment(gs);
> >  
> 
> Really not needed.  Think about it.  You can even speed up Xen.

Please describe how.  I'm afraid I'm missing your point, as I don't see
the improvement.

> I'm glad the native operation here is a nop, but it should be 
> hypervisor_clearsegment or xen_clearsegment if you really want to keep it.

Yeah, Andi had similar naming concern.

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-03-22 22:57   ` Dan Hecht
@ 2006-03-27  8:18     ` Gerd Hoffmann
  0 siblings, 0 replies; 185+ messages in thread
From: Gerd Hoffmann @ 2006-03-27  8:18 UTC (permalink / raw)
  To: Dan Hecht
  Cc: Chris Wright, linux-kernel, xen-devel, virtualization, Ian Pratt,
	Christian Limpach

  Hi,

> Rather than changing LOAD_OFFSET in Linux, why not leave this alone and
> change the Xen domain builder to properly interpret the ELF program
> header fields?

I've a patch in the queue which does exactly that ;)
Planned to submit just after xen 3.0.2 release ...

cheers,

  Gerd

-- 
Gerd 'just married' Hoffmann <kraxel@suse.de>
I'm the hacker formerly known as Gerd Knorr.
http://www.suse.de/~kraxel/just-married.jpeg

^ permalink raw reply	[flat|nested] 185+ messages in thread

* Re: [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-03-22  6:30 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
@ 2006-03-22 22:57   ` Dan Hecht
  2006-03-27  8:18     ` Gerd Hoffmann
  0 siblings, 1 reply; 185+ messages in thread
From: Dan Hecht @ 2006-03-22 22:57 UTC (permalink / raw)
  To: Chris Wright
  Cc: linux-kernel, xen-devel, virtualization, Ian Pratt, Christian Limpach

Chris Wright wrote:
> Change LOAD_OFFSET so that the kernel has virtual addresses in the elf header fields.
> 
> Unlike bare metal kernels, Xen kernels start with virtual address
> management turned on and thus the addresses to load to should be
> virtual addresses.
> 

Rather than changing LOAD_OFFSET in Linux, why not leave this alone and 
change the Xen domain builder to properly interpret the ELF program 
header fields?

i.e. with this change, we'd have

p_paddr = __PAGE_OFFSET + segment_offset
p_vaddr = __PAGE_OFFSET + segment_offset
VIRT_BASE = __PAGE_OFFSET

where, the VA mapping p_paddr -> (p_paddr-VIRT_BASE) is established by 
the domain builder.

Instead, why not drop this patch, and the VIRT_BASE portion of the 
__xen_guest section, and instead change Xen's domain builder to treat 
p_paddr and p_vaddr in a more standard way?  Since Xen starts the domain 
with virtual address management enabled, it makes sense for it to use 
p_vaddr to determine the virtual address to load the kernel to.  Then, 
p_paddr could be used to determine which pseudo-physical pages back that 
virtual address range.

i.e. use, just like vanilla linux:

p_paddr = segment_offset
p_vaddr = __PAGE_OFFSET + segment_offset

so these two fields directly indicate the same mapping as before, but 
now in terms of p_vaddr -> p_paddr, which makes sense, and no need for 
the extra VIRT_BASE attribute in the __xen_guest section.

Dan

^ permalink raw reply	[flat|nested] 185+ messages in thread

* [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch
  2006-03-22  6:30 Chris Wright
@ 2006-03-22  6:30 ` Chris Wright
  2006-03-22 22:57   ` Dan Hecht
  0 siblings, 1 reply; 185+ messages in thread
From: Chris Wright @ 2006-03-22  6:30 UTC (permalink / raw)
  To: linux-kernel; +Cc: xen-devel, virtualization, Ian Pratt, Christian Limpach

[-- Attachment #1: 06-i386-load-offset --]
[-- Type: text/plain, Size: 1451 bytes --]

Change LOAD_OFFSET so that the kernel has virtual addresses in the elf header fields.

Unlike bare metal kernels, Xen kernels start with virtual address
management turned on and thus the addresses to load to should be
virtual addresses.

Signed-off-by: Ian Pratt <ian.pratt@xensource.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/kernel/vmlinux.lds.S                   |    2 +-
 include/asm-i386/mach-default/mach_vmlinux.lds.h |    6 ++++++
 include/asm-i386/mach-xen/mach_vmlinux.lds.h     |    6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

--- xen-subarch-2.6.orig/arch/i386/kernel/vmlinux.lds.S
+++ xen-subarch-2.6/arch/i386/kernel/vmlinux.lds.S
@@ -2,7 +2,7 @@
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
  */
 
-#define LOAD_OFFSET __PAGE_OFFSET
+#include "mach_vmlinux.lds.h"
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/thread_info.h>
--- /dev/null
+++ xen-subarch-2.6/include/asm-i386/mach-default/mach_vmlinux.lds.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_MACH_VMLINUX_LDS_H
+#define __ASM_MACH_VMLINUX_LDS_H
+
+#define LOAD_OFFSET __PAGE_OFFSET
+
+#endif /* __ASM_MACH_VMLINUX_LDS_H */
--- /dev/null
+++ xen-subarch-2.6/include/asm-i386/mach-xen/mach_vmlinux.lds.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_MACH_VMLINUX_LDS_H
+#define __ASM_MACH_VMLINUX_LDS_H
+
+#define LOAD_OFFSET 0
+
+#endif /* __ASM_MACH_VMLINUX_LDS_H */

--

^ permalink raw reply	[flat|nested] 185+ messages in thread

end of thread, other threads:[~2006-05-18 21:23 UTC | newest]

Thread overview: 185+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-05-09  8:49 [RFC PATCH 00/35] Xen i386 paravirtualization support Chris Wright
2006-05-09  7:00 ` [RFC PATCH 01/35] Add XEN config options and disable unsupported config options Chris Wright
2006-05-09 10:05   ` Adrian Bunk
2006-05-09 11:06     ` Ed Tomlinson
2006-05-09 12:45     ` Christian Limpach
2006-05-09 23:23     ` Chris Wright
2006-05-09 14:47   ` Daniel Walker
2006-05-09 15:16     ` Christian Limpach
2006-05-09 16:00       ` Daniel Walker
2006-05-09 23:25         ` Chris Wright
2006-05-09 16:42   ` Andi Kleen
2006-05-10 15:36   ` [Xen-devel] " Alan Cox
2006-05-10 15:48     ` Christian Limpach
2006-05-09  7:00 ` [RFC PATCH 02/35] Makefile support to build Xen subarch Chris Wright
2006-05-09  7:00 ` [RFC PATCH 03/35] Add Xen interface header files Chris Wright
2006-05-09 14:49   ` Martin J. Bligh
2006-05-09 17:54     ` Christian Limpach
2006-05-09 15:15   ` Christoph Hellwig
2006-05-09 19:35     ` Hollis Blanchard
2006-05-09 19:48       ` [Xen-devel] " Anthony Liguori
2006-05-09 22:34       ` Christoph Hellwig
2006-05-09 22:36     ` Ingo Oeser
2006-05-09 16:06   ` Daniel Walker
2006-05-09 16:18     ` Christian Limpach
2006-05-09 16:29       ` Daniel Walker
2006-05-09  7:00 ` [RFC PATCH 04/35] Hypervisor " Chris Wright
2006-05-09 22:43   ` Ingo Oeser
2006-05-09 23:01     ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 05/35] Add sync bitops Chris Wright
2006-05-09 22:56   ` Christoph Lameter
2006-05-09 23:04     ` Andi Kleen
2006-05-09 23:07     ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 06/35] Add vmlinuz build target Chris Wright
2006-05-09  7:00 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
2006-05-10 23:28   ` Zachary Amsden
2006-05-11  7:47     ` [Xen-devel] " Gerd Hoffmann
2006-05-11  8:51       ` Chris Wright
2006-05-11  9:06         ` Gerd Hoffmann
2006-05-11 16:43     ` Christian Limpach
2006-05-12  6:47       ` [Xen-devel] " Jan Beulich
2006-05-12  8:38         ` Christian Limpach
2006-05-09  7:00 ` [RFC PATCH 08/35] Add Xen-specific memory management definitions Chris Wright
2006-05-09 14:49   ` Martin J. Bligh
2006-05-09 17:44     ` Christian Limpach
2006-05-15  6:44   ` Pete Zaitcev
2006-05-15  7:04     ` Keir Fraser
2006-05-15  8:19     ` Christian Limpach
2006-05-17 16:06   ` Pete Zaitcev
2006-05-18  7:42     ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 09/35] Change __FIXADDR_TOP to leave room for the hypervisor Chris Wright
2006-05-09  7:00 ` [RFC PATCH 10/35] Add a new head.S start-of-day file for booting on Xen Chris Wright
2006-05-09  7:00 ` [RFC PATCH 11/35] Add support for Xen to entry.S Chris Wright
2006-05-09 16:51   ` Andi Kleen
2006-05-09  7:00 ` [RFC PATCH 12/35] Add start-of-day setup hooks to subarch Chris Wright
2006-05-09  7:00 ` [RFC PATCH 13/35] Support loading an initrd when running on Xen Chris Wright
2006-05-09  7:00 ` [RFC PATCH 14/35] Subarch support for CPUID instruction Chris Wright
2006-05-09  7:00 ` [RFC PATCH 15/35] subarch support for controlling interrupt delivery Chris Wright
2006-05-09 14:49   ` Martin J. Bligh
2006-05-09 14:55     ` Nick Piggin
2006-05-09 15:51     ` Christian Limpach
2006-05-09 16:02       ` Martin J. Bligh
2006-05-09 16:07       ` Andi Kleen
2006-05-09 16:29         ` Christian Limpach
2006-05-09 16:31           ` Andi Kleen
2006-05-09 20:42             ` Christian Limpach
2006-05-09 21:56               ` Andi Kleen
2006-05-10 10:35                 ` Christian Limpach
2006-05-10 10:54                   ` Andi Kleen
2006-05-09 21:56               ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 16/35] subarch support for interrupt and exception gates Chris Wright
2006-05-09 11:09   ` Andi Kleen
2006-05-09 12:55     ` Christian Limpach
2006-05-13 12:27   ` Andrew Morton
2006-05-15 18:30     ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 17/35] Segment register changes for Xen Chris Wright
2006-05-09  7:16   ` Pavel Machek
2006-05-10 20:09     ` Andi Kleen
2006-05-10 20:30       ` Pavel Machek
2006-05-11 10:34         ` Avi Kivity
2006-05-11 10:41           ` Andi Kleen
2006-05-12  0:28     ` [Xen-devel] " Rusty Russell
2006-05-09 16:44   ` Andi Kleen
2006-05-18 20:20   ` Zachary Amsden
2006-05-18 20:41     ` Keir Fraser
2006-05-18 21:26     ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 18/35] Support gdt/idt/ldt handling on Xen Chris Wright
2006-05-09  7:21   ` Pavel Machek
2006-05-10 20:23     ` Andi Kleen
2006-05-09 14:49   ` Martin J. Bligh
2006-05-09 18:14     ` Christian Limpach
2006-05-09 18:21       ` Martin Bligh
2006-05-09  7:00 ` [RFC PATCH 19/35] subarch support for control register accesses Chris Wright
2006-05-09  7:00 ` [RFC PATCH 20/35] subarch stack pointer update Chris Wright
2006-05-09  7:00 ` [RFC PATCH 21/35] subarch TLB support Chris Wright
2006-05-09  7:00 ` [RFC PATCH 22/35] subarch suport for idle loop (NO_IDLE_HZ for Xen) Chris Wright
2006-05-09 13:21   ` Andi Kleen
2006-05-09 15:13     ` Christian Limpach
2006-05-09  7:00 ` [RFC PATCH 23/35] Increase x86 interrupt vector range Chris Wright
2006-05-09  7:00 ` [RFC PATCH 24/35] Add support for Xen event channels Chris Wright
2006-05-12 21:41   ` Pavel Machek
2006-05-13 12:27   ` Andrew Morton
2006-05-13 13:02     ` Keir Fraser
2006-05-09  7:00 ` [RFC PATCH 25/35] Add Xen time abstractions Chris Wright
2006-05-09 16:23   ` Daniel Walker
2006-05-09 16:38     ` Christian Limpach
2006-05-09 19:27       ` Adrian Bunk
2006-05-09 21:50   ` Andi Kleen
2006-05-09 23:03     ` Ingo Oeser
2006-05-09 23:09       ` Andi Kleen
2006-05-09 23:13       ` Chris Wright
2006-05-12 21:44   ` Pavel Machek
2006-05-09  7:00 ` [RFC PATCH 26/35] Add Xen subarch reboot support Chris Wright
2006-05-09 17:02   ` Andi Kleen
2006-05-12 21:46     ` Pavel Machek
2006-05-12 21:57       ` Chris Wright
2006-05-09  7:00 ` [RFC PATCH 27/35] Add nosegneg capability to the vsyscall page notes Chris Wright
2006-05-09  7:00 ` [RFC PATCH 28/35] add support for Xen feature queries Chris Wright
2006-05-12 21:56   ` Pavel Machek
2006-05-09  7:00 ` [RFC PATCH 29/35] Add the Xen virtual console driver Chris Wright
2006-05-09 13:26   ` Andi Kleen
2006-05-09 15:03     ` Christian Limpach
2006-05-13 12:27   ` Andrew Morton
2006-05-13 12:51     ` Nick Piggin
2006-05-13 14:29       ` Andrew Morton
2006-05-13 14:43         ` Nick Piggin
2006-05-09  7:00 ` [RFC PATCH 30/35] Add apply_to_page_range() function Chris Wright
2006-05-09  7:00 ` [RFC PATCH 31/35] Add Xen grant table support Chris Wright
2006-05-09  7:00 ` [RFC PATCH 32/35] Add Xen driver utility functions Chris Wright
2006-05-09 19:48   ` Greg KH
2006-05-09 21:50   ` Andi Kleen
2006-05-09  7:00 ` [RFC PATCH 33/35] Add the Xenbus sysfs and virtual device hotplug driver Chris Wright
2006-05-09 16:06   ` Alexey Dobriyan
2006-05-09 16:28     ` Andi Kleen
2006-05-09 19:40   ` Greg KH
2006-05-09 21:53     ` Chris Wright
2006-05-09 22:01       ` Greg KH
2006-05-09 22:50         ` Chris Wright
2006-05-09 23:43         ` Anthony Liguori
2006-05-09 19:49   ` Greg KH
2006-05-09 19:58     ` Chris Wright
2006-05-13 12:28   ` Andrew Morton
2006-05-09  7:00 ` [RFC PATCH 34/35] Add the Xen virtual network device driver Chris Wright
2006-05-09 11:55   ` [Xen-devel] " Herbert Xu
2006-05-09 12:43     ` Christian Limpach
2006-05-09 13:01       ` Herbert Xu
2006-05-09 13:14         ` Andi Kleen
2006-05-09 13:16         ` Christian Limpach
2006-05-09 13:26           ` Herbert Xu
2006-05-09 14:00             ` Christian Limpach
2006-05-09 14:30               ` David Boutcher
2006-05-09 23:35                 ` Chris Wright
2006-05-09 11:58   ` Christoph Hellwig
2006-05-09 23:37     ` Chris Wright
2006-05-09 18:56   ` Stephen Hemminger
2006-05-09 23:39     ` Chris Wright
2006-05-09 20:25   ` Stephen Hemminger
2006-05-09 20:26     ` Keir Fraser
2006-05-09 20:39       ` Stephen Hemminger
2006-05-09 20:46       ` Roland Dreier
2006-05-10 18:28         ` Andi Kleen
2006-05-11  0:33           ` Herbert Xu
2006-05-11  7:49             ` Keir Fraser
2006-05-11  8:04               ` Herbert Xu
2006-05-11  9:47               ` Andi Kleen
2006-05-11 16:18                 ` Stephen Hemminger
2006-05-11 16:48                 ` Rick Jones
2006-05-11 16:55                   ` Stephen Hemminger
2006-05-11 17:30                   ` Andi Kleen
2006-05-09 20:32     ` Chris Wright
2006-05-09 22:41   ` [Xen-devel] " Herbert Xu
2006-05-09 23:51     ` Chris Wright
2006-05-10  6:36       ` Keir Fraser
2006-05-09  7:00 ` [RFC PATCH 35/35] Add Xen virtual block " Chris Wright
2006-05-09 12:01   ` Christoph Hellwig
2006-05-09 14:49 ` [RFC PATCH 00/35] Xen i386 paravirtualization support Martin J. Bligh
2006-05-09 15:07   ` Christoph Hellwig
2006-05-09 15:12     ` Martin J. Bligh
2006-05-09 15:20     ` Andi Kleen
2006-05-09 15:22       ` Christoph Hellwig
2006-05-09 15:45         ` Pekka Enberg
2006-05-14  1:35         ` Andrew Morton
2006-05-15 21:01           ` Chris Wright
  -- strict thread matches above, loose matches on Subject: below --
2006-03-22  6:30 Chris Wright
2006-03-22  6:30 ` [RFC PATCH 07/35] Make LOAD_OFFSET defined by subarch Chris Wright
2006-03-22 22:57   ` Dan Hecht
2006-03-27  8:18     ` Gerd Hoffmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).