All of lore.kernel.org
 help / color / mirror / Atom feed
* compile fixes for mini-os
@ 2004-01-24 21:39 Kip Macy
  2004-01-25  1:00 ` Keir Fraser
  0 siblings, 1 reply; 8+ messages in thread
From: Kip Macy @ 2004-01-24 21:39 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 412 bytes --]

I'm compiling the mini-os files in the FreeBSD build environment which
uses the following extra flags:

-Wcast-qual -Wredundant-decls -Wnested-externs -Wstrict-prototypes -Wpointer-arith -Winline -ansi

This produces a number of annoying warnings which are fixed by the
attached patch. In addition the sabon.sty doesn't exist in my
environment - I was able to comment it out in style.tex without any
apparent ill effect.

[-- Attachment #2: Type: TEXT/PLAIN, Size: 8236 bytes --]

===== docs/style.tex 1.1 vs edited =====
7c7
< \usepackage{sabon}
---
> % \usepackage{sabon}
===== extras/mini-os/Makefile 1.5 vs edited =====
6c6,7
< CFLAGS  := -fno-builtin -O3 -Wall -Ih/
---
> CFLAGS  := -fno-builtin -O3 -Wall -Ih/ -Wcast-qual -Wredundant-decls -Wnested-externs -Wstrict-prototypes -Wpointer-arith -Winline -ansi
> 
===== extras/mini-os/events.c 1.3 vs edited =====
35,36c35,36
< 
<     if (ev >= NR_EVS) {
---
> 	/* assuming ev can't be negative */
>     if ((unsigned int)ev >= NR_EVS) {
94c94
<     int i;
---
>     unsigned int i;
===== extras/mini-os/kernel.c 1.8 vs edited =====
63c63
< 
---
> extern char shared_info[PAGE_SIZE];
66c66
<     extern char shared_info[PAGE_SIZE];
---
> 
===== extras/mini-os/mm.c 1.4 vs edited =====
90c90
<         int i;
---
>         unsigned int i;
222c222
<     int i;
---
>     unsigned int i;
259c259
<         for ( i = PAGE_SHIFT; (1<<(i+1)) <= range; i++ )
---
>         for ( i = PAGE_SHIFT;  (unsigned int)(1<<(i+1)) <= range;  i++ )
281c281
<     int i;
---
>     unsigned int i;
300c300
<     while ( i != order )
---
>     while ( i != (unsigned int)order )
===== extras/mini-os/time.c 1.4 vs edited =====
91c91
< static inline unsigned long get_time_delta_usecs(void)
---
> __inline__ static unsigned long get_time_delta_usecs(void)
===== extras/mini-os/traps.c 1.5 vs edited =====
60c60
< static inline void dump_code(unsigned eip)
---
> __inline__ static void dump_code(unsigned eip)
83c83
< static void inline do_trap(int trapnr, char *str,
---
> __inline__ static void do_trap(int trapnr, char *str,
87c87
<   printf("%d %s", trapnr, str);
---
>   printf("%d %s : %ld", trapnr, str, error_code);
===== extras/mini-os/h/hypervisor.h 1.8 vs edited =====
35c35
< static inline int HYPERVISOR_set_trap_table(trap_info_t *table)
---
> __inline__ static int HYPERVISOR_set_trap_table(trap_info_t *table)
46c46
< static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
---
> __inline__ static int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
57c57
< static inline int HYPERVISOR_console_write(const char *str, int count)
---
> __inline__ static int HYPERVISOR_console_write(const char *str, int count)
69c69
< static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
---
> __inline__ static int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
81c81
< static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
---
> __inline__ static int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
92c92
< static inline int HYPERVISOR_set_callbacks(
---
> __inline__ static int HYPERVISOR_set_callbacks(
106c106
< static inline int HYPERVISOR_net_io_op(netop_t *op)
---
> __inline__ static int HYPERVISOR_net_io_op(netop_t *op)
117c117
< static inline int HYPERVISOR_fpu_taskswitch(void)
---
> __inline__ static int HYPERVISOR_fpu_taskswitch(void)
127c127
< static inline int HYPERVISOR_yield(void)
---
> __inline__ static int HYPERVISOR_yield(void)
138c138
< static inline int HYPERVISOR_exit(void)
---
> __inline__ static int HYPERVISOR_exit(void)
149c149
< static inline int HYPERVISOR_stop(void)
---
> __inline__ static int HYPERVISOR_stop(void)
160c160
< static inline int HYPERVISOR_dom0_op(void *dom0_op)
---
> __inline__ static int HYPERVISOR_dom0_op(void *dom0_op)
171c171
< static inline int HYPERVISOR_network_op(void *network_op)
---
> __inline__ static int HYPERVISOR_network_op(void *network_op)
182c182
< static inline int HYPERVISOR_block_io_op(unsigned int op)
---
> __inline__ static int HYPERVISOR_block_io_op(unsigned int op)
193c193
< static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value)
---
> __inline__ static int HYPERVISOR_set_debugreg(int reg, unsigned long value)
204c204
< static inline unsigned long HYPERVISOR_get_debugreg(int reg)
---
> __inline__ static unsigned long HYPERVISOR_get_debugreg(int reg)
215c215
< static inline int HYPERVISOR_update_descriptor(
---
> __inline__ static int HYPERVISOR_update_descriptor(
227c227
< static inline int HYPERVISOR_set_fast_trap(int idx)
---
> __inline__ static int HYPERVISOR_set_fast_trap(int idx)
238c238
< static inline int HYPERVISOR_dom_mem_op(void *dom_mem_op)
---
> __inline__ static int HYPERVISOR_dom_mem_op(void *dom_mem_op)
249c249
< static inline int HYPERVISOR_multicall(void *call_list, int nr_calls)
---
> __inline__ static int HYPERVISOR_multicall(void *call_list, int nr_calls)
260c260
< static inline long HYPERVISOR_kbd_op(unsigned char op, unsigned char val)
---
> __inline__ static long HYPERVISOR_kbd_op(unsigned char op, unsigned char val)
271c271
< static inline int HYPERVISOR_update_va_mapping(
---
> __inline__ static int HYPERVISOR_update_va_mapping(
===== extras/mini-os/h/lib.h 1.2 vs edited =====
117c117
< struct mallinfo mallinfo();
---
> struct mallinfo mallinfo(void);
126c126
< void malloc_stats();
---
> void malloc_stats(void);
===== extras/mini-os/h/mm.h 1.3 vs edited =====
54c54
< static inline unsigned long phys_to_machine(unsigned long phys)
---
> __inline__ static unsigned long phys_to_machine(unsigned long phys)
60c60
< static inline unsigned long machine_to_phys(unsigned long machine)
---
> __inline__ static unsigned long machine_to_phys(unsigned long machine)
72c72
< void init_mm();
---
> void init_mm(void);
===== extras/mini-os/h/os.h 1.5 vs edited =====
140,141c140,142
< #define __xg(x) ((struct __xchg_dummy *)(x))
< static inline unsigned long __xchg(unsigned long x, volatile void * ptr,
---
> #define __xg(x) ((volatile struct __xchg_dummy *)(x))
> 
> __inline__ static unsigned long __xchg(unsigned long x, volatile void * ptr,
===== extras/mini-os/lib/malloc.c 1.3 vs edited =====
47c47
<     static void *last;
---
>     static char *last;
49c49
<     void *ret;
---
>     char *ret;
57c57
<     ret = (void *)alloc_pages(order);
---
>     ret = (char *)alloc_pages(order);
===== extras/mini-os/lib/math.c 1.2 vs edited =====
143,144c143
< __qdivrem(uq, vq, arq)
< 	u64 uq, vq, *arq;
---
> __qdivrem(u64 uq, u64 vq, u64 *arq)
366,367c365
< __udivdi3(a, b)
<         u64 a, b;
---
> __udivdi3(u64 a, u64 b)
377,378c375
< __umoddi3(a, b)
<         u_quad_t a, b;
---
> __umoddi3(u_quad_t a, u_quad_t b)
===== extras/mini-os/lib/string.c 1.2 vs edited =====
38c38,39
< 	char *tmp = (char *) dest, *s = (char *) src;
---
> 	char *tmp = (char *) dest;
> 	const char *s = (const char *) src;
123c124
<         return (char *) s;
---
>         return (char *)s;
126,138c127,144
< char * strstr(const char * s1,const char * s2)
< {
<         int l1, l2;
< 
<         l2 = strlen(s2);
<         if (!l2)
<                 return (char *) s1;
<         l1 = strlen(s1);
<         while (l1 >= l2) {
<                 l1--;
<                 if (!memcmp(s1,s2,l2))
<                         return (char *) s1;
<                 s1++;
---
> /*
>  * Find the first occurrence of find in s.
>  */
> char *
> strstr(const char *s, const char *find)
> {
> 	    char c, sc;
>         size_t len;
> 
>         if ((c = *find++) != 0) {
>                 len = strlen(find);
>                 do {
>                         do {
>                                 if ((sc = *s++) == 0)
>                                         return (NULL);
>                         } while (sc != c);
>                 } while (strncmp(s, find, len) != 0);
>                 s--;
140c146
<         return NULL;
---
>         return ((char *)s);
142d147
< 
===== xen/include/hypervisor-ifs/block.h 1.24 vs edited =====
107,108c107,108
<     int         max;             // maximumum number of disks to report
<     xen_disk_t *disks;           // pointer to array of disk info 
---
>     int         max;             /* maximumum number of disks to report */
>     xen_disk_t *disks;           /* pointer to array of disk info */ 
110c110
<     int         count;           // how many disks we have info about 
---
>     int         count;           /* how many disks we have info about */

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-24 21:39 compile fixes for mini-os Kip Macy
@ 2004-01-25  1:00 ` Keir Fraser
  2004-01-25  1:27   ` Kip Macy
  0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2004-01-25  1:00 UTC (permalink / raw)
  To: Kip Macy; +Cc: xen-devel

> I'm compiling the mini-os files in the FreeBSD build environment which
> uses the following extra flags:
> 
> -Wcast-qual -Wredundant-decls -Wnested-externs -Wstrict-prototypes -Wpointer-arith -Winline -ansi
> 
> This produces a number of annoying warnings which are fixed by the
> attached patch. In addition the sabon.sty doesn't exist in my
> environment - I was able to comment it out in style.tex without any
> apparent ill effect.

Okay, I guess that relying on the sabon package is a bad idea. Also
thanks for the C++ comment-style fixes.

Personally I dislike the warnings GCC emits when it's placed in
super-anal mode. I would never take fixes for Xen or Xenolinux to stop
those warnings since I think GCC should be run with a more permissive
set of options. 

For the mini-os I'll give the patches some thought. I guess it's being
used as a starting point for ports that may use a different compiler
or have a predetermined set of command-line options, so perhaps it
makes sense...

 -- Keir

PS. Please send your documentation updates when you think it's
worthwhile. They'll be very useful!
.


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-25  1:00 ` Keir Fraser
@ 2004-01-25  1:27   ` Kip Macy
  2004-01-25  1:58     ` Keir Fraser
  0 siblings, 1 reply; 8+ messages in thread
From: Kip Macy @ 2004-01-25  1:27 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1618 bytes --]

I've attached the full diff as well as interface.tex. I sent
interface.tex to Ian, but haven't got any ack back.


> Personally I dislike the warnings GCC emits when it's placed in
> super-anal mode. I would never take fixes for Xen or Xenolinux to stop
> those warnings since I think GCC should be run with a more permissive
> set of options.

I admit that many of the options are just plain annoying. However, in
one instance you were casting away volatile, which could get you in
trouble. There are more mundane problems that could cause issues, such
as signed/unsigned comparisons. I would argue that you might as well
never use the const qualifier if you're just going to cast it away in
most cases. In addition you frequently do pointer arithmetic with void
pointers, you're just relying on the compiler to know that you really
mean char pointer. The changes to inline usage are necessary to get
the code to compile at all with -ansi.


> For the mini-os I'll give the patches some thought. I guess it's being
> used as a starting point for ports that may use a different compiler
> or have a predetermined set of command-line options, so perhaps it
> makes sense...

Your point is taken for Xen. However, a lot of environments are more
restrictive, so it would be a service to users to accommodate them in
that respect.



> PS. Please send your documentation updates when you think it's
> worthwhile. They'll be very useful!
> .


It is pretty incomplete as things stand, but it might provide a
sufficient start so that others would feel happy adding a bit here
and there as the pieces fall into place for them.

[-- Attachment #2: Type: TEXT/PLAIN, Size: 27632 bytes --]

===== docs/interface.tex 1.1 vs edited =====
18,19c18,20
< {\huge Xen v1.1 for x86} \\[80mm]
< {\Large Copyright (c) 2003, The Xen Team} \\[3mm]
---
> {\huge Xen v1.3 for x86} \\[80mm]
> 
> {\Large Xen is Copyright (c) 2004, The Xen Team} \\[3mm]
21c22
< {\large Last updated on 28th October, 2003}
---
> {\large Last updated on 18th January, 2004}
46a48,83
> Xen allows the hardware resources of a machine to be virtualized and
> dynamically partitioned such as to allow multiple different 'guest'
> operating system images to be run simultaneously.
> 
> Virtualizing the machine in this manner provides flexibility allowing
> different users to choose their preferred operating system (Windows,
> Linux, FreeBSD, or a custom operating system). Furthermore, Xen provides
> secure partitioning between these 'domains', and enables better resource
> accounting and QoS isolation than can be achieved with a conventional
> operating system.
> 
> The hypervisor runs directly on server hardware and dynamically partitions
> it between a number of {\it domains}, each of which hosts an instance
> of a {\it guest operating system}. The hypervisor provides just enough
> abstraction of the machine to allow effective isolation and resource 
> management between these domains.
> 
> Xen essentially takes a virtual machine approach as pioneered by IBM VM/370.
> However, unlike VM/370 or more recent efforts such as VMWare and Virtual PC,
> Xen does not attempt to completely virtualize the underlying hardware. Instead
> parts of the hosted guest operating systems are modified to work with the hypervisor; the
> operating system is effectively ported to a new target architecture, typically
> requiring changes in just the machine-dependent code. The user-level API is
> unchanged, thus existing binaries and operating system distributions can work
> unmodified.
> 
> In addition to exporting virtualized instances of CPU, memory, network and
> block devices, Xen exposes a control interface to set how these resources
> are shared between the running domains. The control interface is privileged
> and may only be accessed by one particular virtual machine: {\it domain0}.
> This domain is a required part of any Xen-based server and runs the application
> software that manages the control-plane aspects of the platform. Running the
> control software in {\it domain0}, distinct from the hypervisor itself, allows
> the Xen framework to separate the notions of {\it mechanism} and {\it policy}
> within the system.
> 
49a87,89
> All privileged state must be handled by Xen. The guest OS has no direct access
> to CR3 and is not permitted to update privileged bits in EFLAGS.
> 
50a91,93
> The IDT is virtualised by submitting a virtual 'trap
> table' to Xen. Most trap handlers are identical to native x86
> handlers. The page-fault handler is a notable exception.
52a96,108
> Interrupts are virtualized by mapping them to events, which are delivered 
> asynchronously to the target domain. A guest OS can map these events onto
> its standard interrupt dispatch mechanisms, such as a simple vectoring 
> scheme. Each physical interrupt source controlled by the hypervisor, including
> network devices, disks, or the timer subsystem, is responsible for identifying
> the target for an incoming interrupt and sending an event to that domain.
> 
> This demultiplexing mechanism also provides a device-specific mechanism for 
> event coalescing or hold-off. For example, a guest OS may request to only 
> actually receive an event after {\it n} packets are queued ready for delivery
> to it, or {\it t} nanoseconds after the first packet arrived (whichever is true
> first). This allows latency and throughput requirements to be addressed on a
> domain-specific basis.
54a111,146
> Guest operating systems need to be aware of the passage of real time and their
> own ``virtual time'', i.e. the time they have been executing. Furthermore, a
> notion of time is required in the hypervisor itself for scheduling and the
> activities that relate to it. To this end the hypervisor provides four notions
> of time: cycle counter time, system time, wall clock time, domain virtual 
> time.
> 
> 
> \section{Cycle counter time}
> This provides the finest-grained, free-running time reference, with the approximate
> frequency being publicly accessible. The cycle counter time is used to accurately
> extrapolate the other time references. On SMP machines it is currently assumed
> that the cycle counter time is synchronised between CPUs. The current x86-based
> implementation achieves this within inter-CPU communication latencies.
> 
> \section{System time}
> This is a 64-bit value containing the nanoseconds elapsed since boot time. Unlike
> cycle counter time, system time accurately reflects the passage of real time, i.e.
> it is adjusted several times a second for timer drift. This is done by running an
> NTP client in {\it domain0} on behalf of the machine, feeding updates to the 
> hypervisor. Intermediate values can be extrapolated using the cycle counter. 
> 
> \section{Wall clock time}
> This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and
> microseconds since 1 January 1970, adjusted by leap seconds etc.). Again, an 
> NTP client hosted by {\it domain0} can help maintain this value. To guest 
> operating systems this value will be reported instead of the hardware RTC
> clock value and they can use the system time and cycle counter times to start
> and remain perfectly in time.
> 
> 
> \section{Domain virtual time}
> This progresses at the same pace as cycle counter time, but only while a domain
> is executing. It stops while a domain is de-scheduled. Therefore the share of the 
> CPU that a domain receives is indicated by the rate at which its domain virtual
> time increases, relative to the rate at which cycle counter time does so.
58c150,325
< \chapter{I/O}
---
> The hypervisor is responsible for providing memory to each of the domains running 
> over it. However, the Xen hypervisor's duty is restricted to managing physical
> memory and to policing page table updates. All other memory management functions
> are handled externally. Start-of-day issues such as building initial page tables
> for a domain, loading its kernel image and so on are done by the {\it domain builder}
> running in user-space with {\it domain0}. Paging to disk and swapping is handled
> by the guest operating systems themselves, if they need it.
> 
> On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It has full
> access to the physical memory available in the system and is responsible for 
> allocating portions of it to the domains. Guest operating systems run in and use
> {\it rings 1}, {\it 2} and {\it 3} as they see fit, aside from the fact that
> segmentation is used to prevent the guest OS from accessing a portion of the 
> linear address space that is reserved for use by the hypervisor. This approach
> allows transitions between the guest OS and hypervisor without flushing the TLB.
> We expect most guest operating systems will use ring 1 for their own operation
> and place applications (if they support such a notion) in ring 3.
> 
> \section{Physical Memory Allocation}
> The hypervisor reserves a small fixed portion of physical memory at system boot
> time. This special memory region is located at the beginning of physical memory
> and is mapped at the very top of every virtual address space. 
> 
> Any physical memory that is not used directly by the hypervisor is divided into
> pages and is available for allocation to domains. The hypervisor tracks which
> pages are free and which pages have been allocated to each domain. When a new
> domain is initialized, the hypervisor allocates it pages drawn from the free 
> list. The amount of memory required by the domain is passed to the hypervisor
> as one of the parameters for new domain initialization by the domain builder.
> 
> Domains can never be allocated further memory beyond that which was requested
> for them on initialization. However, a domain can return pages to the hypervisor
> if it discovers that its memory requirements have diminished.
> 
> % put reasons for why pages might be returned here.
> \section{Page Table Updates}
> In addition to managing physical memory allocation, the hypervisor is also in
> charge of performing page table updates on behalf of the domains. This is 
> necessary to prevent domains from adding arbitrary mappings to their page
> tables or introducing mappings to other domains' page tables.
> 
> 
> 
> 
> \section{Pseudo-Physical Memory}
> The usual problem of external fragmentation means that a domain is unlikely to
> receive a contiguous stretch of physical memory. However, most guest operating
> systems do not have built-in support for operating in a fragmented physical
> address space, e.g. Linux has to have a one-to-one mapping for its physical
> memory. Therefore a notion of {\it pseudo physical memory} is introduced. 
> Once a domain is allocated a number of pages, at its start of the day, one of
> the first things it needs to do is build its own {\it real physical} to 
> {\it pseudo physical} mapping. From that moment onwards {\it pseudo physical}
> addresses are used instead of discontiguous {\it real physical} addresses. Thus,
> the rest of the guest OS code has an impression of operating in a contiguous
> address space. Guest OS page tables contain real physical addresses. Mapping
> {\it pseudo physical} to {\it real physical} addresses is needed on page
> table updates and also on remapping memory regions with the guest OS.
> 
> 
> 
> \chapter{Network I/O}
> Since the hypervisor must multiplex network resources, its network subsystem
> may be viewed as a virtual network switching element with each domain having
> one or more virtual network interfaces to this network.
> 
> The hypervisor acts conceptually as an IP router, forwarding each domain's
> traffic according to a set of rules.
> 
> \section{Hypervisor Packet Handling}
> The hypervisor is responsible primarily for {\it data-path} operations.
> In terms of networking this means packet transmission and reception.
> 
> On the transmission side, the hypervisor needs to perform three key actions:
> \begin{itemize}
> \item {\tt Validation:} A domain is only allowed to emit packets matching a certain
> specification; for example, ones in which the source IP address matches
> one assigned to the virtual interface over which it is sent. The hypervisor
> is responsible for ensuring any such requirements are met, either by checking
> or by stamping outgoing packets with prescribed values for certain fields.
> 
> \item {\tt Scheduling:} Since a number of domains can share a single ``real'' network 
> interface, the hypervisor must mediate access when several domains each 
> have packets queued for transmission. Of course, this general scheduling
> function subsumes basic shaping or rate-limiting schemes.
> 
> \item {\tt Logging and Accounting:} The hypervisor can be configured with classifier 
> rules that control how packets are accounted or logged. For example, 
> {\it domain0} could request that it receives a log message or copy of the
> packet whenever another domain attempts to send a TCP packet containing a 
> SYN.
> \end{itemize}
> On the receive side, the hypervisor's role is relatively straightforward:
> once a packet is received, it just needs to determine the virtual interface(s)
> to which it must be delivered and deliver it via page-flipping. 
> 
> 
> \section{Data Transfer}
> 
> Each virtual interface uses two ``descriptor rings'', one for transmit,
> the other for receive. Each descriptor identifies a block of contiguous
> physical memory allocated to the domain. There are four cases:
> 
> \begin{itemize}
> 
> \item The transmit ring carries packets to transmit from the domain to the
> hypervisor.
> 
> \item The return path of the transmit ring carries ``empty'' descriptors
> indicating that the contents have been transmitted and the memory can be
> re-used.
> 
> \item The receive ring carries empty descriptors from the domain to the 
> hypervisor; these provide storage space for that domain's received packets.
> 
> \item The return path of the receive ring carries packets that have been
> received.
> \end{itemize}
> 
> Real physical addresses are used throughout, with the domain performing 
> translation from pseudo-physical addresses if that is necessary.
> 
> If a domain does not keep its receive ring stocked with empty buffers then 
> packets destined to it may be dropped. This provides some defense against 
> receiver-livelock problems because an overloaded domain will cease to receive
> further data. Similarly, on the transmit path, it provides the application
> with feedback on the rate at which packets are able to leave the system.
> 
> Synchronization between the hypervisor and the domain is achieved using 
> counters held in shared memory that is accessible to both. Each ring has
> associated producer and consumer indices indicating the area in the ring
> that holds descriptors that contain data. After receiving {\it n} packets
> or {\it t} nanoseconds after receiving the first packet, the hypervisor sends
> an event to the domain. 
> 
> \chapter{Block I/O}
> 
> \section{Virtual Block Devices (VBDs)}
> 
> All guest OS disk access goes through the VBD interface. The VBD interface
> provides the administrator with the ability to selectively grant domains 
> access to portions of block storage devices visible to the system.
> 
> A VBD can also be comprised of a set of extents from multiple storage devices.
> This provides the same functionality as a concatenated disk driver.
> 
> \section{Virtual Disks (VDs)}
> 
> VDs are an abstraction built on top of the VBD interface. One can reserve disk
> space for use by the VD layer. This space is then managed as a pool of free extents.
> The VD tools can automatically allocate collections of extents from this pool to
> create ``virtual disks'' on demand. 
> 
> \subsection{Virtual Disk Management}
> The VD management code consists of a set of python libraries. It can therefore
> be accessed by custom scripts as well as the convenience scripts provided. The
> VD database is a SQLite database in /var/spool/xen\_vdisk.sqlite.
> 
> The VD scripts and general VD usage are documented in the VBD-HOWTO.txt.
> 
> \subsection{Data Transfer}
> Domains which have been granted access to a logical block device are permitted
> to read and write it directly through the hypervisor, rather than requiring
> {\it domain0} to mediate every data access. 
> 
> In overview, the same style of descriptor-ring that is used for network
> packets is used here. Each domain has one ring that carries operation requests to the 
> hypervisor and carries the results back again. 
> 
> Rather than copying data in and out of the hypervisor, we use page pinning to
> enable DMA transfers directly between the physical device and the domain's 
> buffers. Disk read operations are straightforward; the hypervisor just needs
> to know which pages have pending DMA transfers, and prevent the guest OS from
> giving the pages back to the hypervisor or using them for storing page tables.
> 
> %block API here 
60a328,438
> {\it Domain0} is responsible for building all other domains on the server
> and providing control interfaces for managing scheduling, networking, and
> blocks.
> 
> 
> \chapter{Hypervisor calls}
> 
> \section{ set\_trap\_table(trap\_info\_t *table)} 
> 
> Install trap handler table.
> 
> \section{ mmu\_update(mmu\_update\_t *req, int count)} 
> Update the page table for the domain. Updates can be batched.
> The update types are: 
> 
> {\it MMU\_NORMAL\_PT\_UPDATE}:
> 
> {\it MMU\_UNCHECKED\_PT\_UPDATE}:
> 
> {\it MMU\_MACHPHYS\_UPDATE}:
> 
> {\it MMU\_EXTENDED\_COMMAND}:
> 
> \section{ console\_write(const char *str, int count)}
> Output buffer str to the console.
> 
> \section{ set\_gdt(unsigned long *frame\_list, int entries)} 
> Set the global descriptor table - virtualization for lgdt.
> 
> \section{ stack\_switch(unsigned long ss, unsigned long esp)} 
> Request context switch from hypervisor.
> 
> \section{ set\_callbacks(unsigned long event\_selector, unsigned long event\_address,
>     			unsigned long failsafe\_selector, unsigned long failsafe\_address) } 
>  Register OS event processing routine. In Linux both the event\_selector and 
> failsafe\_selector are the kernel's CS. The value event\_address specifies the address for
> an interrupt handler dispatch routine and failsafe\_address specifies a handler for 
> application faults.
> 
> \section{ net\_io\_op(netop\_t *op)}  
> Notify hypervisor of updates to transmit and/or receive descriptor rings.
> 
> \section{ fpu\_taskswitch(void)} 
> Notify hypervisor that fpu registers need to be saved on context switch.
> 
> \section{ sched\_op(unsigned long op)} 
> Request scheduling operation from hypervisor. The options are: yield, stop, and exit.
> 
> \section{ dom0\_op(dom0\_op\_t *op)} 
> Administrative domain operations for domain management. The options are:
> 
> {\it DOM0\_CREATEDOMAIN}: create new domain, specifying the name and memory usage
> in kilobytes.
> 
> {\it DOM0\_STARTDOMAIN}: make domain schedulable
> 
> {\it DOM0\_STOPDOMAIN}: mark domain as unschedulable
> 
> {\it DOM0\_DESTROYDOMAIN}: deallocate resources associated with the domain
> 
> {\it DOM0\_GETMEMLIST}: get list of pages used by the domain
> 
> {\it DOM0\_BUILDDOMAIN}: do final guest OS setup for domain
> 
> {\it DOM0\_BVTCTL}: adjust scheduler context switch time
> 
> {\it DOM0\_ADJUSTDOM}: adjust scheduling priorities for domain
> 
> {\it DOM0\_GETDOMAINFO}: get statistics about the domain
> 
> {\it DOM0\_IOPL}:
> 
> {\it DOM0\_MSR}:
> 
> {\it DOM0\_DEBUG}:
> 
> {\it DOM0\_SETTIME}: set system time
> 
> {\it DOM0\_READCONSOLE}: read console content from hypervisor buffer ring
> 
> {\it DOM0\_PINCPUDOMAIN}: pin domain to a particular CPU
> 
> 
> \section{network\_op(network\_op\_t *op)} 
> update network ruleset
> 
> \section{ block\_io\_op(block\_io\_op\_t *op)}
> 
> \section{ set\_debugreg(int reg, unsigned long value)}
> set debug register reg to value
> 
> \section{ get\_debugreg(int reg)}
>  get the debug register reg
> 
> \section{ update\_descriptor(unsigned long pa, unsigned long word1, unsigned long word2)} 
> 
> \section{ set\_fast\_trap(int idx)}
>  install traps to allow guest OS to bypass hypervisor
> 
> \section{ dom\_mem\_op(dom\_mem\_op\_t *op)}
>  increase or decrease memory reservations for guest OS
> 
> \section{ multicall(multicall\_entry\_t *call\_list, int nr\_calls)}
>  execute a series of hypervisor calls
> 
> \section{ kbd\_op(unsigned char op, unsigned char val)}
> 
> \section{update\_va\_mapping(unsigned long page\_nr, unsigned long val, unsigned long flags)}
> 
> \section{ event\_channel\_op(unsigned int cmd, unsigned int id)} 
> inter-domain event-channel management, options are: open, close, send, and status.
===== docs/style.tex 1.1 vs edited =====
7c7
< \usepackage{sabon}
---
> % \usepackage{sabon}
===== extras/mini-os/Makefile 1.5 vs edited =====
6c6,7
< CFLAGS  := -fno-builtin -O3 -Wall -Ih/
---
> CFLAGS  := -fno-builtin -O3 -Wall -Ih/ -Wcast-qual -Wredundant-decls -Wnested-externs -Wstrict-prototypes -Wpointer-arith -Winline -ansi
> 
===== extras/mini-os/events.c 1.3 vs edited =====
35,36c35,36
< 
<     if (ev >= NR_EVS) {
---
> 	/* assuming ev can't be negative */
>     if ((unsigned int)ev >= NR_EVS) {
94c94
<     int i;
---
>     unsigned int i;
===== extras/mini-os/kernel.c 1.8 vs edited =====
63c63
< 
---
> extern char shared_info[PAGE_SIZE];
66c66
<     extern char shared_info[PAGE_SIZE];
---
> 
===== extras/mini-os/mm.c 1.4 vs edited =====
90c90
<         int i;
---
>         unsigned int i;
222c222
<     int i;
---
>     unsigned int i;
259c259
<         for ( i = PAGE_SHIFT; (1<<(i+1)) <= range; i++ )
---
>         for ( i = PAGE_SHIFT;  (unsigned int)(1<<(i+1)) <= range;  i++ )
281c281
<     int i;
---
>     unsigned int i;
300c300
<     while ( i != order )
---
>     while ( i != (unsigned int)order )
===== extras/mini-os/time.c 1.4 vs edited =====
91c91
< static inline unsigned long get_time_delta_usecs(void)
---
> __inline__ static unsigned long get_time_delta_usecs(void)
===== extras/mini-os/traps.c 1.5 vs edited =====
60c60
< static inline void dump_code(unsigned eip)
---
> __inline__ static void dump_code(unsigned eip)
83c83
< static void inline do_trap(int trapnr, char *str,
---
> __inline__ static void do_trap(int trapnr, char *str,
87c87
<   printf("%d %s", trapnr, str);
---
>   printf("%d %s : %ld", trapnr, str, error_code);
===== extras/mini-os/h/hypervisor.h 1.8 vs edited =====
35c35
< static inline int HYPERVISOR_set_trap_table(trap_info_t *table)
---
> __inline__ static int HYPERVISOR_set_trap_table(trap_info_t *table)
46c46
< static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
---
> __inline__ static int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
57c57
< static inline int HYPERVISOR_console_write(const char *str, int count)
---
> __inline__ static int HYPERVISOR_console_write(const char *str, int count)
69c69
< static inline int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
---
> __inline__ static int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
81c81
< static inline int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
---
> __inline__ static int HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
92c92
< static inline int HYPERVISOR_set_callbacks(
---
> __inline__ static int HYPERVISOR_set_callbacks(
106c106
< static inline int HYPERVISOR_net_io_op(netop_t *op)
---
> __inline__ static int HYPERVISOR_net_io_op(netop_t *op)
117c117
< static inline int HYPERVISOR_fpu_taskswitch(void)
---
> __inline__ static int HYPERVISOR_fpu_taskswitch(void)
127c127
< static inline int HYPERVISOR_yield(void)
---
> __inline__ static int HYPERVISOR_yield(void)
138c138
< static inline int HYPERVISOR_exit(void)
---
> __inline__ static int HYPERVISOR_exit(void)
149c149
< static inline int HYPERVISOR_stop(void)
---
> __inline__ static int HYPERVISOR_stop(void)
160c160
< static inline int HYPERVISOR_dom0_op(void *dom0_op)
---
> __inline__ static int HYPERVISOR_dom0_op(void *dom0_op)
171c171
< static inline int HYPERVISOR_network_op(void *network_op)
---
> __inline__ static int HYPERVISOR_network_op(void *network_op)
182c182
< static inline int HYPERVISOR_block_io_op(unsigned int op)
---
> __inline__ static int HYPERVISOR_block_io_op(unsigned int op)
193c193
< static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value)
---
> __inline__ static int HYPERVISOR_set_debugreg(int reg, unsigned long value)
204c204
< static inline unsigned long HYPERVISOR_get_debugreg(int reg)
---
> __inline__ static unsigned long HYPERVISOR_get_debugreg(int reg)
215c215
< static inline int HYPERVISOR_update_descriptor(
---
> __inline__ static int HYPERVISOR_update_descriptor(
227c227
< static inline int HYPERVISOR_set_fast_trap(int idx)
---
> __inline__ static int HYPERVISOR_set_fast_trap(int idx)
238c238
< static inline int HYPERVISOR_dom_mem_op(void *dom_mem_op)
---
> __inline__ static int HYPERVISOR_dom_mem_op(void *dom_mem_op)
249c249
< static inline int HYPERVISOR_multicall(void *call_list, int nr_calls)
---
> __inline__ static int HYPERVISOR_multicall(void *call_list, int nr_calls)
260c260
< static inline long HYPERVISOR_kbd_op(unsigned char op, unsigned char val)
---
> __inline__ static long HYPERVISOR_kbd_op(unsigned char op, unsigned char val)
271c271
< static inline int HYPERVISOR_update_va_mapping(
---
> __inline__ static int HYPERVISOR_update_va_mapping(
===== extras/mini-os/h/lib.h 1.2 vs edited =====
117c117
< struct mallinfo mallinfo();
---
> struct mallinfo mallinfo(void);
126c126
< void malloc_stats();
---
> void malloc_stats(void);
===== extras/mini-os/h/mm.h 1.3 vs edited =====
54c54
< static inline unsigned long phys_to_machine(unsigned long phys)
---
> __inline__ static unsigned long phys_to_machine(unsigned long phys)
60c60
< static inline unsigned long machine_to_phys(unsigned long machine)
---
> __inline__ static unsigned long machine_to_phys(unsigned long machine)
72c72
< void init_mm();
---
> void init_mm(void);
===== extras/mini-os/h/os.h 1.5 vs edited =====
140,141c140,142
< #define __xg(x) ((struct __xchg_dummy *)(x))
< static inline unsigned long __xchg(unsigned long x, volatile void * ptr,
---
> #define __xg(x) ((volatile struct __xchg_dummy *)(x))
> 
> __inline__ static unsigned long __xchg(unsigned long x, volatile void * ptr,
===== extras/mini-os/lib/malloc.c 1.3 vs edited =====
47c47
<     static void *last;
---
>     static char *last;
49c49
<     void *ret;
---
>     char *ret;
57c57
<     ret = (void *)alloc_pages(order);
---
>     ret = (char *)alloc_pages(order);
===== extras/mini-os/lib/math.c 1.2 vs edited =====
143,144c143
< __qdivrem(uq, vq, arq)
< 	u64 uq, vq, *arq;
---
> __qdivrem(u64 uq, u64 vq, u64 *arq)
366,367c365
< __udivdi3(a, b)
<         u64 a, b;
---
> __udivdi3(u64 a, u64 b)
377,378c375
< __umoddi3(a, b)
<         u_quad_t a, b;
---
> __umoddi3(u_quad_t a, u_quad_t b)
===== extras/mini-os/lib/string.c 1.2 vs edited =====
38c38,39
< 	char *tmp = (char *) dest, *s = (char *) src;
---
> 	char *tmp = (char *) dest;
> 	const char *s = (const char *) src;
123c124
<         return (char *) s;
---
>         return (char *)s;
126,138c127,144
< char * strstr(const char * s1,const char * s2)
< {
<         int l1, l2;
< 
<         l2 = strlen(s2);
<         if (!l2)
<                 return (char *) s1;
<         l1 = strlen(s1);
<         while (l1 >= l2) {
<                 l1--;
<                 if (!memcmp(s1,s2,l2))
<                         return (char *) s1;
<                 s1++;
---
> /*
>  * Find the first occurrence of find in s.
>  */
> char *
> strstr(const char *s, const char *find)
> {
> 	    char c, sc;
>         size_t len;
> 
>         if ((c = *find++) != 0) {
>                 len = strlen(find);
>                 do {
>                         do {
>                                 if ((sc = *s++) == 0)
>                                         return (NULL);
>                         } while (sc != c);
>                 } while (strncmp(s, find, len) != 0);
>                 s--;
140c146
<         return NULL;
---
>         return ((char *)s);
142d147
< 
===== xen/include/hypervisor-ifs/block.h 1.24 vs edited =====
107,108c107,108
<     int         max;             // maximumum number of disks to report
<     xen_disk_t *disks;           // pointer to array of disk info 
---
>     int         max;             /* maximumum number of disks to report */
>     xen_disk_t *disks;           /* pointer to array of disk info */ 
110c110
<     int         count;           // how many disks we have info about 
---
>     int         count;           /* how many disks we have info about */

[-- Attachment #3: Type: TEXT/PLAIN, Size: 19371 bytes --]

\documentclass[11pt,twoside,final,openright]{xenstyle}
\usepackage{a4,graphicx,setspace}
\setstretch{1.15}
\input{style.tex}

\begin{document}

% TITLE PAGE
\pagestyle{empty}
\begin{center}
\vspace*{\fill}
\includegraphics{eps/xenlogo.eps}
\vfill
\vfill
\vfill
\begin{tabular}{l}
{\Huge \bf Interface manual} \\[4mm]
{\huge Xen v1.3 for x86} \\[80mm]

{\Large Xen is Copyright (c) 2004, The Xen Team} \\[3mm]
{\Large University of Cambridge, UK} \\[20mm]
{\large Last updated on 18th January, 2004}
\end{tabular}
\vfill
\end{center}
\cleardoublepage

% TABLE OF CONTENTS
\pagestyle{plain}
\pagenumbering{roman}
{ \parskip 0pt plus 1pt
  \tableofcontents }
\cleardoublepage

% PREPARE FOR MAIN TEXT
\pagenumbering{arabic}
\raggedbottom
\widowpenalty=10000
\clubpenalty=10000
\parindent=0pt
\renewcommand{\topfraction}{.8}
\renewcommand{\bottomfraction}{.8}
\renewcommand{\textfraction}{.2}
\renewcommand{\floatpagefraction}{.8}
\setstretch{1.15}

\chapter{Introduction}
Xen allows the hardware resources of a machine to be virtualized and
dynamically partitioned such as to allow multiple different 'guest'
operating system images to be run simultaneously.

Virtualizing the machine in this manner provides flexibility allowing
different users to choose their preferred operating system (Windows,
Linux, FreeBSD, or a custom operating system). Furthermore, Xen provides
secure partitioning between these 'domains', and enables better resource
accounting and QoS isolation than can be achieved with a conventional
operating system.

The hypervisor runs directly on server hardware and dynamically partitions
it between a number of {\it domains}, each of which hosts an instance
of a {\it guest operating system}. The hypervisor provides just enough
abstraction of the machine to allow effective isolation and resource 
management between these domains.

Xen essentially takes a virtual machine approach as pioneered by IBM VM/370.
However, unlike VM/370 or more recent efforts such as VMWare and Virtual PC,
Xen does not attempt to completely virtualize the underlying hardware. Instead
parts of the hosted guest operating systems are modified to work with the hypervisor; the
operating system is effectively ported to a new target architecture, typically
requiring changes in just the machine-dependent code. The user-level API is
unchanged, thus existing binaries and operating system distributions can work
unmodified.

In addition to exporting virtualized instances of CPU, memory, network and
block devices, Xen exposes a control interface to set how these resources
are shared between the running domains. The control interface is privileged
and may only be accessed by one particular virtual machine: {\it domain0}.
This domain is a required part of any Xen-based server and runs the application
software that manages the control-plane aspects of the platform. Running the
control software in {\it domain0}, distinct from the hypervisor itself, allows
the Xen framework to separate the notions of {\it mechanism} and {\it policy}
within the system.


\chapter{CPU state}

All privileged state must be handled by Xen. The guest OS has no direct access
to CR3 and is not permitted to update privileged bits in EFLAGS.

\chapter{Exceptions}
The IDT is virtualised by submitting a virtual 'trap
table' to Xen. Most trap handlers are identical to native x86
handlers. The page-fault handler is a notable exception.

\chapter{Interrupts and events}
Interrupts are virtualized by mapping them to events, which are delivered 
asynchronously to the target domain. A guest OS can map these events onto
its standard interrupt dispatch mechanisms, such as a simple vectoring 
scheme. Each physical interrupt source controlled by the hypervisor, including
network devices, disks, or the timer subsystem, is responsible for identifying
the target for an incoming interrupt and sending an event to that domain.

This demultiplexing mechanism also provides a device-specific mechanism for 
event coalescing or hold-off. For example, a guest OS may request to only 
actually receive an event after {\it n} packets are queued ready for delivery
to it, or {\it t} nanoseconds after the first packet arrived (whichever is true
first). This allows latency and throughput requirements to be addressed on a
domain-specific basis.

\chapter{Time}
Guest operating systems need to be aware of the passage of real time and their
own ``virtual time'', i.e. the time they have been executing. Furthermore, a
notion of time is required in the hypervisor itself for scheduling and the
activities that relate to it. To this end the hypervisor provides four notions
of time: cycle counter time, system time, wall clock time, and domain virtual
time.


\section{Cycle counter time}
This provides the finest-grained, free-running time reference, with the approximate
frequency being publicly accessible. The cycle counter time is used to accurately
extrapolate the other time references. On SMP machines it is currently assumed
that the cycle counter time is synchronised between CPUs. The current x86-based
implementation achieves this within inter-CPU communication latencies.

\section{System time}
This is a 64-bit value containing the nanoseconds elapsed since boot time. Unlike
cycle counter time, system time accurately reflects the passage of real time, i.e.
it is adjusted several times a second for timer drift. This is done by running an
NTP client in {\it domain0} on behalf of the machine, feeding updates to the 
hypervisor. Intermediate values can be extrapolated using the cycle counter. 

\section{Wall clock time}
This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and
microseconds since 1 January 1970, adjusted by leap seconds etc.). Again, an 
NTP client hosted by {\it domain0} can help maintain this value. To guest 
operating systems this value will be reported instead of the hardware RTC
clock value and they can use the system time and cycle counter times to start
and remain perfectly in time.


\section{Domain virtual time}
This progresses at the same pace as cycle counter time, but only while a domain
is executing. It stops while a domain is de-scheduled. Therefore the share of the 
CPU that a domain receives is indicated by the rate at which its domain virtual
time increases, relative to the rate at which cycle counter time does so.

\chapter{Memory}

The hypervisor is responsible for providing memory to each of the domains running 
over it. However, the Xen hypervisor's duty is restricted to managing physical
memory and to policing page table updates. All other memory management functions
are handled externally. Start-of-day issues such as building initial page tables
for a domain, loading its kernel image and so on are done by the {\it domain builder}
running in user-space with {\it domain0}. Paging to disk and swapping is handled
by the guest operating systems themselves, if they need it.

On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It has full
access to the physical memory available in the system and is responsible for 
allocating portions of it to the domains. Guest operating systems run in and use
{\it rings 1}, {\it 2} and {\it 3} as they see fit, aside from the fact that
segmentation is used to prevent the guest OS from accessing a portion of the 
linear address space that is reserved for use by the hypervisor. This approach
allows transitions between the guest OS and hypervisor without flushing the TLB.
We expect most guest operating systems will use ring 1 for their own operation
and place applications (if they support such a notion) in ring 3.

\section{Physical Memory Allocation}
The hypervisor reserves a small fixed portion of physical memory at system boot
time. This special memory region is located at the beginning of physical memory
and is mapped at the very top of every virtual address space. 

Any physical memory that is not used directly by the hypervisor is divided into
pages and is available for allocation to domains. The hypervisor tracks which
pages are free and which pages have been allocated to each domain. When a new
domain is initialized, the hypervisor allocates it pages drawn from the free 
list. The amount of memory required by the domain is passed to the hypervisor
as one of the parameters for new domain initialization by the domain builder.

Domains can never be allocated further memory beyond that which was requested
for them on initialization. However, a domain can return pages to the hypervisor
if it discovers that its memory requirements have diminished.

% put reasons for why pages might be returned here.
\section{Page Table Updates}
In addition to managing physical memory allocation, the hypervisor is also in
charge of performing page table updates on behalf of the domains. This is 
necessary to prevent domains from adding arbitrary mappings to their page
tables or introducing mappings to others' page tables.




\section{Pseudo-Physical Memory}
The usual problem of external fragmentation means that a domain is unlikely to
receive a contiguous stretch of physical memory. However, most guest operating
systems do not have built-in support for operating in a fragmented physical
address space, e.g. Linux has to have a one-to-one mapping for its physical
memory. Therefore a notion of {\it pseudo physical memory} is introduced.
Once a domain is allocated a number of pages, at its start of the day, one of
the first things it needs to do is build its own {\it real physical} to 
{\it pseudo physical} mapping. From that moment onwards {\it pseudo physical}
addresses are used instead of discontiguous {\it real physical} addresses. Thus,
the rest of the guest OS code has an impression of operating in a contiguous
address space. Guest OS page tables contain real physical addresses. Mapping
{\it pseudo physical} to {\it real physical} addresses is needed on page
table updates and also on remapping memory regions with the guest OS.



\chapter{Network I/O}
Since the hypervisor must multiplex network resources, its network subsystem
may be viewed as a virtual network switching element with each domain having
one or more virtual network interfaces to this network.

The hypervisor acts conceptually as an IP router, forwarding each domain's
traffic according to a set of rules.

\section{Hypervisor Packet Handling}
The hypervisor is responsible primarily for {\it data-path} operations.
In terms of networking this means packet transmission and reception.

On the transmission side, the hypervisor needs to perform three key actions:
\begin{itemize}
\item {\tt Validation:} A domain is only allowed to emit packets matching a certain
specification; for example, ones in which the source IP address matches
one assigned to the virtual interface over which it is sent. The hypervisor
is responsible for ensuring any such requirements are met, either by checking
or by stamping outgoing packets with prescribed values for certain fields.

\item {\tt Scheduling:} Since a number of domains can share a single ``real'' network 
interface, the hypervisor must mediate access when several domains each 
have packets queued for transmission. Of course, this general scheduling
function subsumes basic shaping or rate-limiting schemes.

\item {\tt Logging and Accounting:} The hypervisor can be configured with classifier 
rules that control how packets are accounted or logged. For example, 
{\it domain0} could request that it receives a log message or copy of the
packet whenever another domain attempts to send a TCP packet containing a 
SYN.
\end{itemize}
On the receive side, the hypervisor's role is relatively straightforward:
once a packet is received, it just needs to determine the virtual interface(s)
to which it must be delivered and deliver it via page-flipping. 


\section{Data Transfer}

Each virtual interface uses two ``descriptor rings'', one for transmit,
the other for receive. Each descriptor identifies a block of contiguous
physical memory allocated to the domain. There are four cases:

\begin{itemize}

\item The transmit ring carries packets to transmit from the domain to the
hypervisor.

\item The return path of the transmit ring carries ``empty'' descriptors
indicating that the contents have been transmitted and the memory can be
re-used.

\item The receive ring carries empty descriptors from the domain to the 
hypervisor; these provide storage space for that domain's received packets.

\item The return path of the receive ring carries packets that have been
received.
\end{itemize}

Real physical addresses are used throughout, with the domain performing 
translation from pseudo-physical addresses if that is necessary.

If a domain does not keep its receive ring stocked with empty buffers then 
packets destined to it may be dropped. This provides some defense against 
receiver-livelock problems because an overloaded domain will cease to receive
further data. Similarly, on the transmit path, it provides the application
with feedback on the rate at which packets are able to leave the system.

Synchronization between the hypervisor and the domain is achieved using 
counters held in shared memory that is accessible to both. Each ring has
associated producer and consumer indices indicating the area in the ring
that holds descriptors that contain data. After receiving {\it n} packets
or {\it t} nanoseconds after receiving the first packet, the hypervisor sends
an event to the domain. 

\chapter{Block I/O}

\section{Virtual Block Devices (VBDs)}

All guest OS disk access goes through the VBD interface. The VBD interface
provides the administrator with the ability to selectively grant domains 
access to portions of block storage devices visible to the system.

A VBD can also be comprised of a set of extents from multiple storage devices.
This provides the same functionality as a concatenated disk driver.

\section{Virtual Disks (VDs)}

VDs are an abstraction built on top of the VBD interface. One can reserve disk
space for use by the VD layer. This space is then managed as a pool of free extents.
The VD tools can automatically allocate collections of extents from this pool to
create ``virtual disks'' on demand. 

\subsection{Virtual Disk Management}
The VD management code consists of a set of python libraries. It can therefore
be accessed by custom scripts as well as the convenience scripts provided. The
VD database is a SQLite database in /var/spool/xen\_vdisk.sqlite.

The VD scripts and general VD usage are documented in the VBD-HOWTO.txt.

\subsection{Data Transfer}
Domains which have been granted access to a logical block device are permitted
to read and write it directly through the hypervisor, rather than requiring
{\it domain0} to mediate every data access. 

In overview, the same style of descriptor-ring that is used for network
packets is used here. Each domain has one ring that carries operation requests to the 
hypervisor and carries the results back again. 

Rather than copying data in and out of the hypervisor, we use page pinning to
enable DMA transfers directly between the physical device and the domain's 
buffers. Disk read operations are straightforward; the hypervisor just needs
to know which pages have pending DMA transfers, and prevent the guest OS from
giving those pages back to the hypervisor, or using them for storing page tables.

%block API here 

\chapter{Privileged operations}
{\it Domain0} is responsible for building all other domains on the server
and providing control interfaces for managing scheduling, networking, and
blocks.


\chapter{Hypervisor calls}

\section{ set\_trap\_table(trap\_info\_t *table)} 

Install trap handler table.

\section{ mmu\_update(mmu\_update\_t *req, int count)} 
Update the page table for the domain. Updates can be batched.
The update types are: 

{\it MMU\_NORMAL\_PT\_UPDATE}:

{\it MMU\_UNCHECKED\_PT\_UPDATE}:

{\it MMU\_MACHPHYS\_UPDATE}:

{\it MMU\_EXTENDED\_COMMAND}:

\section{ console\_write(const char *str, int count)}
Output buffer str to the console.

\section{ set\_gdt(unsigned long *frame\_list, int entries)} 
Set the global descriptor table - virtualization for lgdt.

\section{ stack\_switch(unsigned long ss, unsigned long esp)} 
Request context switch from hypervisor.

\section{ set\_callbacks(unsigned long event\_selector, unsigned long event\_address,
    			unsigned long failsafe\_selector, unsigned long failsafe\_address) } 
 Register OS event processing routine. In Linux both the event\_selector and 
failsafe\_selector are the kernel's CS. The value event\_address specifies the address for
an interrupt handler dispatch routine and failsafe\_address specifies a handler for 
application faults.

\section{ net\_io\_op(netop\_t *op)}  
Notify hypervisor of updates to transmit and/or receive descriptor rings.

\section{ fpu\_taskswitch(void)} 
Notify hypervisor that fpu registers need to be saved on context switch.

\section{ sched\_op(unsigned long op)} 
Request scheduling operation from hypervisor. The options are: yield, stop, and exit.

\section{ dom0\_op(dom0\_op\_t *op)} 
Administrative domain operations for domain management. The options are:

{\it DOM0\_CREATEDOMAIN}: create new domain, specifying the name and memory usage
in kilobytes.

{\it DOM0\_STARTDOMAIN}: make domain schedulable

{\it DOM0\_STOPDOMAIN}: mark domain as unschedulable

{\it DOM0\_DESTROYDOMAIN}: deallocate resources associated with the domain

{\it DOM0\_GETMEMLIST}: get list of pages used by the domain

{\it DOM0\_BUILDDOMAIN}: do final guest OS setup for domain

{\it DOM0\_BVTCTL}: adjust scheduler context switch time

{\it DOM0\_ADJUSTDOM}: adjust scheduling priorities for domain

{\it DOM0\_GETDOMAINFO}: get statistics about the domain

{\it DOM0\_IOPL}:

{\it DOM0\_MSR}:

{\it DOM0\_DEBUG}:

{\it DOM0\_SETTIME}: set system time

{\it DOM0\_READCONSOLE}: read console content from hypervisor buffer ring

{\it DOM0\_PINCPUDOMAIN}: pin domain to a particular CPU


\section{network\_op(network\_op\_t *op)} 
update network ruleset

\section{ block\_io\_op(block\_io\_op\_t *op)}

\section{ set\_debugreg(int reg, unsigned long value)}
set debug register reg to value

\section{ get\_debugreg(int reg)}
 get the debug register reg

\section{ update\_descriptor(unsigned long pa, unsigned long word1, unsigned long word2)} 

\section{ set\_fast\_trap(int idx)}
 install traps to allow guest OS to bypass hypervisor

\section{ dom\_mem\_op(dom\_mem\_op\_t *op)}
 increase or decrease memory reservations for guest OS

\section{ multicall(multicall\_entry\_t *call\_list, int nr\_calls)}
 execute a series of hypervisor calls

\section{ kbd\_op(unsigned char op, unsigned char val)}

\section{update\_va\_mapping(unsigned long page\_nr, unsigned long val, unsigned long flags)}

\section{ event\_channel\_op(unsigned int cmd, unsigned int id)} 
inter-domain event-channel management, options are: open, close, send, and status.

\end{document}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-25  1:27   ` Kip Macy
@ 2004-01-25  1:58     ` Keir Fraser
  2004-01-25  2:07       ` Kip Macy
  0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2004-01-25  1:58 UTC (permalink / raw)
  To: Kip Macy; +Cc: Keir Fraser, xen-devel

> > Personally I dislike the warnings GCC emits when it's placed in
> > super-anal mode. I would never take fixes for Xen or Xenolinux to stop
> > those warnings since I think GCC should be run with a more permissive
> > set of options.
> 
> I admit that many of the options are just plain annoying. However, in
> one instance you were casting away volatile, which could get you in
> trouble.

Hmmm... volatile generally gets sprinkled round like magic dust. I'm
not convinced it's usually needed.

> There are more mundane problems that could cause issues, such
> as signed/unsigned comparisons. I would argue that you might as well
> never use the const qualifier if you're just going to cast it away in
> most cases. In addition you frequently do pointer arithmetic with void
> pointers, you're just relying on the compiler to know that you really
> mean char pointer. The changes to inline usage are neccessary to get
> the code to compile at all with -ansi.

I'll give the fixes a look - probably when I get back to Cambridge in
the middle of next week. I'll probably take them all for the mini-os -
I'll have a think about whether any of the warning options should be
added to Xen as well.

 -- Keir


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-25  1:58     ` Keir Fraser
@ 2004-01-25  2:07       ` Kip Macy
  2004-01-25  3:17         ` Keir Fraser
  0 siblings, 1 reply; 8+ messages in thread
From: Kip Macy @ 2004-01-25  2:07 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

>
> Hmmm... volatile generally gets sprinkled round like magic dust. I'm
> not convinced it's usually needed.

Looking closer at the code, it looks like gcc is just being dumb about
the context. The overhead of using -Wcast-qual and most of the others
probably isn't worth it. The one that is probably worth having for Xen,
if only because doing arithmetic on void pointers seems like blatantly
bad form, is -Wpointer-arith.

>
> I'll give the fixes a look - probably when I get back to Cambridge in
> the middle of next week. I'll probably take them all for the mini-os -
> I'll have a think about whether any of the warning options should be
> added to Xen as well.
>

Thanks.

			-Kip


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-25  2:07       ` Kip Macy
@ 2004-01-25  3:17         ` Keir Fraser
  2004-01-25  3:34           ` Kip Macy
  0 siblings, 1 reply; 8+ messages in thread
From: Keir Fraser @ 2004-01-25  3:17 UTC (permalink / raw)
  To: Kip Macy; +Cc: Keir Fraser, xen-devel

> >
> > Hmmm... volatile generally gets sprinkled round like magic dust. I'm
> > not convinced it's usually needed.
> 
> Looking closer at the code, it looks like gcc is just being dumb about
> the context. The overhead of using -Wcast-qual and most of the others
> probably isn't worth it. The one that is probably worth having for Xen,
> if only because doing arithmetic on void pointers seems like blatantly
> bad form, is -Wpointer-arith.

Actually, I've looked at and applied your patch just now.

I think that most of the warnings options may be sane for Xen
(especially once the crufty Linux device drivers have been moved out
of the code base).

The only two possible exceptions are -Wcast-qual and
-Wnested-externs. I like being able to make extern declarations in
function scope (it's for when I'm too lazy to place it in a sane
header file, and it indicates that only one function needs to be fixed
up if I ever want to do the declaration properly). I could perhaps
live without that though...

However, -Wcast-qual really sucks. I haven't even added it to the
mini-os! Anything which makes it impossible to implement Standard C
functions (eg. strstr()) without causing compiler warnings is just
plain wrong!

I guess there's a philosophical argument about which is wrong (the
StdC definition of 'const' or the StdC definition of 'strstr') but
basically I'd like to keep the usual prototype for the string
functions but not have to suffer compile warnings :-) 'const' and
'volatile' are both difficult to use sanely -- I try to avoid them
wherever possible.

 -- Keir


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: compile fixes for mini-os
  2004-01-25  3:17         ` Keir Fraser
@ 2004-01-25  3:34           ` Kip Macy
  2004-01-25  6:11             ` C qualifiers Keir Fraser
  0 siblings, 1 reply; 8+ messages in thread
From: Kip Macy @ 2004-01-25  3:34 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

>
> Actually, I've looked at and applied your patch just now.

Cool. That was a lot faster than "the middle of next week" :-).

>
> I think that most of the warnings options may be sane for Xen
> (especially once the crufty Linux device drivers have been moved out
> of the code base).

Good.

> live without that though...

I'm not actually sure what is wrong with the nested extern declaration.
In general I think externs are a sign of sloppiness (to which I'm in no
way immune). However, if one is going to use them, I don't see what is
wrong with putting them right where one needs them.

>
> However, -Wcast-qual really sucks. I haven't even added it to the
> mini-os! Anything which makes it impossible to implement Standard C
> functions (eg. strstr()) without causing compiler warnings is just
> plain wrong!

I think one needs to be able to exempt functions from checking just
as one can with lint. For example, lint checks that if a function
returns a value, any caller should check the return value. "printf"
returns an int, but *no one* does or should check the value of printf.
In the absence of such a facility I'm willing to let casting issues
slide.

>
> I guess there's a philosophical argument about which is wrong (the
> StdC definition of 'const' or the StdC definition of 'strstr') but
> basically I'd like to keep the usual prototype for the string
> functions but not have to suffer compile warnings :-) 'const' and
> 'volatile' are both difficult to use sanely -- I try to avoid them
> wherever possible.

I think const is a perfectly usable construct. It is great for
allowing the compiler to check that functions that are supposed to
treat their inputs as immutable, do in fact not mutate them. I think
the StdC function definitions are braindead. How is it that you can
pass in an immutable reference to a function and then have that function
return a mutable reference to the same data? That doesn't make *any*
sense to me. If someone can explain the rationale behind it, I'm happy
to listen.


Thanks Keir. I hope that shortly I will be contributing something more
than just compiler warnings ;-).


					-Kip


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: C qualifiers
  2004-01-25  3:34           ` Kip Macy
@ 2004-01-25  6:11             ` Keir Fraser
  0 siblings, 0 replies; 8+ messages in thread
From: Keir Fraser @ 2004-01-25  6:11 UTC (permalink / raw)
  To: Kip Macy; +Cc: Keir Fraser, xen-devel

> > I guess there's a philosophical argument about which is wrong (the
> > StdC definition of 'const' or the StdC definition of 'strstr') but
> > basically I'd like to keep the usual prototype for the string
> > functions but not have to suffer compile warnings :-) 'const' and
> > 'volatile' are both difficult to use sanely -- I try to avoid them
> > wherever possible.
> 
> I think const is a perfectly usable construct. It is great for
> allowing the compiler to check that functions that are supposed to
> treat their inputs as immutable, do in fact not mutate them. I think
> the StdC function definitions are braindead. How is it that you can
> pass in an immutable reference to a function and then have that function
> return a mutable reference to the same data? That doesn't make *any*
> sense to me. If someone can explain the rationale behind it, I'm happy
> to listen.

Right. The string functions shouldn't be const-qualifying their
parameters. Unfortunately 'const' is so sticky that when you start
applying it to function parameters then it starts to proliferate
within your program --- I think it creates more hassle than it could
possibly save. If you could squash 'const' when computing a return
value then that would be very useful -- essentially trust the caller
to know what they're doing and only write via the return value if they
passed in a mutable argument.

'volatile' has similar problems and it's broken to use
it in portable programs anyway (and often broken even in non-portable
programs, if the program is multithreaded [since the CPU can reorder
volatile memory accesses, even though they were emitted in the correct
order by the compiler]).

 -- Keir


-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2004-01-25  6:11 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-01-24 21:39 compile fixes for mini-os Kip Macy
2004-01-25  1:00 ` Keir Fraser
2004-01-25  1:27   ` Kip Macy
2004-01-25  1:58     ` Keir Fraser
2004-01-25  2:07       ` Kip Macy
2004-01-25  3:17         ` Keir Fraser
2004-01-25  3:34           ` Kip Macy
2004-01-25  6:11             ` C qualifiers Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.