* [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

The VMI backend uses explicit page type notification to track shadow page
tables.  The allocation of page table roots is especially tricky.  We need to
clone the root for non-PAE mode while it is protected under the pgd lock to
correctly copy the shadow.

We don't need an allocation hook for pgds in PAE mode (PDPs, in Intel
terminology), as they only have 4 entries and are cached entirely by the
processor, which makes shadowing them rather simple.

For base page table level allocation, pmd_populate provides the exact hook
point we need.  We also need the allocation hook when splitting a large page,
and we must release pages before returning them to any free pool.

Although these hooks have slightly odd semantics dictated by VMI, Xen also
uses them to determine the exact moment when page tables are created or
released.
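
To illustrate the shape of these hooks outside the kernel, here is a minimal
user-space sketch (the struct and function names are simplified stand-ins,
not the real paravirt_ops interface): the hooks default to no-ops on native
hardware, and a shadow-paging backend simply installs its own notifiers.

#include <stdio.h>

/* Simplified stand-in for the page-table hooks in struct paravirt_ops. */
struct pt_hooks {
	void (*alloc_pt)(unsigned pfn);	/* page becomes an L1 page table */
	void (*alloc_pd)(unsigned pfn);	/* page becomes an L2 directory */
	void (*alloc_pd_clone)(unsigned pfn, unsigned clonepfn,
			       unsigned start, unsigned count);
	void (*release_pt)(unsigned pfn);	/* page leaves page-table duty */
	void (*release_pd)(unsigned pfn);
};

/* Native hardware needs no notification at all. */
static void nop_page(unsigned pfn) { (void)pfn; }
static void nop_clone(unsigned pfn, unsigned clonepfn,
		      unsigned start, unsigned count)
{
	(void)pfn; (void)clonepfn; (void)start; (void)count;
}

/* A shadow-paging hypervisor backend installs real notifiers instead. */
static void shadow_alloc_pt(unsigned pfn)
{
	printf("hypervisor: begin shadowing L1 table at pfn %u\n", pfn);
}

static struct pt_hooks hooks = {
	.alloc_pt	= nop_page,
	.alloc_pd	= nop_page,
	.alloc_pd_clone	= nop_clone,
	.release_pt	= nop_page,
	.release_pd	= nop_page,
};

int main(void)
{
	hooks.alloc_pt(42);		/* native: a silent no-op */
	hooks.alloc_pt = shadow_alloc_pt;	/* backend takes over */
	hooks.alloc_pt(42);		/* now the hypervisor is notified */
	return 0;
}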

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/paravirt.c |    6 ++++++
 arch/i386/mm/init.c         |    4 ++++
 arch/i386/mm/pageattr.c     |    2 ++
 arch/i386/mm/pgtable.c      |   24 ++++++++++++++++++++----
 include/asm-i386/paravirt.h |   14 ++++++++++++++
 include/asm-i386/pgalloc.h  |   30 ++++++++++++++++++++++++++----
 6 files changed, 72 insertions(+), 8 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -550,6 +550,12 @@ struct paravirt_ops paravirt_ops = {
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
 
+	.alloc_pt = (void *)native_nop,
+	.alloc_pd = (void *)native_nop,
+	.alloc_pd_clone = (void *)native_nop,
+	.release_pt = (void *)native_nop,
+	.release_pd = (void *)native_nop,
+
 	.set_pte = native_set_pte,
 	.set_pte_at = native_set_pte_at,
 	.set_pmd = native_set_pmd,
Index: linux/arch/i386/mm/init.c
===================================================================
--- linux.orig/arch/i386/mm/init.c
+++ linux/arch/i386/mm/init.c
@@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(
 		
 #ifdef CONFIG_X86_PAE
 	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+	paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
 	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 	pud = pud_offset(pgd, 0);
 	if (pmd_table != pmd_offset(pud, 0)) 
@@ -82,6 +83,7 @@ static pte_t * __init one_page_table_ini
 {
 	if (pmd_none(*pmd)) {
 		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+		paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		if (page_table != pte_offset_kernel(pmd, 0))
 			BUG();	
@@ -345,6 +347,8 @@ static void __init pagetable_init (void)
 	/* Init entries of the first-level page table to the zero page */
 	for (i = 0; i < PTRS_PER_PGD; i++)
 		set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#else
+	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
 
 	/* Enable PSE if available */
Index: linux/arch/i386/mm/pageattr.c
===================================================================
--- linux.orig/arch/i386/mm/pageattr.c
+++ linux/arch/i386/mm/pageattr.c
@@ -60,6 +60,7 @@ static struct page *split_large_page(uns
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK; 
 	pbase = (pte_t *)page_address(base);
+	paravirt_alloc_pt(page_to_pfn(base));
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
                                           addr == address ? prot : ref_prot));
@@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pg
 	if (!PageReserved(kpte_page)) {
 		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
 			ClearPagePrivate(kpte_page);
+			paravirt_release_pt(page_to_pfn(kpte_page));
 			list_add(&kpte_page->lru, &df_list);
 			revert_page(kpte_page, address);
 		}
Index: linux/arch/i386/mm/pgtable.c
===================================================================
--- linux.orig/arch/i386/mm/pgtable.c
+++ linux/arch/i386/mm/pgtable.c
@@ -248,9 +248,15 @@ void pgd_ctor(void *pgd, struct kmem_cac
 	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			KERNEL_PGD_PTRS);
+
 	if (PTRS_PER_PMD > 1)
 		return;
 
+	/* must happen under lock */
+	paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+			__pa(swapper_pg_dir) >> PAGE_SHIFT,
+			USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
+
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
@@ -260,6 +266,7 @@ void pgd_dtor(void *pgd, struct kmem_cac
 {
 	unsigned long flags; /* can be called from interrupt context */
 
+	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -277,13 +284,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
 		if (!pmd)
 			goto out_oom;
+		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
 	}
 	return pgd;
 
 out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+	for (i--; i >= 0; i--) {
+		pgd_t pgdent = pgd[i];
+		void* pmd = (void *)__va(pgd_val(pgdent)-1);
+		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+		kmem_cache_free(pmd_cache, pmd);
+	}
 	kmem_cache_free(pgd_cache, pgd);
 	return NULL;
 }
@@ -294,8 +306,12 @@ void pgd_free(pgd_t *pgd)
 
 	/* in the PAE case user pgd entries are overwritten before usage */
 	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+			pgd_t pgdent = pgd[i];
+			void* pmd = (void *)__va(pgd_val(pgdent)-1);
+			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+			kmem_cache_free(pmd_cache, pmd);
+		}
 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
 }
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -127,6 +127,12 @@ struct paravirt_ops
 	void (fastcall *flush_tlb_kernel)(void);
 	void (fastcall *flush_tlb_single)(u32 addr);
 
+	void (fastcall *alloc_pt)(u32 pfn);
+	void (fastcall *alloc_pd)(u32 pfn);
+	void (fastcall *alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+	void (fastcall *release_pt)(u32 pfn);
+	void (fastcall *release_pd)(u32 pfn);
+
 	void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
 	void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
 	void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
@@ -320,6 +326,14 @@ static inline unsigned long apic_read(un
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
 #define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
 
+#define paravirt_alloc_pt(pfn) paravirt_ops.alloc_pt(pfn)
+#define paravirt_release_pt(pfn) paravirt_ops.release_pt(pfn)
+
+#define paravirt_alloc_pd(pfn) paravirt_ops.alloc_pd(pfn)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) \
+	paravirt_ops.alloc_pd_clone(pfn, clonepfn, start, count)
+#define paravirt_release_pd(pfn) paravirt_ops.release_pd(pfn)
+
 static inline void set_pte(pte_t *ptep, pte_t pteval)
 {
 	paravirt_ops.set_pte(ptep, pteval);
Index: linux/include/asm-i386/pgalloc.h
===================================================================
--- linux.orig/include/asm-i386/pgalloc.h
+++ linux/include/asm-i386/pgalloc.h
@@ -5,13 +5,31 @@
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
 
-#define pmd_populate_kernel(mm, pmd, pte) \
-		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_alloc_pt(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd(pfn) do { } while (0)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pt(pfn) do { } while (0)
+#define paravirt_release_pd(pfn) do { } while (0)
+#endif
+
+#define pmd_populate_kernel(mm, pmd, pte)			\
+do {								\
+	paravirt_alloc_pt(__pa(pte) >> PAGE_SHIFT);		\
+	set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)));		\
+} while (0)
 
 #define pmd_populate(mm, pmd, pte) 				\
+do {								\
+	paravirt_alloc_pt(page_to_pfn(pte));			\
 	set_pmd(pmd, __pmd(_PAGE_TABLE +			\
 		((unsigned long long)page_to_pfn(pte) <<	\
-			(unsigned long long) PAGE_SHIFT)))
+			(unsigned long long) PAGE_SHIFT)));	\
+} while (0)
+
 /*
  * Allocate and free page tables.
  */
@@ -32,7 +50,11 @@ static inline void pte_free(struct page 
 }
 
 
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb,pte) 					\
+do {									\
+	paravirt_release_pt(page_to_pfn(pte));				\
+	tlb_remove_page((tlb),(pte));					\
+} while (0)
 
 #ifdef CONFIG_X86_PAE
 /*


* [PATCH 2.6.21 review II] [2/10] i386: paravirt CPU hypercall batching mode
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

The VMI ROM has a mode where hypercalls can be queued and batched.  This turns
out to be a significant win during context switch, but must be done at a
specific point before side effects to CPU state are visible to subsequent
instructions.  This is similar to the MMU batching hooks already provided. 
The same hooks could be used by the Xen backend to implement a context switch
multicall.

To explain a bit more about lazy modes in the paravirt patches: the idea is
that only one of lazy CPU or MMU mode can be active at any given time.  Lazy
MMU mode is similar to this lazy CPU mode and allows batching of multiple PTE
updates (say, inside a remap loop), but to avoid keeping a state machine that
tracks when to flush CPU or MMU updates, we simply allow only one or the
other to be active.  There is no real reason a more comprehensive scheme
could not be implemented, but there is also no demonstrated need for this
extra complexity.
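
For readers new to the lazy-mode idea, a small user-space model may help
(the queue and names below are invented for illustration; the real batching
happens inside the hypervisor backend):

#include <assert.h>
#include <stdio.h>

enum lazy_mode { LAZY_NONE, LAZY_MMU, LAZY_CPU };

static enum lazy_mode mode = LAZY_NONE;
static const char *queued[16];
static int nqueued;

static void set_lazy_mode(enum lazy_mode new)
{
	/* Entry never nests, and CPU/MMU lazy modes are mutually exclusive. */
	assert(new == LAZY_NONE || mode == LAZY_NONE);
	if (new == LAZY_NONE && nqueued) {
		printf("flush: %d hypercalls batched into one exit\n", nqueued);
		nqueued = 0;
	}
	mode = new;
}

static void hypercall(const char *what)
{
	if (mode != LAZY_NONE)
		queued[nqueued++] = what;	/* defer until the mode is left */
	else
		printf("immediate hypercall: %s\n", what);
}

int main(void)
{
	set_lazy_mode(LAZY_CPU);	/* arch_enter_lazy_cpu_mode() */
	hypercall("reload page tables");
	hypercall("load TLS descriptors");
	hypercall("update kernel stack");
	set_lazy_mode(LAZY_NONE);	/* arch_leave_lazy_cpu_mode(): one flush */
	return 0;
}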

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/paravirt.c   |    1 +
 arch/i386/kernel/process.c    |   25 +++++++++++++++++--------
 include/asm-generic/pgtable.h |   13 +++++++++++++
 include/asm-i386/paravirt.h   |   15 +++++++++++++++
 kernel/sched.c                |    7 +++++++
 5 files changed, 53 insertions(+), 8 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -545,6 +545,7 @@ struct paravirt_ops paravirt_ops = {
 	.apic_write_atomic = native_apic_write_atomic,
 	.apic_read = native_apic_read,
 #endif
+	.set_lazy_mode = (void *)native_nop,
 
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -670,14 +670,6 @@ struct task_struct fastcall * __switch_t
 	load_TLS(next, cpu);
 
 	/*
-	 * Restore %gs if needed (which is common)
-	 */
-	if (prev->gs | next->gs)
-		loadsegment(gs, next->gs);
-
-	write_pda(pcurrent, next_p);
-
-	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */
 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
@@ -686,6 +678,15 @@ struct task_struct fastcall * __switch_t
 
 	disable_tsc(prev_p, next_p);
 
+	/*
+	 * Leave lazy mode, flushing any hypercalls made here.
+	 * This must be done before restoring TLS segments so
+	 * the GDT and LDT are properly updated, and must be
+	 * done before math_state_restore, so the TS bit is up
+	 * to date.
+	 */
+	arch_leave_lazy_cpu_mode();
+
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
 	 * chances of needing FPU soon are obviously high now
@@ -693,6 +694,14 @@ struct task_struct fastcall * __switch_t
 	if (next_p->fpu_counter > 5)
 		math_state_restore();
 
+	/*
+	 * Restore %gs if needed (which is common)
+	 */
+	if (prev->gs | next->gs)
+		loadsegment(gs, next->gs);
+
+	write_pda(pcurrent, next_p);
+
 	return prev_p;
 }
 
Index: linux/include/asm-generic/pgtable.h
===================================================================
--- linux.orig/include/asm-generic/pgtable.h
+++ linux/include/asm-generic/pgtable.h
@@ -183,6 +183,19 @@ static inline void ptep_set_wrprotect(st
 #endif
 
 /*
+ * A facility to provide batching of the reload of page tables with the
+ * actual context switch code for paravirtualized guests.  By convention,
+ * only one of the lazy modes (CPU, MMU) should be active at any given
+ * time, entry should never be nested, and entry and exits should always
+ * be paired.  This is for sanity of maintaining and reasoning about the
+ * kernel code.
+ */
+#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
+#define arch_enter_lazy_cpu_mode()	do {} while (0)
+#define arch_leave_lazy_cpu_mode()	do {} while (0)
+#endif
+
+/*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
  * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -146,6 +146,8 @@ struct paravirt_ops
 	void (fastcall *pmd_clear)(pmd_t *pmdp);
 #endif
 
+	void (fastcall *set_lazy_mode)(int mode);
+
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
@@ -386,6 +388,19 @@ static inline void pmd_clear(pmd_t *pmdp
 }
 #endif
 
+/* Lazy mode for batching updates / context switch */
+#define PARAVIRT_LAZY_NONE 0
+#define PARAVIRT_LAZY_MMU  1
+#define PARAVIRT_LAZY_CPU  2
+
+#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
+#define arch_enter_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_CPU)
+#define arch_leave_lazy_cpu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE)
+
+#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+#define arch_enter_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_MMU)
+#define arch_leave_lazy_mmu_mode() paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE)
+
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch {
 	u8 *instr; 		/* original instructions */
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -1843,6 +1843,13 @@ context_switch(struct rq *rq, struct tas
 	struct mm_struct *mm = next->mm;
 	struct mm_struct *oldmm = prev->active_mm;
 
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_enter_lazy_cpu_mode();
+
 	if (!mm) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);


* [PATCH 2.6.21 review II] [3/10] i386: IOPL handling for paravirt guests
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

I found a clever way to make the extra IOPL switching invisible to
non-paravirt compiles: kernel_rpl is statically defined to be zero there, and
only kernels running at a non-zero RPL have a problem restoring IOPL, because
popf does not restore the IOPL field unless run at CPL 0.
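
A stripped-down sketch of that trick (the surrounding code is invented; only
the constant-zero get_kernel_rpl() on non-paravirt builds is the point):

#include <stdio.h>

/* #define CONFIG_PARAVIRT 1 */

#ifdef CONFIG_PARAVIRT
static unsigned int kernel_rpl = 1;	/* e.g. guest kernel running at ring 1 */
#define get_kernel_rpl()	(kernel_rpl)
#else
#define get_kernel_rpl()	0	/* native kernel: always ring 0 */
#endif

static void set_iopl_mask(unsigned iopl)	/* stand-in for the real call */
{
	printf("explicit IOPL restore to %u\n", iopl);
}

static void context_switch_iopl(unsigned prev_iopl, unsigned next_iopl)
{
	/*
	 * On native builds get_kernel_rpl() is the constant 0, so the
	 * compiler deletes this branch entirely.  A paravirt guest at
	 * CPL > 0 needs it, because popf does not restore the IOPL field
	 * above ring 0.
	 */
	if (get_kernel_rpl() && prev_iopl != next_iopl)
		set_iopl_mask(next_iopl);
}

int main(void)
{
	context_switch_iopl(0, 3);	/* silent unless CONFIG_PARAVIRT is set */
	return 0;
}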

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/process.c |    9 +++++++++
 1 file changed, 9 insertions(+)

Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -670,6 +670,15 @@ struct task_struct fastcall * __switch_t
 	load_TLS(next, cpu);
 
 	/*
+	 * Restore IOPL if needed.  In normal use, the flags restore
+	 * in the switch assembly will handle this.  But if the kernel
+	 * is running virtualized at a non-zero CPL, the popf will
+	 * not restore flags, so it must be done in a separate step.
+	 */
+	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+		set_iopl_mask(next->iopl);
+
+	/*
 	 * Now maybe handle debug registers and/or IO bitmaps
 	 */
 	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)


* [PATCH 2.6.21 review II] [4/10] i386: SMP boot hook for paravirt
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

Add a VMI SMP boot hook.  We emulate a regular boot sequence and use the same
APIC IPI initiation; we just poke magic values to load into the CPU state when
the startup IPI is received, rather than having to jump through a real-mode
trampoline.

This is all that was needed to get SMP to work.
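
A compressed user-space model of the hook (the struct is a made-up miniature
of the real vmi_ap_state, and the values are placeholders; on native kernels
the hook compiles away to nothing):

#include <stdio.h>
#include <string.h>

/* Made-up miniature of the AP start-up state a hypervisor would accept. */
struct ap_state {
	unsigned long eip;	/* where the AP should begin executing */
	unsigned long esp;	/* its initial stack pointer */
	unsigned long cr3;	/* page tables to load */
};

#ifdef CONFIG_PARAVIRT
static struct ap_state ap;

static void startup_ipi_hook(int apicid, unsigned long start_eip,
			     unsigned long start_esp)
{
	/* Poke the target CPU state directly; no real-mode trampoline. */
	memset(&ap, 0, sizeof(ap));
	ap.eip = start_eip;
	ap.esp = start_esp;
	ap.cr3 = 0x1000;	/* placeholder for __pa(swapper_pg_dir) */
	printf("hypervisor: AP %d wakes at %#lx with stack %#lx\n",
	       apicid, ap.eip, ap.esp);
}
#else
/* Native boot: the startup IPI path is untouched. */
#define startup_ipi_hook(apicid, eip, esp)	do { } while (0)
#endif

int main(void)
{
	startup_ipi_hook(1, 0xc0100000UL, 0xc2000000UL);
	/* ...then send the INIT/STARTUP IPIs as usual... */
	return 0;
}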

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/kernel/paravirt.c |    2 ++
 arch/i386/kernel/smpboot.c  |    7 +++++++
 include/asm-i386/paravirt.h |    9 +++++++++
 include/asm-i386/smp.h      |    5 +++++
 4 files changed, 23 insertions(+)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -572,6 +572,8 @@ struct paravirt_ops paravirt_ops = {
 
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 	.iret = native_iret,
+
+	.startup_ipi_hook = (void *)native_nop,
 };
 
 /*
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -834,6 +834,13 @@ wakeup_secondary_cpu(int phys_apicid, un
 		num_starts = 0;
 
 	/*
+	 * Paravirt / VMI wants a startup IPI hook here to set up the
+	 * target processor state.
+	 */
+	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
+		         (unsigned long) stack_start.esp);
+
+	/*
 	 * Run STARTUP IPI loop.
 	 */
 	Dprintk("#startup loops: %d.\n", num_starts);
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -151,6 +151,8 @@ struct paravirt_ops
 	/* These two are jmp to, not actually called. */
 	void (fastcall *irq_enable_sysexit)(void);
 	void (fastcall *iret)(void);
+
+	void (fastcall *startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp);
 };
 
 /* Mark a paravirt probe function. */
@@ -323,6 +325,13 @@ static inline unsigned long apic_read(un
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+				    unsigned long start_esp)
+{
+	return paravirt_ops.startup_ipi_hook(phys_apicid, start_eip, start_esp);
+}
+#endif
 
 #define __flush_tlb() paravirt_ops.flush_tlb_user()
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
Index: linux/include/asm-i386/smp.h
===================================================================
--- linux.orig/include/asm-i386/smp.h
+++ linux/include/asm-i386/smp.h
@@ -52,6 +52,11 @@ extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
 #endif
 
+#ifndef CONFIG_PARAVIRT
+#define startup_ipi_hook(phys_apicid, start_eip, start_esp) 		\
+do { } while (0)
+#endif
+
 /*
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),


* [PATCH 2.6.21 review II] [5/10] i386: VMI backend for paravirt-ops
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

A fairly straightforward implementation of the VMI backend for paravirt-ops.

[Adrian Bunk: some cleanups] 
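
The recurring pattern in the backend below is to ask the ROM for a relocation
record per VMI call and patch the ops table accordingly.  A compressed
user-space model of that pattern (types and names are simplified stand-ins
for the real vmi_relocation_info / para_fill machinery):

#include <stdio.h>

enum reloc_type { RELOC_NONE, RELOC_CALL_REL, RELOC_JUMP_REL, RELOC_NOP };

struct reloc {
	enum reloc_type type;
	void (*eip)(void);	/* entry point supplied by the "ROM" */
};

static void native_cpuid(void) { puts("native cpuid"); }
static void rom_cpuid(void)    { puts("cpuid handled by the ROM"); }

/* Pretend ROM query: says how (and where) a given call is implemented. */
static struct reloc get_reloc(int vmicall)
{
	struct reloc r = { RELOC_CALL_REL, rom_cpuid };
	(void)vmicall;
	return r;
}

static struct {
	void (*cpuid)(void);
} ops = { native_cpuid };

/* para_fill-style helper: install the ROM entry point, else keep native. */
static void fill_cpuid(int vmicall)
{
	struct reloc rel = get_reloc(vmicall);

	if (rel.type != RELOC_NONE)
		ops.cpuid = rel.eip;
}

int main(void)
{
	ops.cpuid();	/* native before activation */
	fill_cpuid(0);	/* what activate_vmi() does per operation */
	ops.cpuid();	/* now a direct pointer into the "ROM" */
	return 0;
}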

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/Kconfig          |    9 
 arch/i386/kernel/Makefile  |    2 
 arch/i386/kernel/head.S    |    2 
 arch/i386/kernel/io_apic.c |    2 
 arch/i386/kernel/setup.c   |    9 
 arch/i386/kernel/smpboot.c |    4 
 arch/i386/kernel/vmi.c     |  904 +++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/mm/pgtable.c     |    2 
 include/asm-i386/timer.h   |    1 
 include/asm-i386/vmi.h     |  262 +++++++++++++
 10 files changed, 1195 insertions(+), 2 deletions(-)

Index: linux/arch/i386/Kconfig
===================================================================
--- linux.orig/arch/i386/Kconfig
+++ linux/arch/i386/Kconfig
@@ -199,6 +199,15 @@ config PARAVIRT
 	  However, when run without a hypervisor the kernel is
 	  theoretically slower.  If in doubt, say N.
 
+config VMI
+	bool "VMI Paravirt-ops support"
+	depends on PARAVIRT
+	default y
+	help
+	  VMI provides a paravirtualized interface to multiple hypervisors,
+	  including VMware ESX server and Xen, by connecting to a ROM module
+	  provided by the hypervisor.
+
 config ACPI_SRAT
 	bool
 	default y
Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -40,6 +40,8 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_prin
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
+obj-$(CONFIG_VMI)		+= vmi.o
+
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 
Index: linux/arch/i386/kernel/head.S
===================================================================
--- linux.orig/arch/i386/kernel/head.S
+++ linux/arch/i386/kernel/head.S
@@ -360,7 +360,7 @@ check_x87:
  * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
  * that CPU's GDT and PDA.
  */
-setup_pda:
+ENTRY(setup_pda)
 	/* get the PDA pointer */
 	movl start_pda, %eax
 
Index: linux/arch/i386/kernel/io_apic.c
===================================================================
--- linux.orig/arch/i386/kernel/io_apic.c
+++ linux/arch/i386/kernel/io_apic.c
@@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from
 static void __init setup_ioapic_ids_from_mpc(void) { }
 #endif
 
-static int no_timer_check __initdata;
+int no_timer_check __initdata;
 
 static int __init notimercheck(char *s)
 {
Index: linux/arch/i386/kernel/setup.c
===================================================================
--- linux.orig/arch/i386/kernel/setup.c
+++ linux/arch/i386/kernel/setup.c
@@ -60,6 +60,7 @@
 #include <asm/io_apic.h>
 #include <asm/ist.h>
 #include <asm/io.h>
+#include <asm/vmi.h>
 #include <setup_arch.h>
 #include <bios_ebda.h>
 
@@ -581,6 +582,14 @@ void __init setup_arch(char **cmdline_p)
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_VMI
+	/*
+	 * Must be after max_low_pfn is determined, and before kernel
+	 * pagetables are setup.
+	 */
+	vmi_init();
+#endif
+
 	/*
 	 * NOTE: before this point _nobody_ is allowed to allocate
 	 * any memory using the bootmem allocator.  Although the
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -62,6 +62,7 @@
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
 #include <smpboot_hooks.h>
+#include <asm/vmi.h>
 
 /* Set if we find a B stepping CPU */
 static int __devinitdata smp_b_stepping;
@@ -544,6 +545,9 @@ static void __cpuinit start_secondary(vo
 	 * booting is too fragile that we want to limit the
 	 * things done here to the most necessary things.
 	 */
+#ifdef CONFIG_VMI
+	vmi_bringup();
+#endif
 	secondary_cpu_init();
 	preempt_disable();
 	smp_callin();
Index: linux/arch/i386/kernel/vmi.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/vmi.c
@@ -0,0 +1,904 @@
+/*
+ * VMI specific paravirt-ops implementation
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to zach@vmware.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/license.h>
+#include <linux/cpu.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <asm/vmi.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+
+/* Convenient for calling VMI functions indirectly in the ROM */
+typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
+typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
+
+#define call_vrom_func(rom,func) \
+   (((VROMFUNC *)(rom->func))())
+
+#define call_vrom_long_func(rom,func,arg) \
+   (((VROMLONGFUNC *)(rom->func)) (arg))
+
+static struct vrom_header *vmi_rom;
+static int license_gplok;
+static int disable_nodelay;
+static int disable_pge;
+static int disable_pse;
+static int disable_sep;
+static int disable_tsc;
+static int disable_mtrr;
+
+/* Cached VMI operations */
+struct {
+	void (*cpuid)(void /* non-c */);
+	void (*_set_ldt)(u32 selector);
+	void (*set_tr)(u32 selector);
+	void (*set_kernel_stack)(u32 selector, u32 esp0);
+	void (*allocate_page)(u32, u32, u32, u32, u32);
+	void (*release_page)(u32, u32);
+	void (*set_pte)(pte_t, pte_t *, unsigned);
+	void (*update_pte)(pte_t *, unsigned);
+	void (*set_linear_mapping)(int, u32, u32, u32);
+	void (*flush_tlb)(int);
+	void (*set_initial_ap_state)(int, int);
+} vmi_ops;
+
+/* XXX move this to alternative.h */
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+
+/*
+ * VMI patching routines.
+ */
+#define MNEM_CALL 0xe8
+#define MNEM_JMP  0xe9
+#define MNEM_RET  0xc3
+
+static char irq_save_disable_callout[] = {
+	MNEM_CALL, 0, 0, 0, 0,
+	MNEM_CALL, 0, 0, 0, 0,
+	MNEM_RET
+};
+#define IRQ_PATCH_INT_MASK 0
+#define IRQ_PATCH_DISABLE  5
+
+static inline void patch_offset(unsigned char *eip, unsigned char *dest)
+{
+        *(unsigned long *)(eip+1) = dest-eip-5;
+}
+
+static unsigned patch_internal(int call, unsigned len, void *insns)
+{
+	u64 reloc;
+	struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	call);
+	switch(rel->type) {
+		case VMI_RELOCATION_CALL_REL:
+			BUG_ON(len < 5);
+			*(char *)insns = MNEM_CALL;
+			patch_offset(insns, rel->eip);
+			return 5;
+
+		case VMI_RELOCATION_JUMP_REL:
+			BUG_ON(len < 5);
+			*(char *)insns = MNEM_JMP;
+			patch_offset(insns, rel->eip);
+			return 5;
+
+		case VMI_RELOCATION_NOP:
+			/* obliterate the whole thing */
+			return 0;
+
+		case VMI_RELOCATION_NONE:
+			/* leave native code in place */
+			break;
+
+		default:
+			BUG();
+	}
+	return len;
+}
+
+/*
+ * Apply patch if appropriate, return length of new instruction
+ * sequence.  The callee does nop padding for us.
+ */
+static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+	switch (type) {
+		case PARAVIRT_IRQ_DISABLE:
+			return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
+		case PARAVIRT_IRQ_ENABLE:
+			return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
+		case PARAVIRT_RESTORE_FLAGS:
+			return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
+		case PARAVIRT_SAVE_FLAGS:
+			return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+        	case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
+			if (len >= 10) {
+				patch_internal(VMI_CALL_GetInterruptMask, len, insns);
+				patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
+				return 10;
+			} else {
+				/*
+				 * You bastards didn't leave enough room to
+				 * patch save_flags_irq_disable inline.  Patch
+				 * to a helper
+				 */
+				BUG_ON(len < 5);
+				*(char *)insns = MNEM_CALL;
+				patch_offset(insns, irq_save_disable_callout);
+				return 5;
+			}
+		case PARAVIRT_INTERRUPT_RETURN:
+			return patch_internal(VMI_CALL_IRET, len, insns);
+		case PARAVIRT_STI_SYSEXIT:
+			return patch_internal(VMI_CALL_SYSEXIT, len, insns);
+		default:
+			break;
+	}
+	return len;
+}
+
+/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
+static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
+                               unsigned int *ecx, unsigned int *edx)
+{
+	int override = 0;
+	if (*eax == 1)
+		override = 1;
+        asm volatile ("call *%6"
+                      : "=a" (*eax),
+                        "=b" (*ebx),
+                        "=c" (*ecx),
+                        "=d" (*edx)
+                      : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
+	if (override) {
+		if (disable_pse)
+			*edx &= ~X86_FEATURE_PSE;
+		if (disable_pge)
+			*edx &= ~X86_FEATURE_PGE;
+		if (disable_sep)
+			*edx &= ~X86_FEATURE_SEP;
+		if (disable_tsc)
+			*edx &= ~X86_FEATURE_TSC;
+		if (disable_mtrr)
+			*edx &= ~X86_FEATURE_MTRR;
+	}
+}
+
+static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
+{
+	if (gdt[nr].a != new->a || gdt[nr].b != new->b)
+		write_gdt_entry(gdt, nr, new->a, new->b);
+}
+
+static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
+	vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
+}
+
+static void vmi_set_ldt(const void *addr, unsigned entries)
+{
+	unsigned cpu = smp_processor_id();
+	u32 low, high;
+
+	pack_descriptor(&low, &high, (unsigned long)addr,
+			entries * sizeof(struct desc_struct) - 1,
+			DESCTYPE_LDT, 0);
+	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
+	vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
+}
+
+static void vmi_set_tr(void)
+{
+	vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
+}
+
+static void vmi_load_esp0(struct tss_struct *tss,
+				   struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+	vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
+}
+
+static void vmi_flush_tlb_user(void)
+{
+	vmi_ops.flush_tlb(VMI_FLUSH_TLB);
+}
+
+static void vmi_flush_tlb_kernel(void)
+{
+	vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
+}
+
+/* Stub to do nothing at all; used for delays and unimplemented calls */
+static void vmi_nop(void)
+{
+}
+
+
+#ifdef CONFIG_DEBUG_PAGE_TYPE
+
+#ifdef CONFIG_X86_PAE
+#define MAX_BOOT_PTS (2048+4+1)
+#else
+#define MAX_BOOT_PTS (1024+1)
+#endif
+
+/*
+ * During boot, mem_map is not yet available in paging_init, so stash
+ * all the boot page allocations here.
+ */
+static struct {
+	u32 pfn;
+	int type;
+} boot_page_allocations[MAX_BOOT_PTS];
+static int num_boot_page_allocations;
+static int boot_allocations_applied;
+
+void vmi_apply_boot_page_allocations(void)
+{
+	int i;
+	BUG_ON(!mem_map);
+	for (i = 0; i < num_boot_page_allocations; i++) {
+		struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
+		page->type = boot_page_allocations[i].type;
+		page->type = boot_page_allocations[i].type &
+				~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+	}
+	boot_allocations_applied = 1;
+}
+
+static void record_page_type(u32 pfn, int type)
+{
+	BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
+	boot_page_allocations[num_boot_page_allocations].pfn = pfn;
+	boot_page_allocations[num_boot_page_allocations].type = type;
+	num_boot_page_allocations++;
+}
+
+static void check_zeroed_page(u32 pfn, int type, struct page *page)
+{
+	u32 *ptr;
+	int i;
+	int limit = PAGE_SIZE / sizeof(int);
+
+	if (page_address(page))
+		ptr = (u32 *)page_address(page);
+	else
+		ptr = (u32 *)__va(pfn << PAGE_SHIFT);
+	/*
+	 * When cloning the root in non-PAE mode, only the userspace
+	 * pdes need to be zeroed.
+	 */
+	if (type & VMI_PAGE_CLONE)
+		limit = USER_PTRS_PER_PGD;
+	for (i = 0; i < limit; i++)
+		BUG_ON(ptr[i]);
+}
+
+/*
+ * We stash the page type into struct page so we can verify the page
+ * types are used properly.
+ */
+static void vmi_set_page_type(u32 pfn, int type)
+{
+	/* PAE can have multiple roots per page - don't track */
+	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+		return;
+
+	if (boot_allocations_applied) {
+		struct page *page = pfn_to_page(pfn);
+		if (type != VMI_PAGE_NORMAL)
+			BUG_ON(page->type);
+		else
+			BUG_ON(page->type == VMI_PAGE_NORMAL);
+		page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+		if (type & VMI_PAGE_ZEROED)
+			check_zeroed_page(pfn, type, page);
+	} else {
+		record_page_type(pfn, type);
+	}
+}
+
+static void vmi_check_page_type(u32 pfn, int type)
+{
+	/* PAE can have multiple roots per page - skip checks */
+	if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
+		return;
+
+	type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
+	if (boot_allocations_applied) {
+		struct page *page = pfn_to_page(pfn);
+		BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
+		BUG_ON(type == VMI_PAGE_NORMAL && page->type);
+		BUG_ON((type & page->type) == 0);
+	}
+}
+#else
+#define vmi_set_page_type(p,t) do { } while (0)
+#define vmi_check_page_type(p,t) do { } while (0)
+#endif
+
+static void vmi_allocate_pt(u32 pfn)
+{
+	vmi_set_page_type(pfn, VMI_PAGE_L1);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
+}
+
+static void vmi_allocate_pd(u32 pfn)
+{
+ 	/*
+	 * This call comes in very early, before mem_map is setup.
+	 * It is called only for swapper_pg_dir, which already has
+	 * data on it.
+	 */
+ 	vmi_set_page_type(pfn, VMI_PAGE_L2);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
+}
+
+static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+{
+ 	vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
+	vmi_check_page_type(clonepfn, VMI_PAGE_L2);
+	vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
+}
+
+static void vmi_release_pt(u32 pfn)
+{
+	vmi_ops.release_page(pfn, VMI_PAGE_L1);
+	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+static void vmi_release_pd(u32 pfn)
+{
+	vmi_ops.release_page(pfn, VMI_PAGE_L2);
+	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
+}
+
+/*
+ * Helper macros for MMU update flags.  We can defer updates until a flush
+ * or page invalidation only if the update is to the current address space
+ * (otherwise, there is no flush).  We must check against init_mm, since
+ * this could be a kernel update, which usually passes init_mm, although
+ * sometimes this check can be skipped if we know the particular function
+ * is only called on user mode PTEs.  We could change the kernel to pass
+ * current->active_mm here, but in particular, I was unsure if changing
+ * mm/highmem.c to do this would still be correct on other architectures.
+ */
+#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
+                                       (!mustbeuser && (mm) == &init_mm))
+#define vmi_flags_addr(mm, addr, level, user)                           \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+#define vmi_flags_addr_defer(mm, addr, level, user)                     \
+        ((level) | (is_current_as(mm, user) ?                           \
+                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
+
+static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pte(pte_t *ptep, pte_t pte)
+{
+	/* XXX because of set_pmd_pte, this can be called on PT or PD layers */
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
+	vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+#ifdef CONFIG_X86_PAE
+	const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
+	vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
+#else
+	const pte_t pte = { pmdval.pud.pgd.pgd };
+	vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+#endif
+	vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
+}
+
+#ifdef CONFIG_X86_PAE
+
+static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+	/*
+	 * XXX This is called from set_pmd_pte, but at both PT
+	 * and PD layers so the VMI_PAGE_PT flag is wrong.  But
+	 * it is only called for large page mapping changes,
+	 * the Xen backend, doesn't support large pages, and the
+	 * ESX backend doesn't depend on the flag.
+	 */
+	set_64bit((unsigned long long *)ptep,pte_val(pteval));
+	vmi_ops.update_pte(ptep, VMI_PAGE_PT);
+}
+
+static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
+}
+
+static void vmi_set_pud(pud_t *pudp, pud_t pudval)
+{
+	/* Um, eww */
+	const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
+	vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
+	vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
+}
+
+static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	const pte_t pte = { 0 };
+	vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
+	vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
+}
+
+void vmi_pmd_clear(pmd_t *pmd)
+{
+	const pte_t pte = { 0 };
+	vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
+	vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
+}
+#endif
+
+#ifdef CONFIG_SMP
+struct vmi_ap_state ap;
+extern void setup_pda(void);
+
+static void __init /* XXX cpu hotplug */
+vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
+		     unsigned long start_esp)
+{
+	/* Default everything to zero.  This is fine for most GPRs. */
+	memset(&ap, 0, sizeof(struct vmi_ap_state));
+
+	ap.gdtr_limit = GDT_SIZE - 1;
+	ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
+
+	ap.idtr_limit = IDT_ENTRIES * 8 - 1;
+	ap.idtr_base = (unsigned long) idt_table;
+
+	ap.ldtr = 0;
+
+	ap.cs = __KERNEL_CS;
+	ap.eip = (unsigned long) start_eip;
+	ap.ss = __KERNEL_DS;
+	ap.esp = (unsigned long) start_esp;
+
+	ap.ds = __USER_DS;
+	ap.es = __USER_DS;
+	ap.fs = __KERNEL_PDA;
+	ap.gs = 0;
+
+	ap.eflags = 0;
+
+	setup_pda();
+
+#ifdef CONFIG_X86_PAE
+	/* efer should match BSP efer. */
+	if (cpu_has_nx) {
+		unsigned l, h;
+		rdmsr(MSR_EFER, l, h);
+		ap.efer = (unsigned long long) h << 32 | l;
+	}
+#endif
+
+	ap.cr3 = __pa(swapper_pg_dir);
+	/* Protected mode, paging, AM, WP, NE, MP. */
+	ap.cr0 = 0x80050023;
+	ap.cr4 = mmu_cr4_features;
+	vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
+}
+#endif
+
+static inline int __init check_vmi_rom(struct vrom_header *rom)
+{
+	struct pci_header *pci;
+	struct pnp_header *pnp;
+	const char *manufacturer = "UNKNOWN";
+	const char *product = "UNKNOWN";
+	const char *license = "unspecified";
+
+	if (rom->rom_signature != 0xaa55)
+		return 0;
+	if (rom->vrom_signature != VMI_SIGNATURE)
+		return 0;
+	if (rom->api_version_maj != VMI_API_REV_MAJOR ||
+	    rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
+		printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
+				rom->api_version_maj,
+				rom->api_version_min);
+		return 0;
+	}
+
+	/*
+	 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
+	 * the PCI header and device type to make sure this is really a
+	 * VMI device.
+	 */
+	if (!rom->pci_header_offs) {
+		printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
+		return 0;
+	}
+
+	pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
+	if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
+	    pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
+		/* Allow it to run... anyways, but warn */
+		printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
+	}
+
+	if (rom->pnp_header_offs) {
+		pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
+		if (pnp->manufacturer_offset)
+			manufacturer = (const char *)rom+pnp->manufacturer_offset;
+		if (pnp->product_offset)
+			product = (const char *)rom+pnp->product_offset;
+	}
+
+	if (rom->license_offs)
+		license = (char *)rom+rom->license_offs;
+
+	printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
+		manufacturer, product,
+		rom->api_version_maj, rom->api_version_min,
+		pci->rom_version_maj, pci->rom_version_min);
+
+        license_gplok = license_is_gpl_compatible(license);
+        if (!license_gplok) {
+                printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
+		       "inlining disabled\n",
+                       license);
+                add_taint(TAINT_PROPRIETARY_MODULE);
+        }
+	return 1;
+}
+
+/*
+ * Probe for the VMI option ROM
+ */
+static inline int __init probe_vmi_rom(void)
+{
+	unsigned long base;
+
+	/* VMI ROM is in option ROM area, check signature */
+	for (base = 0xC0000; base < 0xE0000; base += 2048) {
+		struct vrom_header *romstart;
+		romstart = (struct vrom_header *)isa_bus_to_virt(base);
+		if (check_vmi_rom(romstart)) {
+			vmi_rom = romstart;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * VMI setup common to all processors
+ */
+void vmi_bringup(void)
+{
+ 	/* We must establish the lowmem mapping for MMU ops to work */
+	if (vmi_rom)
+		vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
+}
+
+/*
+ * Return a pointer to the VMI function or a NOP stub
+ */
+static void *vmi_get_function(int vmicall)
+{
+	u64 reloc;
+	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	vmicall);
+	BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
+	if (rel->type == VMI_RELOCATION_CALL_REL)
+		return (void *)rel->eip;
+	else
+		return (void *)vmi_nop;
+}
+
+/*
+ * Helper macro for making the VMI paravirt-ops fill code readable.
+ * For unimplemented operations, fall back to default.
+ */
+#define para_fill(opname, vmicall)				\
+do {								\
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,		\
+				    VMI_CALL_##vmicall);	\
+	if (rel->type != VMI_RELOCATION_NONE) {			\
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);	\
+		paravirt_ops.opname = (void *)rel->eip;		\
+	}							\
+} while (0)
+
+/*
+ * Activate the VMI interface and switch into paravirtualized mode
+ */
+static inline int __init activate_vmi(void)
+{
+	short kernel_cs;
+	u64 reloc;
+	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
+
+	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
+		printk(KERN_ERR "VMI ROM failed to initialize!");
+		return 0;
+	}
+	savesegment(cs, kernel_cs);
+
+	paravirt_ops.paravirt_enabled = 1;
+	paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
+
+	paravirt_ops.patch = vmi_patch;
+	paravirt_ops.name = "vmi";
+
+	/*
+	 * Many of these operations are ABI compatible with VMI.
+	 * This means we can fill in the paravirt-ops with direct
+	 * pointers into the VMI ROM.  If the calling convention for
+	 * these operations changes, this code needs to be updated.
+	 *
+	 * Exceptions
+	 *  CPUID paravirt-op uses pointers, not the native ISA
+	 *  halt has no VMI equivalent; all VMI halts are "safe"
+	 *  no MSR support yet - just trap and emulate.  VMI uses the
+	 *    same ABI as the native ISA, but Linux wants exceptions
+	 *    from bogus MSR read / write handled
+	 *  rdpmc is not yet used in Linux
+	 */
+
+	/* CPUID is special, so very special */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_CPUID);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.cpuid = (void *)rel->eip;
+		paravirt_ops.cpuid = vmi_cpuid;
+	}
+
+	para_fill(clts, CLTS);
+	para_fill(get_debugreg, GetDR);
+	para_fill(set_debugreg, SetDR);
+	para_fill(read_cr0, GetCR0);
+	para_fill(read_cr2, GetCR2);
+	para_fill(read_cr3, GetCR3);
+	para_fill(read_cr4, GetCR4);
+	para_fill(write_cr0, SetCR0);
+	para_fill(write_cr2, SetCR2);
+	para_fill(write_cr3, SetCR3);
+	para_fill(write_cr4, SetCR4);
+	para_fill(save_fl, GetInterruptMask);
+	para_fill(restore_fl, SetInterruptMask);
+	para_fill(irq_disable, DisableInterrupts);
+	para_fill(irq_enable, EnableInterrupts);
+	/* irq_save_disable !!! sheer pain */
+	patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
+		     (char *)paravirt_ops.save_fl);
+	patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
+		     (char *)paravirt_ops.irq_disable);
+	para_fill(safe_halt, Halt);
+	para_fill(wbinvd, WBINVD);
+	/* paravirt_ops.read_msr = vmi_rdmsr */
+	/* paravirt_ops.write_msr = vmi_wrmsr */
+	para_fill(read_tsc, RDTSC);
+	/* paravirt_ops.rdpmc = vmi_rdpmc */
+
+	/* TR interface doesn't pass TR value */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_SetTR);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.set_tr = (void *)rel->eip;
+		paravirt_ops.load_tr_desc = vmi_set_tr;
+	}
+
+	/* LDT is special, too */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,	VMI_CALL_SetLDT);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops._set_ldt = (void *)rel->eip;
+		paravirt_ops.set_ldt = vmi_set_ldt;
+	}
+
+	para_fill(load_gdt, SetGDT);
+	para_fill(load_idt, SetIDT);
+	para_fill(store_gdt, GetGDT);
+	para_fill(store_idt, GetIDT);
+	para_fill(store_tr, GetTR);
+	paravirt_ops.load_tls = vmi_load_tls;
+	para_fill(write_ldt_entry, WriteLDTEntry);
+	para_fill(write_gdt_entry, WriteGDTEntry);
+	para_fill(write_idt_entry, WriteIDTEntry);
+	reloc = call_vrom_long_func(vmi_rom, get_reloc,
+				    VMI_CALL_UpdateKernelStack);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
+		vmi_ops.set_kernel_stack = (void *)rel->eip;
+		paravirt_ops.load_esp0 = vmi_load_esp0;
+	}
+
+	para_fill(set_iopl_mask, SetIOPLMask);
+	paravirt_ops.io_delay = (void *)vmi_nop;
+	if (!disable_nodelay) {
+		paravirt_ops.const_udelay = (void *)vmi_nop;
+	}
+
+	para_fill(set_lazy_mode, SetLazyMode);
+
+	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		vmi_ops.flush_tlb = (void *)rel->eip;
+		paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
+		paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
+	}
+	para_fill(flush_tlb_single, InvalPage);
+
+	/*
+	 * Until a standard flag format can be agreed on, we need to
+	 * implement these as wrappers in Linux.  Get the VMI ROM
+	 * function pointers for the two backend calls.
+	 */
+#ifdef CONFIG_X86_PAE
+	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
+	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
+#else
+	vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
+	vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
+#endif
+	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
+	vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
+	vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
+
+	paravirt_ops.alloc_pt = vmi_allocate_pt;
+	paravirt_ops.alloc_pd = vmi_allocate_pd;
+	paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+	paravirt_ops.release_pt = vmi_release_pt;
+	paravirt_ops.release_pd = vmi_release_pd;
+	paravirt_ops.set_pte = vmi_set_pte;
+	paravirt_ops.set_pte_at = vmi_set_pte_at;
+	paravirt_ops.set_pmd = vmi_set_pmd;
+	paravirt_ops.pte_update = vmi_update_pte;
+	paravirt_ops.pte_update_defer = vmi_update_pte_defer;
+#ifdef CONFIG_X86_PAE
+	paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
+	paravirt_ops.set_pte_present = vmi_set_pte_present;
+	paravirt_ops.set_pud = vmi_set_pud;
+	paravirt_ops.pte_clear = vmi_pte_clear;
+	paravirt_ops.pmd_clear = vmi_pmd_clear;
+#endif
+	/*
+	 * These MUST always be patched.  Don't support indirect jumps
+	 * through these operations, as the VMI interface may use either
+	 * a jump or a call to get to these operations, depending on
+	 * the backend.  They are performance critical anyway, so requiring
+	 * a patch is not a big problem.
+	 */
+	paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
+	paravirt_ops.iret = (void *)0xbadbab0;
+
+#ifdef CONFIG_SMP
+	paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
+	vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
+	paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
+	paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
+#endif
+
+	/*
+	 * Alternative instruction rewriting doesn't happen soon enough
+	 * to convert VMI_IRET to a call instead of a jump; so we have
+	 * to do this before IRQs get reenabled.  Fortunately, it is
+	 * idempotent.
+	 */
+	apply_paravirt(__start_parainstructions, __stop_parainstructions);
+
+	vmi_bringup();
+
+	return 1;
+}
+
+#undef para_fill
+
+void __init vmi_init(void)
+{
+	unsigned long flags;
+
+	if (!vmi_rom)
+		probe_vmi_rom();
+	else
+		check_vmi_rom(vmi_rom);
+
+	/* In case probing for or validating the ROM failed, bail */
+	if (!vmi_rom)
+		return;
+
+	reserve_top_address(-vmi_rom->virtual_top);
+
+	local_irq_save(flags);
+	activate_vmi();
+#ifdef CONFIG_SMP
+	no_timer_check = 1;
+#endif
+	local_irq_restore(flags & X86_EFLAGS_IF);
+}
+
+static int __init parse_vmi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (!strcmp(arg, "disable_nodelay"))
+		disable_nodelay = 1;
+	else if (!strcmp(arg, "disable_pge")) {
+		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+		disable_pge = 1;
+	} else if (!strcmp(arg, "disable_pse")) {
+		clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+		disable_pse = 1;
+	} else if (!strcmp(arg, "disable_sep")) {
+		clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
+		disable_sep = 1;
+	} else if (!strcmp(arg, "disable_tsc")) {
+		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+		disable_tsc = 1;
+	} else if (!strcmp(arg, "disable_mtrr")) {
+		clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
+		disable_mtrr = 1;
+	}
+	return 0;
+}
+
+early_param("vmi", parse_vmi);
Index: linux/arch/i386/mm/pgtable.c
===================================================================
--- linux.orig/arch/i386/mm/pgtable.c
+++ linux/arch/i386/mm/pgtable.c
@@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses 
 void reserve_top_address(unsigned long reserve)
 {
 	BUG_ON(fixmaps > 0);
+	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
+	       (int)-reserve);
 #ifdef CONFIG_COMPAT_VDSO
 	BUG_ON(reserve != 0);
 #else
Index: linux/include/asm-i386/timer.h
===================================================================
--- linux.orig/include/asm-i386/timer.h
+++ linux/include/asm-i386/timer.h
@@ -8,6 +8,7 @@ void setup_pit_timer(void);
 /* Modifiers for buggy PIT handling */
 extern int pit_latch_buggy;
 extern int timer_ack;
+extern int no_timer_check;
 extern int recalibrate_cpu_khz(void);
 
 #endif
Index: linux/include/asm-i386/vmi.h
===================================================================
--- /dev/null
+++ linux/include/asm-i386/vmi.h
@@ -0,0 +1,262 @@
+/*
+ * VMI interface definition
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Maintained by: Zachary Amsden zach@vmware.com
+ *
+ */
+#include <linux/types.h>
+
+/*
+ *---------------------------------------------------------------------
+ *
+ *  VMI Option ROM API
+ *
+ *---------------------------------------------------------------------
+ */
+#define VMI_SIGNATURE 0x696d5663   /* "cVmi" */
+
+#define PCI_VENDOR_ID_VMWARE            0x15AD
+#define PCI_DEVICE_ID_VMWARE_VMI        0x0801
+
+/*
+ * We use two version numbers for compatibility, with the major
+ * number signifying interface breakages, and the minor number
+ * interface extensions.
+ */
+#define VMI_API_REV_MAJOR       3
+#define VMI_API_REV_MINOR       0
+
+#define VMI_CALL_CPUID			0
+#define VMI_CALL_WRMSR			1
+#define VMI_CALL_RDMSR			2
+#define VMI_CALL_SetGDT			3
+#define VMI_CALL_SetLDT			4
+#define VMI_CALL_SetIDT			5
+#define VMI_CALL_SetTR			6
+#define VMI_CALL_GetGDT			7
+#define VMI_CALL_GetLDT			8
+#define VMI_CALL_GetIDT			9
+#define VMI_CALL_GetTR			10
+#define VMI_CALL_WriteGDTEntry		11
+#define VMI_CALL_WriteLDTEntry		12
+#define VMI_CALL_WriteIDTEntry		13
+#define VMI_CALL_UpdateKernelStack	14
+#define VMI_CALL_SetCR0			15
+#define VMI_CALL_SetCR2			16
+#define VMI_CALL_SetCR3			17
+#define VMI_CALL_SetCR4			18
+#define VMI_CALL_GetCR0			19
+#define VMI_CALL_GetCR2			20
+#define VMI_CALL_GetCR3			21
+#define VMI_CALL_GetCR4			22
+#define VMI_CALL_WBINVD			23
+#define VMI_CALL_SetDR			24
+#define VMI_CALL_GetDR			25
+#define VMI_CALL_RDPMC			26
+#define VMI_CALL_RDTSC			27
+#define VMI_CALL_CLTS			28
+#define VMI_CALL_EnableInterrupts	29
+#define VMI_CALL_DisableInterrupts	30
+#define VMI_CALL_GetInterruptMask	31
+#define VMI_CALL_SetInterruptMask	32
+#define VMI_CALL_IRET			33
+#define VMI_CALL_SYSEXIT		34
+#define VMI_CALL_Halt			35
+#define VMI_CALL_Reboot			36
+#define VMI_CALL_Shutdown		37
+#define VMI_CALL_SetPxE			38
+#define VMI_CALL_SetPxELong		39
+#define VMI_CALL_UpdatePxE		40
+#define VMI_CALL_UpdatePxELong		41
+#define VMI_CALL_MachineToPhysical	42
+#define VMI_CALL_PhysicalToMachine	43
+#define VMI_CALL_AllocatePage		44
+#define VMI_CALL_ReleasePage		45
+#define VMI_CALL_InvalPage		46
+#define VMI_CALL_FlushTLB		47
+#define VMI_CALL_SetLinearMapping	48
+
+#define VMI_CALL_SetIOPLMask		61
+#define VMI_CALL_SetInitialAPState	62
+#define VMI_CALL_APICWrite		63
+#define VMI_CALL_APICRead		64
+#define VMI_CALL_SetLazyMode		73
+
+/*
+ *---------------------------------------------------------------------
+ *
+ * MMU operation flags
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* Flags used by VMI_{Allocate|Release}Page call */
+#define VMI_PAGE_PAE             0x10  /* Allocate PAE shadow */
+#define VMI_PAGE_CLONE           0x20  /* Clone from another shadow */
+#define VMI_PAGE_ZEROED          0x40  /* Page is pre-zeroed */
+
+
+/* Flags shared by Allocate|Release Page and PTE updates */
+#define VMI_PAGE_PT              0x01
+#define VMI_PAGE_PD              0x02
+#define VMI_PAGE_PDP             0x04
+#define VMI_PAGE_PML4            0x08
+
+#define VMI_PAGE_NORMAL          0x00 /* for debugging */
+
+/* Flags used by PTE updates */
+#define VMI_PAGE_CURRENT_AS      0x10 /* implies VMI_PAGE_VA_MASK is valid */
+#define VMI_PAGE_DEFER           0x20 /* may queue update until TLB inval */
+#define VMI_PAGE_VA_MASK         0xfffff000
+
+#ifdef CONFIG_X86_PAE
+#define VMI_PAGE_L1		(VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2		(VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
+#else
+#define VMI_PAGE_L1		(VMI_PAGE_PT | VMI_PAGE_ZEROED)
+#define VMI_PAGE_L2		(VMI_PAGE_PD | VMI_PAGE_ZEROED)
+#endif
+
+/* Flags used by VMI_FlushTLB call */
+#define VMI_FLUSH_TLB            0x01
+#define VMI_FLUSH_GLOBAL         0x02
+
+/*
+ *---------------------------------------------------------------------
+ *
+ *  VMI relocation definitions for ROM call get_reloc
+ *
+ *---------------------------------------------------------------------
+ */
+
+/* VMI Relocation types */
+#define VMI_RELOCATION_NONE     0
+#define VMI_RELOCATION_CALL_REL 1
+#define VMI_RELOCATION_JUMP_REL 2
+#define VMI_RELOCATION_NOP	3
+
+#ifndef __ASSEMBLY__
+struct vmi_relocation_info {
+        unsigned char           *eip;
+        unsigned char           type;
+        unsigned char           reserved[3];
+};
+#endif
+
+
+/*
+ *---------------------------------------------------------------------
+ *
+ *  Generic ROM structures and definitions
+ *
+ *---------------------------------------------------------------------
+ */
+
+#ifndef __ASSEMBLY__
+
+struct vrom_header {
+	u16     rom_signature;  // option ROM signature
+	u8      rom_length;     // ROM length in 512 byte chunks
+	u8      rom_entry[4];   // 16-bit code entry point
+	u8      rom_pad0;       // 4-byte align pad
+	u32     vrom_signature; // VROM identification signature
+	u8      api_version_min;// Minor version of API
+	u8      api_version_maj;// Major version of API
+	u8      jump_slots;     // Number of jump slots
+	u8      reserved1;      // Reserved for expansion
+	u32     virtual_top;    // Hypervisor virtual address start
+	u16     reserved2;      // Reserved for expansion
+	u16	license_offs;	// Offset to License string
+	u16     pci_header_offs;// Offset to PCI OPROM header
+	u16     pnp_header_offs;// Offset to PnP OPROM header
+	u32     rom_pad3;       // PnP reserved / VMI reserved
+	u8      reserved[96];   // Reserved for headers
+	char    vmi_init[8];    // VMI_Init jump point
+	char    get_reloc[8];   // VMI_GetRelocationInfo jump point
+} __attribute__((packed));
+
+struct pnp_header {
+        char sig[4];
+        char rev;
+        char size;
+        short next;
+        short res;
+        long devID;
+        unsigned short manufacturer_offset;
+        unsigned short product_offset;
+} __attribute__((packed));
+
+struct pci_header {
+        char sig[4];
+        short vendorID;
+        short deviceID;
+        short vpdData;
+        short size;
+        char rev;
+        char class;
+        char subclass;
+        char interface;
+        short chunks;
+        char rom_version_min;
+        char rom_version_maj;
+        char codetype;
+        char lastRom;
+        short reserved;
+} __attribute__((packed));
+
+/* Function prototypes for bootstrapping */
+extern void vmi_init(void);
+extern void vmi_bringup(void);
+extern void vmi_apply_boot_page_allocations(void);
+
+/* State needed to start an application processor in an SMP system. */
+struct vmi_ap_state {
+	u32 cr0;
+	u32 cr2;
+	u32 cr3;
+	u32 cr4;
+
+	u64 efer;
+
+	u32 eip;
+	u32 eflags;
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+	u32 edx;
+	u32 esp;
+	u32 ebp;
+	u32 esi;
+	u32 edi;
+	u16 cs;
+	u16 ss;
+	u16 ds;
+	u16 es;
+	u16 fs;
+	u16 gs;
+	u16 ldtr;
+
+	u16 gdtr_limit;
+	u32 gdtr_base;
+	u32 idtr_base;
+	u16 idtr_limit;
+};
+
+#endif


* [PATCH 2.6.21 review II] [6/10] i386: vMI timer patches
  2007-02-10 11:52 [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend Andi Kleen
                   ` (3 preceding siblings ...)
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [5/10] i386: vMI backend for paravirt-ops Andi Kleen
@ 2007-02-10 11:52 ` Andi Kleen
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [7/10] VMI: Profile pc badness Andi Kleen
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Andi Kleen, Jeremy Fitzhardinge, Rusty Russell,
	Chris Wright, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>

VMI timer code.  It works by taking over the local APIC clock when the APIC is
configured, which requires a couple of hooks into the APIC code.  The backend
timer code could be commonized into the timer infrastructure, but some pieces
are still missing (stolen time, in particular), and the exact semantics of
when to do accounting for NO_IDLE_HZ would need to be shared between different
hypervisors as well.  So for now, the VMI timer is a separate module.
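
As a rough sketch of the stolen-time bookkeeping this adds (illustration
only, not the patch itself; see vmi_account_stolen_cycles() in vmitime.c
below -- real cycles are taken to be available cycles plus cycles stolen
by the hypervisor):

#include <linux/types.h>

/* Sketch: per-cpu in the real code; whole jiffies worth of stolen cycles
 * are peeled off here and later fed to account_steal_time(). */
static u64 stolen_cycles_accounted;

static unsigned long account_stolen_sketch(u64 real, u64 avail,
					   u64 cycles_per_jiffy)
{
	u64 not_accounted = real - avail - stolen_cycles_accounted;
	unsigned long stolen_jiffies = 0;

	while (not_accounted >= cycles_per_jiffy) {
		stolen_jiffies++;
		not_accounted -= cycles_per_jiffy;
		stolen_cycles_accounted += cycles_per_jiffy;
	}
	return stolen_jiffies;
}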

[Adrian Bunk: cleanups]

Subject: VMI timer patches
Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/i386/Kconfig           |    9 
 arch/i386/kernel/Makefile   |    2 
 arch/i386/kernel/apic.c     |    2 
 arch/i386/kernel/entry.S    |    5 
 arch/i386/kernel/paravirt.c |    2 
 arch/i386/kernel/smpboot.c  |    4 
 arch/i386/kernel/time.c     |    4 
 arch/i386/kernel/tsc.c      |    4 
 arch/i386/kernel/vmi.c      |   45 ++++
 arch/i386/kernel/vmitime.c  |  495 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/apic.h     |    2 
 include/asm-i386/paravirt.h |   12 +
 include/asm-i386/time.h     |    1 
 include/asm-i386/timer.h    |    2 
 include/asm-i386/vmi_time.h |  103 +++++++++
 15 files changed, 687 insertions(+), 5 deletions(-)

Index: linux/arch/i386/Kconfig
===================================================================
--- linux.orig/arch/i386/Kconfig
+++ linux/arch/i386/Kconfig
@@ -1268,3 +1268,12 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	bool
 	default y
+
+config NO_IDLE_HZ
+	bool
+	depends on PARAVIRT
+	default y
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps a hypervisor detect that the Linux system is idle,
+	  reducing the overhead of idle systems.
Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_prin
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-obj-$(CONFIG_VMI)		+= vmi.o
+obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
Index: linux/arch/i386/kernel/apic.c
===================================================================
--- linux.orig/arch/i386/kernel/apic.c
+++ linux/arch/i386/kernel/apic.c
@@ -1395,7 +1395,7 @@ int __init APIC_init_uniprocessor (void)
 		if (!skip_ioapic_setup && nr_ioapics)
 			setup_IO_APIC();
 #endif
-	setup_boot_APIC_clock();
+	setup_boot_clock();
 
 	return 0;
 }
Index: linux/arch/i386/kernel/entry.S
===================================================================
--- linux.orig/arch/i386/kernel/entry.S
+++ linux/arch/i386/kernel/entry.S
@@ -626,6 +626,11 @@ ENTRY(name)				\
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
+/* This alternate entry is needed because we hijack the apic LVTT */
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
+BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
+#endif
+
 KPROBE_ENTRY(page_fault)
 	RING0_EC_FRAME
 	pushl $do_page_fault
Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -544,6 +544,8 @@ struct paravirt_ops paravirt_ops = {
 	.apic_write = native_apic_write,
 	.apic_write_atomic = native_apic_write_atomic,
 	.apic_read = native_apic_read,
+	.setup_boot_clock = setup_boot_APIC_clock,
+	.setup_secondary_clock = setup_secondary_APIC_clock,
 #endif
 	.set_lazy_mode = (void *)native_nop,
 
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -553,7 +553,7 @@ static void __cpuinit start_secondary(vo
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
-	setup_secondary_APIC_clock();
+	setup_secondary_clock();
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
 		enable_NMI_through_LVT0(NULL);
@@ -1330,7 +1330,7 @@ static void __init smp_boot_cpus(unsigne
 
 	smpboot_setup_io_apic();
 
-	setup_boot_APIC_clock();
+	setup_boot_clock();
 
 	/*
 	 * Synchronize the TSC with the AP
Index: linux/arch/i386/kernel/time.c
===================================================================
--- linux.orig/arch/i386/kernel/time.c
+++ linux/arch/i386/kernel/time.c
@@ -232,6 +232,7 @@ EXPORT_SYMBOL(get_cmos_time);
 static void sync_cmos_clock(unsigned long dummy);
 
 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+int no_sync_cmos_clock;
 
 static void sync_cmos_clock(unsigned long dummy)
 {
@@ -275,7 +276,8 @@ static void sync_cmos_clock(unsigned lon
 
 void notify_arch_cmos_timer(void)
 {
-	mod_timer(&sync_cmos_timer, jiffies + 1);
+	if (!no_sync_cmos_clock)
+		mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
 static long clock_cmos_diff;
Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/tsc.c
+++ linux/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
  * an extra value to store the TSC freq
  */
 unsigned int tsc_khz;
+unsigned long long (*custom_sched_clock)(void);
 
 int tsc_disable;
 
@@ -107,6 +108,9 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long this_offset;
 
+	if (unlikely(custom_sched_clock))
+		return (*custom_sched_clock)();
+
 	/*
 	 * in the NUMA case we dont use the TSC as they are not
 	 * synchronized across all CPUs.
Index: linux/arch/i386/kernel/vmi.c
===================================================================
--- linux.orig/arch/i386/kernel/vmi.c
+++ linux/arch/i386/kernel/vmi.c
@@ -34,6 +34,7 @@
 #include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/vmi_time.h>
 
 /* Convenient for calling VMI functions indirectly in the ROM */
 typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -67,6 +68,7 @@ struct {
 	void (*set_linear_mapping)(int, u32, u32, u32);
 	void (*flush_tlb)(int);
 	void (*set_initial_ap_state)(int, int);
+	void (*halt)(void);
 } vmi_ops;
 
 /* XXX move this to alternative.h */
@@ -252,6 +254,19 @@ static void vmi_nop(void)
 {
 }
 
+/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
+#ifdef CONFIG_NO_IDLE_HZ
+static fastcall void vmi_safe_halt(void)
+{
+	int idle = vmi_stop_hz_timer();
+	vmi_ops.halt();
+	if (idle) {
+		local_irq_disable();
+		vmi_account_time_restart_hz_timer();
+		local_irq_enable();
+	}
+}
+#endif
 
 #ifdef CONFIG_DEBUG_PAGE_TYPE
 
@@ -727,7 +742,12 @@ static inline int __init activate_vmi(vo
 		     (char *)paravirt_ops.save_fl);
 	patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
 		     (char *)paravirt_ops.irq_disable);
+#ifndef CONFIG_NO_IDLE_HZ
 	para_fill(safe_halt, Halt);
+#else
+	vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
+	paravirt_ops.safe_halt = vmi_safe_halt;
+#endif
 	para_fill(wbinvd, WBINVD);
 	/* paravirt_ops.read_msr = vmi_rdmsr */
 	/* paravirt_ops.write_msr = vmi_wrmsr */
@@ -838,6 +858,31 @@ static inline int __init activate_vmi(vo
 #endif
 
 	/*
+	 * Check for VMI timer functionality by probing for a cycle frequency method
+	 */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+		vmi_timer_ops.get_cycle_counter =
+			vmi_get_function(VMI_CALL_GetCycleCounter);
+		vmi_timer_ops.get_wallclock =
+			vmi_get_function(VMI_CALL_GetWallclockTime);
+		vmi_timer_ops.wallclock_updated =
+			vmi_get_function(VMI_CALL_WallclockUpdated);
+		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+		vmi_timer_ops.cancel_alarm =
+			 vmi_get_function(VMI_CALL_CancelAlarm);
+		paravirt_ops.time_init = vmi_time_init;
+		paravirt_ops.get_wallclock = vmi_get_wallclock;
+		paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+		paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
+		paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+#endif
+		custom_sched_clock = vmi_sched_clock;
+	}
+
+	/*
 	 * Alternative instruction rewriting doesn't happen soon enough
 	 * to convert VMI_IRET to a call instead of a jump; so we have
 	 * to do this before IRQs get reenabled.  Fortunately, it is
Index: linux/arch/i386/kernel/vmitime.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/vmitime.c
@@ -0,0 +1,495 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+/*
+ * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
+ * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/rcupdate.h>
+#include <linux/clocksource.h>
+
+#include <asm/timer.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/div64.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+
+#include <mach_timer.h>
+#include <io_ports.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
+#else
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
+#endif
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* /proc/sys/kernel/hz_timer state. */
+int sysctl_hz_timer;
+
+/* Some stats */
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
+static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */
+static int alarm_hz = CONFIG_VMI_ALARM_HZ;
+
+/* Cache of the value get_cycle_frequency / HZ. */
+static signed long long cycles_per_jiffy;
+
+/* Cache of the value get_cycle_frequency / alarm_hz. */
+static signed long long cycles_per_alarm;
+
+/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
+ * Protected by xtime_lock. */
+static unsigned long long real_cycles_accounted_system;
+
+/* The number of cycles accounted for by update_process_times(), per cpu. */
+static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
+
+/* The number of stolen cycles accounted, per cpu. */
+static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
+
+/* Clock source. */
+static cycle_t read_real_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static cycle_t read_available_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+#if 0
+static cycle_t read_stolen_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
+}
+#endif  /*  0  */
+
+static struct clocksource clocksource_vmi = {
+	.name			= "vmi-timer",
+	.rating			= 450,
+	.read			= read_real_cycles,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.is_continuous		= 1,
+};
+
+
+/* Timer interrupt handler. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
+
+static struct irqaction vmi_timer_irq  = {
+	vmi_timer_interrupt,
+	SA_INTERRUPT,
+	CPU_MASK_NONE,
+	"VMI-alarm",
+	NULL,
+	NULL
+};
+
+/* Alarm rate */
+static int __init vmi_timer_alarm_rate_setup(char* str)
+{
+	int alarm_rate;
+	if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
+		alarm_hz = alarm_rate;
+		printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
+	}
+	return 1;
+}
+__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
+
+
+/* Initialization */
+static void vmi_get_wallclock_ts(struct timespec *ts)
+{
+	unsigned long long wallclock;
+	wallclock = vmi_timer_ops.get_wallclock(); // nsec units
+	ts->tv_nsec = do_div(wallclock, 1000000000);
+	ts->tv_sec = wallclock;
+}
+
+static void update_xtime_from_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	do_settimeofday(&ts);
+}
+
+unsigned long vmi_get_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	return ts.tv_sec;
+}
+
+int vmi_set_wallclock(unsigned long now)
+{
+	return -1;
+}
+
+unsigned long long vmi_sched_clock(void)
+{
+	return read_available_cycles();
+}
+
+void __init vmi_time_init(void)
+{
+	unsigned long long cycles_per_sec, cycles_per_msec;
+
+	setup_irq(0, &vmi_timer_irq);
+#ifdef CONFIG_X86_LOCAL_APIC
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
+#endif
+
+	no_sync_cmos_clock = 1;
+
+	vmi_get_wallclock_ts(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+		-xtime.tv_sec, -xtime.tv_nsec);
+
+	real_cycles_accounted_system = read_real_cycles();
+	update_xtime_from_wallclock();
+	per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
+
+	cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
+
+	cycles_per_jiffy = cycles_per_sec;
+	(void)do_div(cycles_per_jiffy, HZ);
+	cycles_per_alarm = cycles_per_sec;
+	(void)do_div(cycles_per_alarm, alarm_hz);
+	cycles_per_msec = cycles_per_sec;
+	(void)do_div(cycles_per_msec, 1000);
+	cpu_khz = cycles_per_msec;
+
+	printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ; "
+	       "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
+	       cycles_per_alarm);
+
+	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+						    clocksource_vmi.shift);
+	if (clocksource_register(&clocksource_vmi))
+		printk(KERN_WARNING "Error registering VMITIME clocksource.");
+
+	/* Disable PIT. */
+	outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+	/* Schedule the alarm.  Do this in phase with process_times_cycles_accounted_cpu
+	 * to reduce the latency of calling update_process_times(). */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+void __init vmi_timer_setup_boot_alarm(void)
+{
+	local_irq_disable();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	/* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
+	vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+	local_irq_enable();
+}
+
+/* Initialize the time accounting variables for an AP on an SMP system.
+ * Also, set the local alarm for the AP. */
+void __init vmi_timer_setup_secondary_alarm(void)
+{
+	int cpu = smp_processor_id();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
+
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+}
+
+#endif
+
+/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
+static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
+{
+	long long cycles_not_accounted;
+
+	write_seqlock(&xtime_lock);
+
+	cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* system-wide jiffies and wallclock. */
+		do_timer(1);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		real_cycles_accounted_system += cycles_per_jiffy;
+	}
+
+	if (vmi_timer_ops.wallclock_updated())
+		update_xtime_from_wallclock();
+
+	write_sequnlock(&xtime_lock);
+}
+
+/* Update per-cpu process times. */
+static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
+					     unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* Account time to the current process.  This includes
+		 * calling into the scheduler to decrement the timeslice
+		 * and possibly reschedule.*/
+		update_process_times(user_mode(regs));
+		/* XXX handle /proc/profile multiplier.  */
+		profile_tick(CPU_PROFILING);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Update per-cpu idle times.  Used when a no-hz halt is ended. */
+static void vmi_account_no_hz_idle_cycles(int cpu,
+					  unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	unsigned long no_idle_hz_jiffies = 0;
+
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		no_idle_hz_jiffies++;
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* Account time to the idle process. */
+	account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
+}
+#endif
+
+/* Update per-cpu stolen time. */
+static void vmi_account_stolen_cycles(int cpu,
+				      unsigned long long cur_real_cycles,
+				      unsigned long long cur_avail_cycles)
+{
+	long long stolen_cycles_not_accounted;
+	unsigned long stolen_jiffies = 0;
+
+	if (cur_real_cycles < cur_avail_cycles)
+		return;
+
+	stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
+		per_cpu(stolen_cycles_accounted_cpu, cpu);
+
+	while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
+		stolen_jiffies++;
+		stolen_cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* HACK: pass NULL to force time onto cpustat->steal. */
+	account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
+}
+
+/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
+ * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
+static void vmi_local_timer_interrupt(int cpu)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu process times. */
+	vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
+	/* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Must be called only from idle loop, with interrupts disabled. */
+int vmi_stop_hz_timer(void)
+{
+	/* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
+
+	unsigned long seq, next;
+	unsigned long long real_cycles_expiry;
+	int cpu = smp_processor_id();
+	int idle;
+
+	BUG_ON(!irqs_disabled());
+	if (sysctl_hz_timer != 0)
+		return 0;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	smp_mb();
+	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+	    (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		next = jiffies;
+		idle = 0;
+	} else
+		idle = 1;
+
+	/* Convert jiffies to the real cycle counter. */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		real_cycles_expiry = real_cycles_accounted_system +
+			(long)(next - jiffies) * cycles_per_jiffy;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* This cpu is going idle. Disable the periodic alarm. */
+	if (idle) {
+		vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+		per_cpu(idle_start_jiffies, cpu) = jiffies;
+	}
+
+	/* Set the real time alarm to expire at the next event. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
+		      real_cycles_expiry, 0);
+
+	return idle;
+}
+
+static void vmi_reenable_hz_timer(int cpu)
+{
+	/* For /proc/vmi/info idle_hz stat. */
+	per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
+	per_cpu(vmi_idle_no_hz_irqs, cpu)++;
+
+	/* Don't bother explicitly cancelling the one-shot alarm -- at
+	 * worst we will receive a spurious timer interrupt. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+	/* Indicate this cpu is no longer nohz idle. */
+	cpu_clear(cpu, nohz_cpu_mask);
+}
+
+/* Called from interrupt handlers when (local) HZ timer is disabled. */
+void vmi_account_time_restart_hz_timer(void)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+	int cpu = smp_processor_id();
+
+	BUG_ON(!irqs_disabled());
+	/* Account the time during which the HZ timer was disabled. */
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu idle times. */
+	vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
+	/* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+	/* Reenable the hz timer. */
+	vmi_reenable_hz_timer(cpu);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
+ * Handler for IRQ0.  Not used on SMP, or with X86_LOCAL_APIC once the
+ * APIC is set up and vmi_timer_setup_boot_alarm() has been called.  */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+	vmi_local_timer_interrupt(smp_processor_id());
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
+ * Also used in UP when CONFIG_X86_LOCAL_APIC.
+ * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
+void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	int cpu = smp_processor_id();
+
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't, timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	irq_enter();
+	vmi_local_timer_interrupt(cpu);
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+
+#endif  /* CONFIG_X86_LOCAL_APIC */
Index: linux/include/asm-i386/apic.h
===================================================================
--- linux.orig/include/asm-i386/apic.h
+++ linux/include/asm-i386/apic.h
@@ -43,6 +43,8 @@ extern void generic_apic_probe(void);
 #define apic_write native_apic_write
 #define apic_write_atomic native_apic_write_atomic
 #define apic_read native_apic_read
+#define setup_boot_clock setup_boot_APIC_clock
+#define setup_secondary_clock setup_secondary_APIC_clock
 #endif
 
 static __inline fastcall void native_apic_write(unsigned long reg,
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -121,6 +121,8 @@ struct paravirt_ops
 	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
 	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
 	unsigned long (fastcall *apic_read)(unsigned long reg);
+	void (*setup_boot_clock)(void);
+	void (*setup_secondary_clock)(void);
 #endif
 
 	void (fastcall *flush_tlb_user)(void);
@@ -323,6 +325,16 @@ static inline unsigned long apic_read(un
 {
 	return paravirt_ops.apic_read(reg);
 }
+
+static inline void setup_boot_clock(void)
+{
+	paravirt_ops.setup_boot_clock();
+}
+
+static inline void setup_secondary_clock(void)
+{
+	paravirt_ops.setup_secondary_clock();
+}
 #endif
 
 #ifdef CONFIG_SMP
Index: linux/include/asm-i386/time.h
===================================================================
--- linux.orig/include/asm-i386/time.h
+++ linux/include/asm-i386/time.h
@@ -30,6 +30,7 @@ static inline int native_set_wallclock(u
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
+extern unsigned long long native_sched_clock(void);
 #else /* !CONFIG_PARAVIRT */
 
 #define get_wallclock() native_get_wallclock()
Index: linux/include/asm-i386/timer.h
===================================================================
--- linux.orig/include/asm-i386/timer.h
+++ linux/include/asm-i386/timer.h
@@ -9,6 +9,8 @@ void setup_pit_timer(void);
 extern int pit_latch_buggy;
 extern int timer_ack;
 extern int no_timer_check;
+extern unsigned long long (*custom_sched_clock)(void);
+extern int no_sync_cmos_clock;
 extern int recalibrate_cpu_khz(void);
 
 #endif
Index: linux/include/asm-i386/vmi_time.h
===================================================================
--- /dev/null
+++ linux/include/asm-i386/vmi_time.h
@@ -0,0 +1,103 @@
+/*
+ * VMI Time wrappers
+ *
+ * Copyright (C) 2006, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+#ifndef __VMI_TIME_H
+#define __VMI_TIME_H
+
+/*
+ * Raw VMI call indices for timer functions
+ */
+#define VMI_CALL_GetCycleFrequency	66
+#define VMI_CALL_GetCycleCounter	67
+#define VMI_CALL_SetAlarm		68
+#define VMI_CALL_CancelAlarm		69
+#define VMI_CALL_GetWallclockTime	70
+#define VMI_CALL_WallclockUpdated	71
+
+/* Cached VMI timer operations */
+extern struct vmi_timer_ops {
+	u64 (*get_cycle_frequency)(void);
+	u64 (*get_cycle_counter)(int);
+	u64 (*get_wallclock)(void);
+	int (*wallclock_updated)(void);
+	void (*set_alarm)(u32 flags, u64 expiry, u64 period);
+	void (*cancel_alarm)(u32 flags);
+} vmi_timer_ops;
+
+/* Prototypes */
+extern void __init vmi_time_init(void);
+extern unsigned long vmi_get_wallclock(void);
+extern int vmi_set_wallclock(unsigned long now);
+extern unsigned long long vmi_sched_clock(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void __init vmi_timer_setup_boot_alarm(void);
+extern void __init vmi_timer_setup_secondary_alarm(void);
+extern void apic_vmi_timer_interrupt(void);
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+extern int vmi_stop_hz_timer(void);
+extern void vmi_account_time_restart_hz_timer(void);
+#endif
+
+/*
+ * When run under a hypervisor, a vcpu is always in one of three states:
+ * running, halted, or ready.  The vcpu is in the 'running' state if it
+ * is executing.  When the vcpu executes the halt interface, the vcpu
+ * enters the 'halted' state and remains halted until there is some work
+ * pending for the vcpu (e.g. an alarm expires, host I/O completes on
+ * behalf of virtual I/O).  At this point, the vcpu enters the 'ready'
+ * state (waiting for the hypervisor to reschedule it).  Finally, at any
+ * time when the vcpu is neither in the 'running' state nor in the
+ * 'halted' state, it is in the 'ready' state.
+ *
+ * Real time advances while the vcpu is 'running', 'ready', or
+ * 'halted'.  Stolen time is the time in which the vcpu is in the
+ * 'ready' state.  Available time is the remaining time -- the vcpu is
+ * either 'running' or 'halted'.
+ *
+ * All three views of time are accessible through the VMI cycle
+ * counters.
+ */
+
+/* The cycle counters. */
+#define VMI_CYCLES_REAL         0
+#define VMI_CYCLES_AVAILABLE    1
+#define VMI_CYCLES_STOLEN       2
+
+/* The alarm interface 'flags' bits */
+#define VMI_ALARM_COUNTERS      2
+
+#define VMI_ALARM_COUNTER_MASK  0x000000ff
+
+#define VMI_ALARM_WIRED_IRQ0    0x00000000
+#define VMI_ALARM_WIRED_LVTT    0x00010000
+
+#define VMI_ALARM_IS_ONESHOT    0x00000000
+#define VMI_ALARM_IS_PERIODIC   0x00000100
+
+#define CONFIG_VMI_ALARM_HZ	100
+
+#endif


* [PATCH 2.6.21 review II] [7/10] VMI: Profile pc badness
  2007-02-10 11:52 [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend Andi Kleen
                   ` (4 preceding siblings ...)
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [6/10] i386: vMI timer patches Andi Kleen
@ 2007-02-10 11:52 ` Andi Kleen
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [8/10] VMI: Kprobe rpl fix Andi Kleen
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>
Profile_pc was broken when using paravirtualization because the
assumption that the kernel runs at CPL 0 was violated, causing
the stack-walking logic to read a random value off the stack.

The only way to be in kernel lock functions is to be in kernel
code, so validate that assumption explicitly by checking the CS
value.  We don't want to be fooled by BIOS / APM segments and
try to read those stacks, so only match KERNEL_CS.

I moved some stuff in segment.h to make it prettier.
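
For reference, a worked example of the mask in the segment.h hunk below
(assuming the conventional i386 GDT layout where GDT_ENTRY_KERNEL_CS is 12,
so __KERNEL_CS is 0x60 and __USER_CS is 0x73; treat the concrete values as
illustrative):

/* Masking off the two RPL bits accepts a kernel CS at RPL 0 (bare metal)
 * and at the non-zero RPL a paravirt guest runs at, while still rejecting
 * user, PnP, BIOS and APM code segments. */
#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)

/*   0x60 & 0xfc == 0x60  -> kernel CS, RPL 0: match
 *   0x61 & 0xfc == 0x60  -> kernel CS, RPL 1 (VMI guest): match
 *   0x73 & 0xfc == 0x70  -> __USER_CS: no match
 */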

Signed-off-by: Zachary Amsden <zach@vmware.com>

diff -r 69d0339b9997 arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c	Fri Feb 02 15:55:46 2007 -0800
+++ b/arch/i386/kernel/time.c	Fri Feb 02 16:15:45 2007 -0800
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs 
 	unsigned long pc = instruction_pointer(regs);
 
 #ifdef CONFIG_SMP
-	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
+	    in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->ebp + 4);
 #else
-		unsigned long *sp;
-		if ((regs->xcs & 3) == 0)
-			sp = (unsigned long *)&regs->esp;
-		else
-			sp = (unsigned long *)regs->esp;
+		unsigned long *sp = (unsigned long *)&regs->esp;
+
 		/* Return address is either directly at stack pointer
 		   or above a saved eflags. Eflags has bits 22-31 zero,
 		   kernel addresses don't. */
diff -r 69d0339b9997 include/asm-i386/ptrace.h
--- a/include/asm-i386/ptrace.h	Fri Feb 02 15:55:46 2007 -0800
+++ b/include/asm-i386/ptrace.h	Fri Feb 02 16:12:37 2007 -0800
@@ -49,6 +49,10 @@ static inline int user_mode_vm(struct pt
 {
 	return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL;
 }
+static inline int v8086_mode(struct pt_regs *regs)
+{
+	return (regs->eflags & VM_MASK);
+}
 
 #define instruction_pointer(regs) ((regs)->eip)
 #define regs_return_value(regs) ((regs)->eax)
diff -r 69d0339b9997 include/asm-i386/segment.h
--- a/include/asm-i386/segment.h	Fri Feb 02 15:55:46 2007 -0800
+++ b/include/asm-i386/segment.h	Fri Feb 02 16:08:50 2007 -0800
@@ -83,13 +83,7 @@
  * The GDT has 32 entries
  */
 #define GDT_ENTRIES 32
-
 #define GDT_SIZE (GDT_ENTRIES * 8)
-
-/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
-#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
 
 /* Simple and small GDT entries for booting only */
 
@@ -134,4 +128,17 @@
 #ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
+/*
+ * Matching rules for certain types of segments.
+ */
+
+/* Matches only __KERNEL_CS, ignoring PnP / USER / APM segments */
+#define SEGMENT_IS_KERNEL_CODE(x) (((x) & 0xfc) == GDT_ENTRY_KERNEL_CS * 8)
+
+/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
+#define SEGMENT_IS_FLAT_CODE(x)  (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
+
+/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
+#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
+
 #endif



* [PATCH 2.6.21 review II] [8/10] VMI: Kprobe rpl fix
  2007-02-10 11:52 [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend Andi Kleen
                   ` (5 preceding siblings ...)
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [7/10] VMI: Profile pc badness Andi Kleen
@ 2007-02-10 11:52 ` Andi Kleen
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [9/10] VMI: Vmi timer race Andi Kleen
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [10/10] VMI: Paravirt debug defaults off Andi Kleen
  8 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, Eric Biederman, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>
Kprobes bugfix for paravirt compatibility: the RPL of the CS value used when
inserting breakpoints must match the RPL the running kernel actually uses.
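
For context, a minimal sketch using the get_kernel_rpl() seen in the
segment.h hunk of the previous patch (the helper name below is hypothetical,
not part of this diff):

#include <asm/segment.h>	/* __KERNEL_CS, get_kernel_rpl() */

/* Sketch: the CS value kretprobes should put back into the fixed-up frame.
 * Bare metal: get_kernel_rpl() == 0, so this is plain __KERNEL_CS.
 * VMI guest:  the kernel runs at ring 1, so the RPL bits must say so,
 * otherwise the restored frame claims a privilege level the guest kernel
 * does not actually have. */
static inline unsigned long kprobe_kernel_cs(void)
{
	return __KERNEL_CS | get_kernel_rpl();
}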

Signed-off-by: Zachary Amsden <zach@vmware.com>
CC: Eric Biederman <ebiederm@xmission.com>

diff -r fad1c2108c13 arch/i386/kernel/kprobes.c
--- a/arch/i386/kernel/kprobes.c	Fri Feb 02 16:22:37 2007 -0800
+++ b/arch/i386/kernel/kprobes.c	Fri Feb 02 16:28:48 2007 -0800
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_hand
 	spin_lock_irqsave(&kretprobe_lock, flags);
 	head = kretprobe_inst_table_head(current);
 	/* fixup registers */
-	regs->xcs = __KERNEL_CS;
+	regs->xcs = __KERNEL_CS | get_kernel_rpl();
 	regs->eip = trampoline_address;
 	regs->orig_eax = 0xffffffff;
 



* [PATCH 2.6.21 review II] [9/10] VMI: Vmi timer race
  2007-02-10 11:52 [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend Andi Kleen
                   ` (6 preceding siblings ...)
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [8/10] VMI: Kprobe rpl fix Andi Kleen
@ 2007-02-10 11:52 ` Andi Kleen
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [10/10] VMI: Paravirt debug defaults off Andi Kleen
  8 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>
Because the timer code moves around, and we might eventually move our init to a
late_time_init hook, save and restore IRQs around this code; it is
definitely not interrupt safe.
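
The guard being added is the usual save/restore pattern (sketch only,
mirroring the hunk below); local_irq_save() is used rather than
local_irq_disable()/local_irq_enable() so the caller's IRQ state is preserved
if this init ever runs with interrupts already enabled:

void __init vmi_time_init(void)
{
	unsigned long flags;

	local_irq_save(flags);	/* remember and mask the caller's IRQ state */

	/* ... setup_irq(), set_intr_gate(), clocksource and alarm
	 *     programming, exactly as in vmitime.c ... */

	local_irq_restore(flags);	/* put it back, whatever it was */
}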

Signed-off-by: Zachary Amsden <zach@vmware.com>

diff -r dd4d4324a5b3 arch/i386/kernel/vmitime.c
--- a/arch/i386/kernel/vmitime.c	Thu Feb 01 23:53:06 2007 -0800
+++ b/arch/i386/kernel/vmitime.c	Fri Feb 02 00:03:05 2007 -0800
@@ -178,7 +178,9 @@ void __init vmi_time_init(void)
 void __init vmi_time_init(void)
 {
 	unsigned long long cycles_per_sec, cycles_per_msec;
-
+	unsigned long flags;
+
+	local_irq_save(flags);
 	setup_irq(0, &vmi_timer_irq);
 #ifdef CONFIG_X86_LOCAL_APIC
 	set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
@@ -222,6 +224,8 @@ void __init vmi_time_init(void)
 		      VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
 		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
 		      cycles_per_alarm);
+
+	local_irq_restore(flags);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC



* [PATCH 2.6.21 review II] [10/10] VMI: Paravirt debug defaults off
  2007-02-10 11:52 [PATCH 2.6.21 review II] [1/10] i386: page allocation hooks for VMI backend Andi Kleen
                   ` (7 preceding siblings ...)
  2007-02-10 11:52 ` [PATCH 2.6.21 review II] [9/10] VMI: Vmi timer race Andi Kleen
@ 2007-02-10 11:52 ` Andi Kleen
  8 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2007-02-10 11:52 UTC (permalink / raw)
  To: Zachary Amsden, patches, linux-kernel


From: Zachary Amsden <zach@vmware.com>
Deliberate register clobbering around performance-critical inline code is great
for testing, but bad to leave on by default.  Many people ship with DEBUG_KERNEL
turned on, so stop making DEBUG_PARAVIRT default to on.
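
A hypothetical illustration of the kind of clobbering meant here (not the
kernel's actual DEBUG_PARAVIRT machinery): if the call convention allows a
patched call site to clobber certain registers, trashing them deliberately in
the debug build makes any caller that wrongly assumes they survive fail
quickly under testing.

static inline void paravirt_debug_clobber(void)
{
#ifdef CONFIG_DEBUG_PARAVIRT
	/* Deliberately trash registers the interface allows to be clobbered. */
	asm volatile("movl $0xdeadbeef, %%ecx; movl $0xdeadbeef, %%edx"
		     ::: "ecx", "edx");
#endif
}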

Signed-off-by: Zachary Amsden <zach@vmware.com>

diff -r 3a8033f42ecf arch/i386/Kconfig.debug
--- a/arch/i386/Kconfig.debug	Fri Feb 02 17:33:35 2007 -0800
+++ b/arch/i386/Kconfig.debug	Fri Feb 02 17:48:45 2007 -0800
@@ -89,7 +89,7 @@ config DOUBLEFAULT
 
 config DEBUG_PARAVIRT
 	bool "Enable some paravirtualization debugging"
-	default y
+	default n
 	depends on PARAVIRT && DEBUG_KERNEL
 	help
 	  Currently deliberately clobbers regs which are allowed to be


