* [PATCH 0/13] Hypervisor-mode KVM on POWER7
@ 2011-05-11 10:34 Paul Mackerras
  2011-05-11 10:36 ` [PATCH 01/13] kvm/powerpc: Move fields between struct kvm_vcpu_arch and kvmppc_vcpu_book3s Paul Mackerras
                   ` (13 more replies)
  0 siblings, 14 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:34 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

The following series of patches enables KVM to exploit the hardware
hypervisor mode on 64-bit Power ISA Book3S machines.  At present only
POWER7 is supported, but it would be easy to add support for other
processors.

Running the KVM host in hypervisor mode means that the guest can use
both supervisor mode and user mode.  That means that the guest can
execute supervisor-privileged instructions and access supervisor-
privileged registers.  In addition, the hardware directs most exceptions
to the guest.  Thus we don't need to emulate any instructions in the
host.  Generally, the only times we need to exit the guest are when it
does a hypercall or when an external interrupt or host timer
(decrementer) interrupt occurs.
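
Schematically (a conceptual sketch only, not code from this series;
exit_nr uses the existing BOOK3S_INTERRUPT_* codes, and all other
exceptions are delivered directly to the guest by the hardware so they
never reach this point), the host-side exit handling reduces to:

	/* conceptual only -- the real dispatch is added later in the series */
	switch (exit_nr) {
	case BOOK3S_INTERRUPT_SYSCALL:		/* guest hypercall */
		/* handle the PAPR hcall in the host, then re-enter */
		break;
	case BOOK3S_INTERRUPT_EXTERNAL:		/* host external interrupt */
	case BOOK3S_INTERRUPT_DECREMENTER:	/* host timer */
		/* let the host service the interrupt, then re-enter */
		break;
	}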

The focus of this KVM implementation is to run guests that use the
PAPR (Power Architecture Platform Requirements) paravirtualization
interface, which is the interface supplied by PowerVM on IBM pSeries
machines.

These patches are against Ben Herrenschmidt's next branch in his tree
at git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git.


* [PATCH 01/13] kvm/powerpc: Move fields between struct kvm_vcpu_arch and kvmppc_vcpu_book3s
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
@ 2011-05-11 10:36 ` Paul Mackerras
  2011-05-11 10:38 ` [PATCH 02/13] kvm/powerpc: Fix kvmppc_core_pending_dec Paul Mackerras
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:36 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

This moves the slb field, which represents the state of the emulated
SLB, from the kvmppc_vcpu_book3s struct to the kvm_vcpu_arch, and the
hpte_hash_[v]pte[_long] fields from kvm_vcpu_arch to kvmppc_vcpu_book3s.
This is in accord with the principle that the kvm_vcpu_arch struct
represents the state of the emulated CPU, and the kvmppc_vcpu_book3s
struct holds the auxiliary data structures used in the emulation.
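
As a quick illustration (not part of the patch; it only assumes a vcpu
pointer in scope and uses the field names from the hunks below), code
that needs both kinds of state now reads along these lines:

	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
	struct kvmppc_slb *slbe;

	/* architected guest state lives in kvm_vcpu_arch ... */
	slbe = &vcpu->arch.slb[0];

	/* ... while the host-side lookup caches stay in kvmppc_vcpu_book3s */
	spin_lock(&vcpu3s->mmu_lock);
	/* walk or update vcpu3s->hpte_hash_pte[] / hpte_hash_vpte[] here */
	spin_unlock(&vcpu3s->mmu_lock);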

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |   34 +++++++++-------
 arch/powerpc/include/asm/kvm_host.h   |   34 +++++++---------
 arch/powerpc/kvm/book3s.c             |    7 ++-
 arch/powerpc/kvm/book3s_64_mmu.c      |   54 +++++++++++-------------
 arch/powerpc/kvm/book3s_mmu_hpte.c    |   71 +++++++++++++++++++-------------
 arch/powerpc/kvm/trace.h              |    2 +-
 6 files changed, 106 insertions(+), 96 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index d62e703..cfb2012 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -24,20 +24,6 @@
 #include <linux/kvm_host.h>
 #include <asm/kvm_book3s_asm.h>
 
-struct kvmppc_slb {
-	u64 esid;
-	u64 vsid;
-	u64 orige;
-	u64 origv;
-	bool valid	: 1;
-	bool Ks		: 1;
-	bool Kp		: 1;
-	bool nx		: 1;
-	bool large	: 1;	/* PTEs are 16MB */
-	bool tb		: 1;	/* 1TB segment */
-	bool class	: 1;
-};
-
 struct kvmppc_bat {
 	u64 raw;
 	u32 bepi;
@@ -67,11 +53,22 @@ struct kvmppc_sid_map {
 #define VSID_POOL_SIZE	(SID_CONTEXTS * 16)
 #endif
 
+struct hpte_cache {
+	struct hlist_node list_pte;
+	struct hlist_node list_pte_long;
+	struct hlist_node list_vpte;
+	struct hlist_node list_vpte_long;
+	struct rcu_head rcu_head;
+	u64 host_va;
+	u64 pfn;
+	ulong slot;
+	struct kvmppc_pte pte;
+};
+
 struct kvmppc_vcpu_book3s {
 	struct kvm_vcpu vcpu;
 	struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
 	struct kvmppc_sid_map sid_map[SID_MAP_NUM];
-	struct kvmppc_slb slb[64];
 	struct {
 		u64 esid;
 		u64 vsid;
@@ -94,6 +91,13 @@ struct kvmppc_vcpu_book3s {
 #endif
 	int context_id[SID_CONTEXTS];
 	ulong prog_flags; /* flags to inject when giving a 700 trap */
+
+	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
+	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+	int hpte_cache_count;
+	spinlock_t mmu_lock;
 };
 
 #define CONTEXT_HOST		0
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bba3b9b..3ebe51b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -163,16 +163,18 @@ struct kvmppc_mmu {
 	bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
 };
 
-struct hpte_cache {
-	struct hlist_node list_pte;
-	struct hlist_node list_pte_long;
-	struct hlist_node list_vpte;
-	struct hlist_node list_vpte_long;
-	struct rcu_head rcu_head;
-	u64 host_va;
-	u64 pfn;
-	ulong slot;
-	struct kvmppc_pte pte;
+struct kvmppc_slb {
+	u64 esid;
+	u64 vsid;
+	u64 orige;
+	u64 origv;
+	bool valid	: 1;
+	bool Ks		: 1;
+	bool Kp		: 1;
+	bool nx		: 1;
+	bool large	: 1;	/* PTEs are 16MB */
+	bool tb		: 1;	/* 1TB segment */
+	bool class	: 1;
 };
 
 struct kvm_vcpu_arch {
@@ -187,6 +189,9 @@ struct kvm_vcpu_arch {
 	ulong highmem_handler;
 	ulong rmcall;
 	ulong host_paca_phys;
+	struct kvmppc_slb slb[64];
+	int slb_max;		/* # valid entries in slb[] */
+	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
 
@@ -293,15 +298,6 @@ struct kvm_vcpu_arch {
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
-
-#ifdef CONFIG_PPC_BOOK3S
-	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
-	struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
-	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
-	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
-	int hpte_cache_count;
-	spinlock_t mmu_lock;
-#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index c961de4..3501961 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,7 +17,6 @@
 #include <linux/kvm_host.h>
 #include <linux/err.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -33,6 +32,8 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "trace.h"
+
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
@@ -1190,8 +1191,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
 		for (i = 0; i < 64; i++) {
-			sregs->u.s.ppc64.slb[i].slbe = vcpu3s->slb[i].orige | i;
-			sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
 		}
 	} else {
 		for (i = 0; i < 16; i++)
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index d7889ef..c6d3e19 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -41,36 +41,36 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
-				struct kvmppc_vcpu_book3s *vcpu_book3s,
+				struct kvm_vcpu *vcpu,
 				gva_t eaddr)
 {
 	int i;
 	u64 esid = GET_ESID(eaddr);
 	u64 esid_1t = GET_ESID_1T(eaddr);
 
-	for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
 		u64 cmp_esid = esid;
 
-		if (!vcpu_book3s->slb[i].valid)
+		if (!vcpu->arch.slb[i].valid)
 			continue;
 
-		if (vcpu_book3s->slb[i].tb)
+		if (vcpu->arch.slb[i].tb)
 			cmp_esid = esid_1t;
 
-		if (vcpu_book3s->slb[i].esid == cmp_esid)
-			return &vcpu_book3s->slb[i];
+		if (vcpu->arch.slb[i].esid == cmp_esid)
+			return &vcpu->arch.slb[i];
 	}
 
 	dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
 		eaddr, esid, esid_1t);
-	for (i = 0; i < vcpu_book3s->slb_nr; i++) {
-	    if (vcpu_book3s->slb[i].vsid)
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+	    if (vcpu->arch.slb[i].vsid)
 		dprintk("  %d: %c%c%c %llx %llx\n", i,
-			vcpu_book3s->slb[i].valid ? 'v' : ' ',
-			vcpu_book3s->slb[i].large ? 'l' : ' ',
-			vcpu_book3s->slb[i].tb    ? 't' : ' ',
-			vcpu_book3s->slb[i].esid,
-			vcpu_book3s->slb[i].vsid);
+			vcpu->arch.slb[i].valid ? 'v' : ' ',
+			vcpu->arch.slb[i].large ? 'l' : ' ',
+			vcpu->arch.slb[i].tb    ? 't' : ' ',
+			vcpu->arch.slb[i].esid,
+			vcpu->arch.slb[i].vsid);
 	}
 
 	return NULL;
@@ -81,7 +81,7 @@ static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
 {
 	struct kvmppc_slb *slb;
 
-	slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+	slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
 	if (!slb)
 		return 0;
 
@@ -180,7 +180,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return 0;
 	}
 
-	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
 	if (!slbe)
 		goto no_seg_found;
 
@@ -320,10 +320,10 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 	esid_1t = GET_ESID_1T(rb);
 	slb_nr = rb & 0xfff;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
 	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
@@ -344,38 +344,35 @@ static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
 
 static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return 0;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	return slbe->orige;
 }
 
 static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
-	if (slb_nr > vcpu_book3s->slb_nr)
+	if (slb_nr > vcpu->arch.slb_nr)
 		return 0;
 
-	slbe = &vcpu_book3s->slb[slb_nr];
+	slbe = &vcpu->arch.slb[slb_nr];
 
 	return slbe->origv;
 }
 
 static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	struct kvmppc_slb *slbe;
 
 	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
 
-	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 
 	if (!slbe)
 		return;
@@ -389,13 +386,12 @@ static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
 
 static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	int i;
 
 	dprintk("KVM MMU: slbia()\n");
 
-	for (i = 1; i < vcpu_book3s->slb_nr; i++)
-		vcpu_book3s->slb[i].valid = false;
+	for (i = 1; i < vcpu->arch.slb_nr; i++)
+		vcpu->arch.slb[i].valid = false;
 
 	if (vcpu->arch.shared->msr & MSR_IR) {
 		kvmppc_mmu_flush_segments(vcpu);
@@ -464,7 +460,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 	ulong mp_ea = vcpu->arch.magic_page_ea;
 
 	if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-		slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
 		if (slb)
 			gvsid = slb->vsid;
 	}
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 79751d8..41cb001 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,7 +21,6 @@
 #include <linux/kvm_host.h>
 #include <linux/hash.h>
 #include <linux/slab.h>
-#include "trace.h"
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -29,6 +28,8 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 
+#include "trace.h"
+
 #define PTE_SIZE	12
 
 static struct kmem_cache *hpte_cache;
@@ -58,30 +59,31 @@ static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
 void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	u64 index;
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 
 	trace_kvm_book3s_mmu_map(pte);
 
-	spin_lock(&vcpu->arch.mmu_lock);
+	spin_lock(&vcpu3s->mmu_lock);
 
 	/* Add to ePTE list */
 	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
-	hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
 
 	/* Add to ePTE_long list */
 	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
 	hlist_add_head_rcu(&pte->list_pte_long,
-			   &vcpu->arch.hpte_hash_pte_long[index]);
+			   &vcpu3s->hpte_hash_pte_long[index]);
 
 	/* Add to vPTE list */
 	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
-	hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
 
 	/* Add to vPTE_long list */
 	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
 	hlist_add_head_rcu(&pte->list_vpte_long,
-			   &vcpu->arch.hpte_hash_vpte_long[index]);
+			   &vcpu3s->hpte_hash_vpte_long[index]);
 
-	spin_unlock(&vcpu->arch.mmu_lock);
+	spin_unlock(&vcpu3s->mmu_lock);
 }
 
 static void free_pte_rcu(struct rcu_head *head)
@@ -92,16 +94,18 @@ static void free_pte_rcu(struct rcu_head *head)
 
 static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
 	trace_kvm_book3s_mmu_invalidate(pte);
 
 	/* Different for 32 and 64 bit */
 	kvmppc_mmu_invalidate_pte(vcpu, pte);
 
-	spin_lock(&vcpu->arch.mmu_lock);
+	spin_lock(&vcpu3s->mmu_lock);
 
 	/* pte already invalidated in between? */
 	if (hlist_unhashed(&pte->list_pte)) {
-		spin_unlock(&vcpu->arch.mmu_lock);
+		spin_unlock(&vcpu3s->mmu_lock);
 		return;
 	}
 
@@ -115,14 +119,15 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 	else
 		kvm_release_pfn_clean(pte->pfn);
 
-	spin_unlock(&vcpu->arch.mmu_lock);
+	spin_unlock(&vcpu3s->mmu_lock);
 
-	vcpu->arch.hpte_cache_count--;
+	vcpu3s->hpte_cache_count--;
 	call_rcu(&pte->rcu_head, free_pte_rcu);
 }
 
 static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hpte_cache *pte;
 	struct hlist_node *node;
 	int i;
@@ -130,7 +135,7 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 	rcu_read_lock();
 
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
 		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			invalidate_pte(vcpu, pte);
@@ -141,12 +146,13 @@ static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
 
 static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
-	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+	list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
 
 	rcu_read_lock();
 
@@ -160,12 +166,13 @@ static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
 
 static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 
 	/* Find the list of entries in the map */
-	list = &vcpu->arch.hpte_hash_pte_long[
+	list = &vcpu3s->hpte_hash_pte_long[
 			kvmppc_mmu_hash_pte_long(guest_ea)];
 
 	rcu_read_lock();
@@ -203,12 +210,13 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
 /* Flush with mask 0xfffffffff */
 static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xfffffffffULL;
 
-	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+	list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
 
 	rcu_read_lock();
 
@@ -223,12 +231,13 @@ static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
 /* Flush with mask 0xffffff000 */
 static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_head *list;
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	u64 vp_mask = 0xffffff000ULL;
 
-	list = &vcpu->arch.hpte_hash_vpte_long[
+	list = &vcpu3s->hpte_hash_vpte_long[
 		kvmppc_mmu_hash_vpte_long(guest_vp)];
 
 	rcu_read_lock();
@@ -261,6 +270,7 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
 
 void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hlist_node *node;
 	struct hpte_cache *pte;
 	int i;
@@ -270,7 +280,7 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 	rcu_read_lock();
 
 	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
-		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
 
 		hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
 			if ((pte->pte.raddr >= pa_start) &&
@@ -283,12 +293,13 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 
 struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct hpte_cache *pte;
 
 	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
-	vcpu->arch.hpte_cache_count++;
+	vcpu3s->hpte_cache_count++;
 
-	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
 		kvmppc_mmu_pte_flush_all(vcpu);
 
 	return pte;
@@ -309,17 +320,19 @@ static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
 
 int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
 	/* init hpte lookup hashes */
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
-	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
-				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
-
-	spin_lock_init(&vcpu->arch.mmu_lock);
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+
+	spin_lock_init(&vcpu3s->mmu_lock);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 3aca1b0..d62a14b 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -252,7 +252,7 @@ TRACE_EVENT(kvm_book3s_mmu_flush,
 	),
 
 	TP_fast_assign(
-		__entry->count		= vcpu->arch.hpte_cache_count;
+		__entry->count		= to_book3s(vcpu)->hpte_cache_count;
 		__entry->p1		= p1;
 		__entry->p2		= p2;
 		__entry->type		= type;
-- 
1.7.4.4


* [PATCH 02/13] kvm/powerpc: Fix kvmppc_core_pending_dec
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
  2011-05-11 10:36 ` [PATCH 01/13] kvm/powerpc: Move fields between struct kvm_vcpu_arch and kvmppc_vcpu_book3s Paul Mackerras
@ 2011-05-11 10:38 ` Paul Mackerras
  2011-05-11 10:39 ` [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors Paul Mackerras
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:38 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

The vcpu->arch.pending_exceptions field is a bitmap indexed by
interrupt priority number as returned by kvmppc_book3s_vec2irqprio.
However, kvmppc_core_pending_dec was using the interrupt vector shifted
right by 7 as the bit index.  Fix it to use the irqprio value for the
decrementer interrupt instead.  This problem was found by code
inspection.
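
In other words (an illustrative sketch reusing the existing helpers, not
part of the patch), the bit must be set and tested in the same index
space:

	/* queueing already uses the irqprio index space ... */
	set_bit(kvmppc_book3s_vec2irqprio(BOOK3S_INTERRUPT_DECREMENTER),
		&vcpu->arch.pending_exceptions);

	/* ... so the pending test must use the same index, not vec >> 7 */
	if (test_bit(BOOK3S_IRQPRIO_DECREMENTER,
		     &vcpu->arch.pending_exceptions))
		;	/* a decrementer interrupt is pending for the guest */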

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kvm/book3s.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3501961..42a2fed 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -237,7 +237,7 @@ void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 
 int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
 {
-	return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions);
+	return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
 }
 
 void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
-- 
1.7.4.4


* [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
  2011-05-11 10:36 ` [PATCH 01/13] kvm/powerpc: Move fields between struct kvm_vcpu_arch and kvmppc_vcpu_book3s Paul Mackerras
  2011-05-11 10:38 ` [PATCH 02/13] kvm/powerpc: Fix kvmppc_core_pending_dec Paul Mackerras
@ 2011-05-11 10:39 ` Paul Mackerras
  2011-05-12  9:33     ` Alexander Graf
  2011-05-11 10:40 ` [PATCH 04/13] kvm/powerpc: Split out code from book3s.c into book3s_pr.c Paul Mackerras
                   ` (10 subsequent siblings)
  13 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:39 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

Commits a5d4f3ad3a ("powerpc: Base support for exceptions using
HSRR0/1") and 673b189a2e ("powerpc: Always use SPRN_SPRG_HSCRATCH0
when running in HV mode") cause compile and link errors for 32-bit
classic Book 3S processors when KVM is enabled.  This fixes these
errors.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/reg.h       |    5 +++++
 arch/powerpc/kvm/book3s_rmhandlers.S |    2 ++
 2 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 47e3416..05658b7 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -823,6 +823,11 @@
 	FTR_SECTION_ELSE_NESTED(66);			\
 	mtspr	SPRN_SPRG_HSCRATCH0,rX;			\
 	ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
+
+#else /* CONFIG_PPC_BOOK3S_64 */
+#define GET_SCRATCH0(rX)	mfspr	rX,SPRN_SPRG_SCRATCH0
+#define SET_SCRATCH0(rX)	mtspr	SPRN_SPRG_SCRATCH0,rX
+
 #endif
 
 #ifdef CONFIG_PPC_BOOK3E_64
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index ae99af6..1a1b344 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -112,7 +112,9 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_MACHINE_CHECK
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL
+#ifdef CONFIG_PPC_BOOK3S_64
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL_HV
+#endif
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALIGNMENT
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PROGRAM
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_FP_UNAVAIL
-- 
1.7.4.4


* [PATCH 04/13] kvm/powerpc: Split out code from book3s.c into book3s_pr.c
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (2 preceding siblings ...)
  2011-05-11 10:39 ` [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors Paul Mackerras
@ 2011-05-11 10:40 ` Paul Mackerras
  2011-05-11 10:41 ` [PATCH 05/13] powerpc, kvm: Rework KVM checks in first-level interrupt handlers Paul Mackerras
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:40 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

In preparation for adding code to enable KVM to use hypervisor mode
on 64-bit Book 3S processors, this splits book3s.c into two files,
book3s.c and book3s_pr.c, where book3s_pr.c contains the code that is
specific to running the guest in problem state (user mode) and book3s.c
contains code which should apply to all Book 3S processors.

In doing this, we abstract the accesses to the register values stored
in the shared page using a new vcpu_guest_state() accessor.  A few
other details are also abstracted, namely the interrupt offset,
updating the interrupt pending flag, and detecting if the guest is
in a critical section.  These are all things that will be different
when we use hypervisor mode.
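
For example (a minimal before/after sketch following the hunks below), a
direct access to the shared page such as

	deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;

becomes

	deliver = (vcpu_guest_state(vcpu)->msr & MSR_EE) && !crit;

so that the hypervisor-mode code can later supply a different definition
of vcpu_guest_state() without touching its callers.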

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |   37 ++
 arch/powerpc/kvm/Makefile             |    2 +
 arch/powerpc/kvm/book3s.c             | 1031 +--------------------------------
 arch/powerpc/kvm/book3s_pr.c          | 1009 ++++++++++++++++++++++++++++++++
 4 files changed, 1073 insertions(+), 1006 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_pr.c

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index cfb2012..12829bb 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -114,6 +114,7 @@ extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong ea, ulong ea_mask)
 extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
 extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end);
 extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
@@ -151,6 +152,22 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+#define vcpu_guest_state(vcpu)	((vcpu)->arch.shared)
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return to_book3s(vcpu)->hior;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	if (pending_now)
+		vcpu_guest_state(vcpu)->int_pending = 1;
+	else if (old_pending)
+		vcpu_guest_state(vcpu)->int_pending = 0;
+}
+
 static inline ulong dsisr(void)
 {
 	ulong r;
@@ -248,6 +265,26 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return to_svcpu(vcpu)->fault_dar;
 }
 
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	ulong crit_raw = vcpu_guest_state(vcpu)->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu_guest_state(vcpu)->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu_guest_state(vcpu)->msr & MSR_PR);
+
+	return crit;
+}
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3			0x113724FA
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4d68638..bf9854f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -43,6 +43,7 @@ kvm-book3s_64-objs := \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
+	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
@@ -56,6 +57,7 @@ kvm-book3s_32-objs := \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
+	book3s_pr.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
 	book3s_mmu_hpte.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 42a2fed..224e6a8 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -37,17 +37,6 @@
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 /* #define EXIT_DEBUG */
-/* #define DEBUG_EXT */
-
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-			     ulong msr);
-
-/* Some compatibility defines */
-#ifdef CONFIG_PPC_BOOK3S_32
-#define MSR_USER32 MSR_USER
-#define MSR_USER64 MSR_USER
-#define HW_PAGE_SIZE PAGE_SIZE
-#endif
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "exits",       VCPU_STAT(sum_exits) },
@@ -78,100 +67,11 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
-	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
-	       sizeof(get_paca()->shadow_vcpu));
-	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_32
-	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
-#endif
-}
-
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
-	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
-	       sizeof(get_paca()->shadow_vcpu));
-	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
-#endif
-
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
-}
-
-static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
-{
-	ulong smsr = vcpu->arch.shared->msr;
-
-	/* Guest MSR values */
-	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
-	/* Process MSR values */
-	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
-	/* External providers the guest reserved */
-	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
-	/* 64-bit Process MSR values */
-#ifdef CONFIG_PPC_BOOK3S_64
-	smsr |= MSR_ISF | MSR_HV;
-#endif
-	vcpu->arch.shadow_msr = smsr;
-}
-
-void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
-{
-	ulong old_msr = vcpu->arch.shared->msr;
-
-#ifdef EXIT_DEBUG
-	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
-#endif
-
-	msr &= to_book3s(vcpu)->msr_mask;
-	vcpu->arch.shared->msr = msr;
-	kvmppc_recalc_shadow_msr(vcpu);
-
-	if (msr & MSR_POW) {
-		if (!vcpu->arch.pending_exceptions) {
-			kvm_vcpu_block(vcpu);
-			vcpu->stat.halt_wakeup++;
-
-			/* Unset POW bit after we woke up */
-			msr &= ~MSR_POW;
-			vcpu->arch.shared->msr = msr;
-		}
-	}
-
-	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
-		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
-		kvmppc_mmu_flush_segments(vcpu);
-		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-
-		/* Preload magic page segment when in kernel mode */
-		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
-			struct kvm_vcpu_arch *a = &vcpu->arch;
-
-			if (msr & MSR_DR)
-				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
-			else
-				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
-		}
-	}
-
-	/* Preload FPU if it's enabled */
-	if (vcpu->arch.shared->msr & MSR_FP)
-		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-}
-
 void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
 {
-	vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
-	vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
-	kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
+	vcpu_guest_state(vcpu)->srr0 = kvmppc_get_pc(vcpu);
+	vcpu_guest_state(vcpu)->srr1 = vcpu_guest_state(vcpu)->msr | flags;
+	kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
 	vcpu->arch.mmu.reset_msr(vcpu);
 }
 
@@ -205,11 +105,13 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
 					  unsigned int vec)
 {
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+
 	clear_bit(kvmppc_book3s_vec2irqprio(vec),
 		  &vcpu->arch.pending_exceptions);
 
-	if (!vcpu->arch.pending_exceptions)
-		vcpu->arch.shared->int_pending = 0;
+	kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+				  old_pending);
 }
 
 void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -268,29 +170,16 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 	int deliver = 1;
 	int vec = 0;
 	ulong flags = 0ULL;
-	ulong crit_raw = vcpu->arch.shared->critical;
-	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-	bool crit;
-
-	/* Truncate crit indicators in 32 bit mode */
-	if (!(vcpu->arch.shared->msr & MSR_SF)) {
-		crit_raw &= 0xffffffff;
-		crit_r1 &= 0xffffffff;
-	}
-
-	/* Critical section when crit == r1 */
-	crit = (crit_raw == crit_r1);
-	/* ... and we're in supervisor mode */
-	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+	bool crit = kvmppc_critical_section(vcpu);
 
 	switch (priority) {
 	case BOOK3S_IRQPRIO_DECREMENTER:
-		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
+		deliver = (vcpu_guest_state(vcpu)->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_DECREMENTER;
 		break;
 	case BOOK3S_IRQPRIO_EXTERNAL:
 	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
-		deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
+		deliver = (vcpu_guest_state(vcpu)->msr & MSR_EE) && !crit;
 		vec = BOOK3S_INTERRUPT_EXTERNAL;
 		break;
 	case BOOK3S_IRQPRIO_SYSTEM_RESET:
@@ -393,64 +282,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	}
 
 	/* Tell the guest about our interrupt status */
-	if (*pending)
-		vcpu->arch.shared->int_pending = 1;
-	else if (old_pending)
-		vcpu->arch.shared->int_pending = 0;
-}
-
-void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
-{
-	u32 host_pvr;
-
-	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
-	vcpu->arch.pvr = pvr;
-#ifdef CONFIG_PPC_BOOK3S_64
-	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
-		kvmppc_mmu_book3s_64_init(vcpu);
-		to_book3s(vcpu)->hior = 0xfff00000;
-		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
-	} else
-#endif
-	{
-		kvmppc_mmu_book3s_32_init(vcpu);
-		to_book3s(vcpu)->hior = 0;
-		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
-	}
-
-	/* If we are in hypervisor level on 970, we can tell the CPU to
-	 * treat DCBZ as 32 bytes store */
-	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
-	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
-	    !strcmp(cur_cpu_spec->platform, "ppc970"))
-		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-
-	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
-	   really needs them in a VM on Cell and force disable them. */
-	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
-		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
-
-#ifdef CONFIG_PPC_BOOK3S_32
-	/* 32 bit Book3S always has 32 byte dcbz */
-	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
-#endif
-
-	/* On some CPUs we can execute paired single operations natively */
-	asm ( "mfpvr %0" : "=r"(host_pvr));
-	switch (host_pvr) {
-	case 0x00080200:	/* lonestar 2.0 */
-	case 0x00088202:	/* lonestar 2.2 */
-	case 0x70000100:	/* gekko 1.0 */
-	case 0x00080100:	/* gekko 2.0 */
-	case 0x00083203:	/* gekko 2.3a */
-	case 0x00083213:	/* gekko 2.3b */
-	case 0x00083204:	/* gekko 2.4 */
-	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
-	case 0x00087200:	/* broadway */
-		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
-		/* Enable HID2.PSE - in case we need it later */
-		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
-	}
+	kvmppc_update_int_pending(vcpu, *pending, old_pending);
 }
 
 pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -472,48 +304,10 @@ pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 	return gfn_to_pfn(vcpu->kvm, gfn);
 }
 
-/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
- * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
- * emulate 32 bytes dcbz length.
- *
- * The Book3s_64 inventors also realized this case and implemented a special bit
- * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
- *
- * My approach here is to patch the dcbz instruction on executing pages.
- */
-static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
-{
-	struct page *hpage;
-	u64 hpage_offset;
-	u32 *page;
-	int i;
-
-	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
-	if (is_error_page(hpage)) {
-		kvm_release_page_clean(hpage);
-		return;
-	}
-
-	hpage_offset = pte->raddr & ~PAGE_MASK;
-	hpage_offset &= ~0xFFFULL;
-	hpage_offset /= 4;
-
-	get_page(hpage);
-	page = kmap_atomic(hpage, KM_USER0);
-
-	/* patch dcbz into reserved instruction, so we trap */
-	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
-		if ((page[i] & 0xff0007ff) == INS_DCBZ)
-			page[i] &= 0xfffffff7;
-
-	kunmap_atomic(page, KM_USER0);
-	put_page(hpage);
-}
-
 static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
 			 struct kvmppc_pte *pte)
 {
-	int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
+	int relocated = (vcpu_guest_state(vcpu)->msr & (data ? MSR_DR : MSR_IR));
 	int r;
 
 	if (relocated) {
@@ -607,519 +401,6 @@ mmio:
 	return EMULATE_DO_MMIO;
 }
 
-static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	ulong mp_pa = vcpu->arch.magic_page_pa;
-
-	if (unlikely(mp_pa) &&
-	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
-		return 1;
-	}
-
-	return kvm_is_visible_gfn(vcpu->kvm, gfn);
-}
-
-int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
-			    ulong eaddr, int vec)
-{
-	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
-	int r = RESUME_GUEST;
-	int relocated;
-	int page_found = 0;
-	struct kvmppc_pte pte;
-	bool is_mmio = false;
-	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
-	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
-	u64 vsid;
-
-	relocated = data ? dr : ir;
-
-	/* Resolve real address if translation turned on */
-	if (relocated) {
-		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
-	} else {
-		pte.may_execute = true;
-		pte.may_read = true;
-		pte.may_write = true;
-		pte.raddr = eaddr & KVM_PAM;
-		pte.eaddr = eaddr;
-		pte.vpage = eaddr >> 12;
-	}
-
-	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
-	case 0:
-		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
-		break;
-	case MSR_DR:
-	case MSR_IR:
-		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
-
-		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
-			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
-		else
-			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
-		pte.vpage |= vsid;
-
-		if (vsid == -1)
-			page_found = -EINVAL;
-		break;
-	}
-
-	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-		/*
-		 * If we do the dcbz hack, we have to NX on every execution,
-		 * so we can patch the executing code. This renders our guest
-		 * NX-less.
-		 */
-		pte.may_execute = !data;
-	}
-
-	if (page_found == -ENOENT) {
-		/* Page not found in guest PTE entries */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-		kvmppc_book3s_queue_irqprio(vcpu, vec);
-	} else if (page_found == -EPERM) {
-		/* Storage protection */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->arch.shared->dsisr =
-			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
-		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
-		vcpu->arch.shared->msr |=
-			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
-		kvmppc_book3s_queue_irqprio(vcpu, vec);
-	} else if (page_found == -EINVAL) {
-		/* Page not found in guest SLB */
-		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
-	} else if (!is_mmio &&
-		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
-		/* The guest's PTE is not mapped yet. Map on the host */
-		kvmppc_mmu_map_page(vcpu, &pte);
-		if (data)
-			vcpu->stat.sp_storage++;
-		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-			(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
-			kvmppc_patch_dcbz(vcpu, &pte);
-	} else {
-		/* MMIO */
-		vcpu->stat.mmio_exits++;
-		vcpu->arch.paddr_accessed = pte.raddr;
-		r = kvmppc_emulate_mmio(run, vcpu);
-		if ( r == RESUME_HOST_NV )
-			r = RESUME_HOST;
-	}
-
-	return r;
-}
-
-static inline int get_fpr_index(int i)
-{
-#ifdef CONFIG_VSX
-	i *= 2;
-#endif
-	return i;
-}
-
-/* Give up external provider (FPU, Altivec, VSX) */
-void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
-{
-	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = (u64*)t->fpr;
-	int i;
-
-	if (!(vcpu->arch.guest_owned_ext & msr))
-		return;
-
-#ifdef DEBUG_EXT
-	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
-#endif
-
-	switch (msr) {
-	case MSR_FP:
-		giveup_fpu(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
-
-		vcpu->arch.fpscr = t->fpscr.val;
-		break;
-	case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-		giveup_altivec(current);
-		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
-		vcpu->arch.vscr = t->vscr;
-#endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		__giveup_vsx(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	vcpu->arch.guest_owned_ext &= ~msr;
-	current->thread.regs->msr &= ~msr;
-	kvmppc_recalc_shadow_msr(vcpu);
-}
-
-static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
-{
-	ulong srr0 = kvmppc_get_pc(vcpu);
-	u32 last_inst = kvmppc_get_last_inst(vcpu);
-	int ret;
-
-	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
-	if (ret == -ENOENT) {
-		ulong msr = vcpu->arch.shared->msr;
-
-		msr = kvmppc_set_field(msr, 33, 33, 1);
-		msr = kvmppc_set_field(msr, 34, 36, 0);
-		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
-		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
-		return EMULATE_AGAIN;
-	}
-
-	return EMULATE_DONE;
-}
-
-static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
-{
-
-	/* Need to do paired single emulation? */
-	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
-		return EMULATE_DONE;
-
-	/* Read out the instruction */
-	if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
-		/* Need to emulate */
-		return EMULATE_FAIL;
-
-	return EMULATE_AGAIN;
-}
-
-/* Handle external providers (FPU, Altivec, VSX) */
-static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
-			     ulong msr)
-{
-	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = (u64*)t->fpr;
-	int i;
-
-	/* When we have paired singles, we emulate in software */
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
-		return RESUME_GUEST;
-
-	if (!(vcpu->arch.shared->msr & msr)) {
-		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		return RESUME_GUEST;
-	}
-
-	/* We already own the ext */
-	if (vcpu->arch.guest_owned_ext & msr) {
-		return RESUME_GUEST;
-	}
-
-#ifdef DEBUG_EXT
-	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
-#endif
-
-	current->thread.regs->msr |= msr;
-
-	switch (msr) {
-	case MSR_FP:
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
-		t->fpscr.val = vcpu->arch.fpscr;
-		t->fpexc_mode = 0;
-		kvmppc_load_up_fpu();
-		break;
-	case MSR_VEC:
-#ifdef CONFIG_ALTIVEC
-		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
-		t->vscr = vcpu->arch.vscr;
-		t->vrsave = -1;
-		kvmppc_load_up_altivec();
-#endif
-		break;
-	case MSR_VSX:
-#ifdef CONFIG_VSX
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
-			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-		kvmppc_load_up_vsx();
-#endif
-		break;
-	default:
-		BUG();
-	}
-
-	vcpu->arch.guest_owned_ext |= msr;
-
-	kvmppc_recalc_shadow_msr(vcpu);
-
-	return RESUME_GUEST;
-}
-
-int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int exit_nr)
-{
-	int r = RESUME_HOST;
-
-	vcpu->stat.sum_exits++;
-
-	run->exit_reason = KVM_EXIT_UNKNOWN;
-	run->ready_for_interrupt_injection = 1;
-
-	trace_kvm_book3s_exit(exit_nr, vcpu);
-	kvm_resched(vcpu);
-	switch (exit_nr) {
-	case BOOK3S_INTERRUPT_INST_STORAGE:
-		vcpu->stat.pf_instruc++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-		/* We set segments as unused segments when invalidating them. So
-		 * treat the respective fault as segment fault. */
-		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
-		    == SR_INVALID) {
-			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
-			r = RESUME_GUEST;
-			break;
-		}
-#endif
-
-		/* only care about PTEG not found errors, but leave NX alone */
-		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
-			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
-			vcpu->stat.sp_instruc++;
-		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
-			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
-			/*
-			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
-			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
-			 *     that no guest that needs the dcbz hack does NX.
-			 */
-			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
-			r = RESUME_GUEST;
-		} else {
-			vcpu->arch.shared->msr |=
-				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	case BOOK3S_INTERRUPT_DATA_STORAGE:
-	{
-		ulong dar = kvmppc_get_fault_dar(vcpu);
-		vcpu->stat.pf_storage++;
-
-#ifdef CONFIG_PPC_BOOK3S_32
-		/* We set segments as unused segments when invalidating them. So
-		 * treat the respective fault as segment fault. */
-		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
-			kvmppc_mmu_map_segment(vcpu, dar);
-			r = RESUME_GUEST;
-			break;
-		}
-#endif
-
-		/* The only case we need to handle is missing shadow PTEs */
-		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
-			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
-		} else {
-			vcpu->arch.shared->dar = dar;
-			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_DATA_SEGMENT:
-		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
-			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
-			kvmppc_book3s_queue_irqprio(vcpu,
-				BOOK3S_INTERRUPT_DATA_SEGMENT);
-		}
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_INST_SEGMENT:
-		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
-			kvmppc_book3s_queue_irqprio(vcpu,
-				BOOK3S_INTERRUPT_INST_SEGMENT);
-		}
-		r = RESUME_GUEST;
-		break;
-	/* We're good on these - the host merely wanted to get our attention */
-	case BOOK3S_INTERRUPT_DECREMENTER:
-		vcpu->stat.dec_exits++;
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_EXTERNAL:
-		vcpu->stat.ext_intr_exits++;
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_PERFMON:
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_PROGRAM:
-	{
-		enum emulation_result er;
-		ulong flags;
-
-program_interrupt:
-		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
-
-		if (vcpu->arch.shared->msr & MSR_PR) {
-#ifdef EXIT_DEBUG
-			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-#endif
-			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
-			    (INS_DCBZ & 0xfffffff7)) {
-				kvmppc_core_queue_program(vcpu, flags);
-				r = RESUME_GUEST;
-				break;
-			}
-		}
-
-		vcpu->stat.emulated_inst_exits++;
-		er = kvmppc_emulate_instruction(run, vcpu);
-		switch (er) {
-		case EMULATE_DONE:
-			r = RESUME_GUEST_NV;
-			break;
-		case EMULATE_AGAIN:
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_FAIL:
-			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
-			kvmppc_core_queue_program(vcpu, flags);
-			r = RESUME_GUEST;
-			break;
-		case EMULATE_DO_MMIO:
-			run->exit_reason = KVM_EXIT_MMIO;
-			r = RESUME_HOST_NV;
-			break;
-		default:
-			BUG();
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_SYSCALL:
-		if (vcpu->arch.osi_enabled &&
-		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
-		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
-			/* MOL hypercalls */
-			u64 *gprs = run->osi.gprs;
-			int i;
-
-			run->exit_reason = KVM_EXIT_OSI;
-			for (i = 0; i < 32; i++)
-				gprs[i] = kvmppc_get_gpr(vcpu, i);
-			vcpu->arch.osi_needed = 1;
-			r = RESUME_HOST_NV;
-		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
-		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
-			/* KVM PV hypercalls */
-			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
-			r = RESUME_GUEST;
-		} else {
-			/* Guest syscalls */
-			vcpu->stat.syscall_exits++;
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-			r = RESUME_GUEST;
-		}
-		break;
-	case BOOK3S_INTERRUPT_FP_UNAVAIL:
-	case BOOK3S_INTERRUPT_ALTIVEC:
-	case BOOK3S_INTERRUPT_VSX:
-	{
-		int ext_msr = 0;
-
-		switch (exit_nr) {
-		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
-		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
-		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
-		}
-
-		switch (kvmppc_check_ext(vcpu, exit_nr)) {
-		case EMULATE_DONE:
-			/* everything ok - let's enable the ext */
-			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
-			break;
-		case EMULATE_FAIL:
-			/* we need to emulate this instruction */
-			goto program_interrupt;
-			break;
-		default:
-			/* nothing to worry about - go again */
-			break;
-		}
-		break;
-	}
-	case BOOK3S_INTERRUPT_ALIGNMENT:
-		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
-			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
-				kvmppc_get_last_inst(vcpu));
-			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
-				kvmppc_get_last_inst(vcpu));
-			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		}
-		r = RESUME_GUEST;
-		break;
-	case BOOK3S_INTERRUPT_MACHINE_CHECK:
-	case BOOK3S_INTERRUPT_TRACE:
-		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
-		r = RESUME_GUEST;
-		break;
-	default:
-		/* Ugh - bork here! What did we get? */
-		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
-			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
-		r = RESUME_HOST;
-		BUG();
-		break;
-	}
-
-
-	if (!(r & RESUME_HOST)) {
-		/* To avoid clobbering exit_reason, only check for signals if
-		 * we aren't already exiting to userspace for some other
-		 * reason. */
-		if (signal_pending(current)) {
-#ifdef EXIT_DEBUG
-			printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
-			vcpu->stat.signal_exits++;
-			run->exit_reason = KVM_EXIT_INTR;
-			r = -EINTR;
-		} else {
-			/* In case an interrupt came in that was triggered
-			 * from userspace (like DEC), we need to check what
-			 * to inject now! */
-			kvmppc_core_deliver_interrupts(vcpu);
-		}
-	}
-
-	trace_kvm_book3s_reenter(r, vcpu);
-
-	return r;
-}
-
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -1134,14 +415,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->ctr = kvmppc_get_ctr(vcpu);
 	regs->lr = kvmppc_get_lr(vcpu);
 	regs->xer = kvmppc_get_xer(vcpu);
-	regs->msr = vcpu->arch.shared->msr;
-	regs->srr0 = vcpu->arch.shared->srr0;
-	regs->srr1 = vcpu->arch.shared->srr1;
+	regs->msr = vcpu_guest_state(vcpu)->msr;
+	regs->srr0 = vcpu_guest_state(vcpu)->srr0;
+	regs->srr1 = vcpu_guest_state(vcpu)->srr1;
 	regs->pid = vcpu->arch.pid;
-	regs->sprg0 = vcpu->arch.shared->sprg0;
-	regs->sprg1 = vcpu->arch.shared->sprg1;
-	regs->sprg2 = vcpu->arch.shared->sprg2;
-	regs->sprg3 = vcpu->arch.shared->sprg3;
+	regs->sprg0 = vcpu_guest_state(vcpu)->sprg0;
+	regs->sprg1 = vcpu_guest_state(vcpu)->sprg1;
+	regs->sprg2 = vcpu_guest_state(vcpu)->sprg2;
+	regs->sprg3 = vcpu_guest_state(vcpu)->sprg3;
 	regs->sprg4 = vcpu->arch.sprg4;
 	regs->sprg5 = vcpu->arch.sprg5;
 	regs->sprg6 = vcpu->arch.sprg6;
@@ -1163,12 +444,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvmppc_set_lr(vcpu, regs->lr);
 	kvmppc_set_xer(vcpu, regs->xer);
 	kvmppc_set_msr(vcpu, regs->msr);
-	vcpu->arch.shared->srr0 = regs->srr0;
-	vcpu->arch.shared->srr1 = regs->srr1;
-	vcpu->arch.shared->sprg0 = regs->sprg0;
-	vcpu->arch.shared->sprg1 = regs->sprg1;
-	vcpu->arch.shared->sprg2 = regs->sprg2;
-	vcpu->arch.shared->sprg3 = regs->sprg3;
+	vcpu_guest_state(vcpu)->srr0 = regs->srr0;
+	vcpu_guest_state(vcpu)->srr1 = regs->srr1;
+	vcpu_guest_state(vcpu)->sprg0 = regs->sprg0;
+	vcpu_guest_state(vcpu)->sprg1 = regs->sprg1;
+	vcpu_guest_state(vcpu)->sprg2 = regs->sprg2;
+	vcpu_guest_state(vcpu)->sprg3 = regs->sprg3;
 	vcpu->arch.sprg4 = regs->sprg4;
 	vcpu->arch.sprg5 = regs->sprg5;
 	vcpu->arch.sprg6 = regs->sprg6;
@@ -1180,69 +461,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-	int i;
-
-	sregs->pvr = vcpu->arch.pvr;
-
-	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-		for (i = 0; i < 64; i++) {
-			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
-			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
-		}
-	} else {
-		for (i = 0; i < 16; i++)
-			sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
-
-		for (i = 0; i < 8; i++) {
-			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
-			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
-		}
-	}
-
-	return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
-                                  struct kvm_sregs *sregs)
-{
-	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
-	int i;
-
-	kvmppc_set_pvr(vcpu, sregs->pvr);
-
-	vcpu3s->sdr1 = sregs->u.s.sdr1;
-	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
-		for (i = 0; i < 64; i++) {
-			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-						    sregs->u.s.ppc64.slb[i].slbe);
-		}
-	} else {
-		for (i = 0; i < 16; i++) {
-			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
-		}
-		for (i = 0; i < 8; i++) {
-			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
-				       (u32)sregs->u.s.ppc32.ibat[i]);
-			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
-				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
-			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
-				       (u32)sregs->u.s.ppc32.dbat[i]);
-			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
-				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
-		}
-	}
-
-	/* Flush the MMU after messing with the segments */
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return 0;
-}
-
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	return -ENOTSUPP;
@@ -1297,202 +515,3 @@ out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
 }
-
-int kvmppc_core_check_processor_compat(void)
-{
-	return 0;
-}
-
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s;
-	struct kvm_vcpu *vcpu;
-	int err = -ENOMEM;
-	unsigned long p;
-
-	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
-	if (!vcpu_book3s)
-		goto out;
-
-	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
-		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
-	if (!vcpu_book3s->shadow_vcpu)
-		goto free_vcpu;
-
-	vcpu = &vcpu_book3s->vcpu;
-	err = kvm_vcpu_init(vcpu, kvm, id);
-	if (err)
-		goto free_shadow_vcpu;
-
-	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-	/* the real shared page fills the last 4k of our page */
-	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
-	if (!p)
-		goto uninit_vcpu;
-
-	vcpu->arch.host_retip = kvm_return_point;
-	vcpu->arch.host_msr = mfmsr();
-#ifdef CONFIG_PPC_BOOK3S_64
-	/* default to book3s_64 (970fx) */
-	vcpu->arch.pvr = 0x3C0301;
-#else
-	/* default to book3s_32 (750) */
-	vcpu->arch.pvr = 0x84202;
-#endif
-	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
-	vcpu_book3s->slb_nr = 64;
-
-	/* remember where some real-mode handlers are */
-	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
-	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
-	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
-#ifdef CONFIG_PPC_BOOK3S_64
-	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
-#else
-	vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
-#endif
-
-	vcpu->arch.shadow_msr = MSR_USER64;
-
-	err = kvmppc_mmu_init(vcpu);
-	if (err < 0)
-		goto uninit_vcpu;
-
-	return vcpu;
-
-uninit_vcpu:
-	kvm_vcpu_uninit(vcpu);
-free_shadow_vcpu:
-	kfree(vcpu_book3s->shadow_vcpu);
-free_vcpu:
-	vfree(vcpu_book3s);
-out:
-	return ERR_PTR(err);
-}
-
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
-{
-	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
-
-	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
-	kvm_vcpu_uninit(vcpu);
-	kfree(vcpu_book3s->shadow_vcpu);
-	vfree(vcpu_book3s);
-}
-
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
-{
-	int ret;
-	double fpr[32][TS_FPRWIDTH];
-	unsigned int fpscr;
-	int fpexc_mode;
-#ifdef CONFIG_ALTIVEC
-	vector128 vr[32];
-	vector128 vscr;
-	unsigned long uninitialized_var(vrsave);
-	int used_vr;
-#endif
-#ifdef CONFIG_VSX
-	int used_vsr;
-#endif
-	ulong ext_msr;
-
-	/* No need to go into the guest when all we do is going out */
-	if (signal_pending(current)) {
-		kvm_run->exit_reason = KVM_EXIT_INTR;
-		return -EINTR;
-	}
-
-	/* Save FPU state in stack */
-	if (current->thread.regs->msr & MSR_FP)
-		giveup_fpu(current);
-	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
-	fpscr = current->thread.fpscr.val;
-	fpexc_mode = current->thread.fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-	/* Save Altivec state in stack */
-	used_vr = current->thread.used_vr;
-	if (used_vr) {
-		if (current->thread.regs->msr & MSR_VEC)
-			giveup_altivec(current);
-		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
-		vscr = current->thread.vscr;
-		vrsave = current->thread.vrsave;
-	}
-#endif
-
-#ifdef CONFIG_VSX
-	/* Save VSX state in stack */
-	used_vsr = current->thread.used_vsr;
-	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
-			__giveup_vsx(current);
-#endif
-
-	/* Remember the MSR with disabled extensions */
-	ext_msr = current->thread.regs->msr;
-
-	/* XXX we get called with irq disabled - change that! */
-	local_irq_enable();
-
-	/* Preload FPU if it's enabled */
-	if (vcpu->arch.shared->msr & MSR_FP)
-		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
-
-	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
-
-	local_irq_disable();
-
-	current->thread.regs->msr = ext_msr;
-
-	/* Make sure we save the guest FPU/Altivec/VSX state */
-	kvmppc_giveup_ext(vcpu, MSR_FP);
-	kvmppc_giveup_ext(vcpu, MSR_VEC);
-	kvmppc_giveup_ext(vcpu, MSR_VSX);
-
-	/* Restore FPU state from stack */
-	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
-	current->thread.fpscr.val = fpscr;
-	current->thread.fpexc_mode = fpexc_mode;
-
-#ifdef CONFIG_ALTIVEC
-	/* Restore Altivec state from stack */
-	if (used_vr && current->thread.used_vr) {
-		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
-		current->thread.vscr = vscr;
-		current->thread.vrsave = vrsave;
-	}
-	current->thread.used_vr = used_vr;
-#endif
-
-#ifdef CONFIG_VSX
-	current->thread.used_vsr = used_vsr;
-#endif
-
-	return ret;
-}
-
-static int kvmppc_book3s_init(void)
-{
-	int r;
-
-	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-		     THIS_MODULE);
-
-	if (r)
-		return r;
-
-	r = kvmppc_mmu_hpte_sysinit();
-
-	return r;
-}
-
-static void kvmppc_book3s_exit(void)
-{
-	kvmppc_mmu_hpte_sysexit();
-	kvm_exit();
-}
-
-module_init(kvmppc_book3s_init);
-module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644
index 0000000..0eb66e5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ * Functions relating to running KVM on Book 3S processors where
+ * we don't have access to hypervisor mode, and we run the guest
+ * in problem state (user mode).
+ *
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "trace.h"
+
+/* #define EXIT_DEBUG */
+/* #define DEBUG_EXT */
+
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr);
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#endif
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb));
+	memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu,
+	       sizeof(get_paca()->shadow_vcpu));
+	to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
+#endif
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb));
+	memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu,
+	       sizeof(get_paca()->shadow_vcpu));
+	to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max;
+#endif
+
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	kvmppc_giveup_ext(vcpu, MSR_VEC);
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+}
+
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+	ulong smsr = vcpu->arch.shared->msr;
+
+	/* Guest MSR values */
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
+	/* Process MSR values */
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+	/* External providers the guest reserved */
+	smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
+	/* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+	smsr |= MSR_ISF | MSR_HV;
+#endif
+	vcpu->arch.shadow_msr = smsr;
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	ulong old_msr = vcpu->arch.shared->msr;
+
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+
+	msr &= to_book3s(vcpu)->msr_mask;
+	vcpu->arch.shared->msr = msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	if (msr & MSR_POW) {
+		if (!vcpu->arch.pending_exceptions) {
+			kvm_vcpu_block(vcpu);
+			vcpu->stat.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			vcpu->arch.shared->msr = msr;
+		}
+	}
+
+	if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
+		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
+	}
+
+	/* Preload FPU if it's enabled */
+	if (vcpu->arch.shared->msr & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	u32 host_pvr;
+
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
+	vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
+	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+		kvmppc_mmu_book3s_64_init(vcpu);
+		to_book3s(vcpu)->hior = 0xfff00000;
+		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+	} else
+#endif
+	{
+		kvmppc_mmu_book3s_32_init(vcpu);
+		to_book3s(vcpu)->hior = 0;
+		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+	}
+
+	/* If we are in hypervisor level on 970, we can tell the CPU to
+	 * treat DCBZ as 32 bytes store */
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+	    !strcmp(cur_cpu_spec->platform, "ppc970"))
+		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+
+	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
+	   really needs them in a VM on Cell and force disable them. */
+	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	/* 32 bit Book3S always has 32 byte dcbz */
+	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+	/* On some CPUs we can execute paired single operations natively */
+	asm ( "mfpvr %0" : "=r"(host_pvr));
+	switch (host_pvr) {
+	case 0x00080200:	/* lonestar 2.0 */
+	case 0x00088202:	/* lonestar 2.2 */
+	case 0x70000100:	/* gekko 1.0 */
+	case 0x00080100:	/* gekko 2.0 */
+	case 0x00083203:	/* gekko 2.3a */
+	case 0x00083213:	/* gekko 2.3b */
+	case 0x00083204:	/* gekko 2.4 */
+	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+	case 0x00087200:	/* broadway */
+		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+		/* Enable HID2.PSE - in case we need it later */
+		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+	}
+}
+
+/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
+ * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
+ * emulate 32 bytes dcbz length.
+ *
+ * The Book3s_64 inventors also realized this case and implemented a special bit
+ * in the HID5 register, which is a hypervisor resource. Thus we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	struct page *hpage;
+	u64 hpage_offset;
+	u32 *page;
+	int i;
+
+	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+	if (is_error_page(hpage)) {
+		kvm_release_page_clean(hpage);
+		return;
+	}
+
+	hpage_offset = pte->raddr & ~PAGE_MASK;
+	hpage_offset &= ~0xFFFULL;
+	hpage_offset /= 4;
+
+	get_page(hpage);
+	page = kmap_atomic(hpage, KM_USER0);
+
+	/* patch dcbz into reserved instruction, so we trap */
+	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+		if ((page[i] & 0xff0007ff) == INS_DCBZ)
+			page[i] &= 0xfffffff7;
+
+	kunmap_atomic(page, KM_USER0);
+	put_page(hpage);
+}
+
+static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (unlikely(mp_pa) &&
+	    unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
+		return 1;
+	}
+
+	return kvm_is_visible_gfn(vcpu->kvm, gfn);
+}
+
+int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			    ulong eaddr, int vec)
+{
+	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+	int r = RESUME_GUEST;
+	int relocated;
+	int page_found = 0;
+	struct kvmppc_pte pte;
+	bool is_mmio = false;
+	bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
+	bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
+	u64 vsid;
+
+	relocated = data ? dr : ir;
+
+	/* Resolve real address if translation turned on */
+	if (relocated) {
+		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
+	} else {
+		pte.may_execute = true;
+		pte.may_read = true;
+		pte.may_write = true;
+		pte.raddr = eaddr & KVM_PAM;
+		pte.eaddr = eaddr;
+		pte.vpage = eaddr >> 12;
+	}
+
+	switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+		break;
+	case MSR_DR:
+	case MSR_IR:
+		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+		if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
+			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+		else
+			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+		pte.vpage |= vsid;
+
+		if (vsid == -1)
+			page_found = -EINVAL;
+		break;
+	}
+
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+		/*
+		 * If we do the dcbz hack, we have to NX on every execution,
+		 * so we can patch the executing code. This renders our guest
+		 * NX-less.
+		 */
+		pte.may_execute = !data;
+	}
+
+	if (page_found == -ENOENT) {
+		/* Page not found in guest PTE entries */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EPERM) {
+		/* Storage protection */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->arch.shared->dsisr =
+			to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
+		vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
+		vcpu->arch.shared->msr |=
+			(to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EINVAL) {
+		/* Page not found in guest SLB */
+		vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+	} else if (!is_mmio &&
+		   kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
+		/* The guest's PTE is not mapped yet. Map on the host */
+		kvmppc_mmu_map_page(vcpu, &pte);
+		if (data)
+			vcpu->stat.sp_storage++;
+		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			(!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+			kvmppc_patch_dcbz(vcpu, &pte);
+	} else {
+		/* MMIO */
+		vcpu->stat.mmio_exits++;
+		vcpu->arch.paddr_accessed = pte.raddr;
+		r = kvmppc_emulate_mmio(run, vcpu);
+		if ( r == RESUME_HOST_NV )
+			r = RESUME_HOST;
+	}
+
+	return r;
+}
+
+static inline int get_fpr_index(int i)
+{
+#ifdef CONFIG_VSX
+	i *= 2;
+#endif
+	return i;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+	u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+	u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+	u64 *thread_fpr = (u64*)t->fpr;
+	int i;
+
+	if (!(vcpu->arch.guest_owned_ext & msr))
+		return;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+	switch (msr) {
+	case MSR_FP:
+		giveup_fpu(current);
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
+
+		vcpu->arch.fpscr = t->fpscr.val;
+		break;
+	case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+		giveup_altivec(current);
+		memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
+		vcpu->arch.vscr = t->vscr;
+#endif
+		break;
+	case MSR_VSX:
+#ifdef CONFIG_VSX
+		__giveup_vsx(current);
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+			vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+		break;
+	default:
+		BUG();
+	}
+
+	vcpu->arch.guest_owned_ext &= ~msr;
+	current->thread.regs->msr &= ~msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+}
+
+static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
+{
+	ulong srr0 = kvmppc_get_pc(vcpu);
+	u32 last_inst = kvmppc_get_last_inst(vcpu);
+	int ret;
+
+	ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
+	if (ret == -ENOENT) {
+		ulong msr = vcpu->arch.shared->msr;
+
+		msr = kvmppc_set_field(msr, 33, 33, 1);
+		msr = kvmppc_set_field(msr, 34, 36, 0);
+		vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
+		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
+		return EMULATE_AGAIN;
+	}
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_check_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr)
+{
+
+	/* Need to do paired single emulation? */
+	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+		return EMULATE_DONE;
+
+	/* Read out the instruction */
+	if (kvmppc_read_inst(vcpu) == EMULATE_DONE)
+		/* Need to emulate */
+		return EMULATE_FAIL;
+
+	return EMULATE_AGAIN;
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+	u64 *vcpu_fpr = vcpu->arch.fpr;
+#ifdef CONFIG_VSX
+	u64 *vcpu_vsx = vcpu->arch.vsr;
+#endif
+	u64 *thread_fpr = (u64*)t->fpr;
+	int i;
+
+	/* When we have paired singles, we emulate in software */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+		return RESUME_GUEST;
+
+	if (!(vcpu->arch.shared->msr & msr)) {
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		return RESUME_GUEST;
+	}
+
+	/* We already own the ext */
+	if (vcpu->arch.guest_owned_ext & msr) {
+		return RESUME_GUEST;
+	}
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+	current->thread.regs->msr |= msr;
+
+	switch (msr) {
+	case MSR_FP:
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
+			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
+
+		t->fpscr.val = vcpu->arch.fpscr;
+		t->fpexc_mode = 0;
+		kvmppc_load_up_fpu();
+		break;
+	case MSR_VEC:
+#ifdef CONFIG_ALTIVEC
+		memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
+		t->vscr = vcpu->arch.vscr;
+		t->vrsave = -1;
+		kvmppc_load_up_altivec();
+#endif
+		break;
+	case MSR_VSX:
+#ifdef CONFIG_VSX
+		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
+			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+		kvmppc_load_up_vsx();
+#endif
+		break;
+	default:
+		BUG();
+	}
+
+	vcpu->arch.guest_owned_ext |= msr;
+
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	return RESUME_GUEST;
+}
+
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	trace_kvm_book3s_exit(exit_nr, vcpu);
+	kvm_resched(vcpu);
+	switch (exit_nr) {
+	case BOOK3S_INTERRUPT_INST_STORAGE:
+		vcpu->stat.pf_instruc++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]
+		    == SR_INVALID) {
+			kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+			r = RESUME_GUEST;
+			break;
+		}
+#endif
+
+		/* only care about PTEG not found errors, but leave NX alone */
+		if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) {
+			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
+			vcpu->stat.sp_instruc++;
+		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+			/*
+			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+			 *     that no guest that needs the dcbz hack does NX.
+			 */
+			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+			r = RESUME_GUEST;
+		} else {
+			vcpu->arch.shared->msr |=
+				to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	case BOOK3S_INTERRUPT_DATA_STORAGE:
+	{
+		ulong dar = kvmppc_get_fault_dar(vcpu);
+		vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) {
+			kvmppc_mmu_map_segment(vcpu, dar);
+			r = RESUME_GUEST;
+			break;
+		}
+#endif
+
+		/* The only case we need to handle is missing shadow PTEs */
+		if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
+			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
+		} else {
+			vcpu->arch.shared->dar = dar;
+			vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+			vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_DATA_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_INST_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_INST_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		enum emulation_result er;
+		ulong flags;
+
+program_interrupt:
+		flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
+
+		if (vcpu->arch.shared->msr & MSR_PR) {
+#ifdef EXIT_DEBUG
+			printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+#endif
+			if ((kvmppc_get_last_inst(vcpu) & 0xff0007ff) !=
+			    (INS_DCBZ & 0xfffffff7)) {
+				kvmppc_core_queue_program(vcpu, flags);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+
+		vcpu->stat.emulated_inst_exits++;
+		er = kvmppc_emulate_instruction(run, vcpu);
+		switch (er) {
+		case EMULATE_DONE:
+			r = RESUME_GUEST_NV;
+			break;
+		case EMULATE_AGAIN:
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_FAIL:
+			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+			       __func__, kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
+			kvmppc_core_queue_program(vcpu, flags);
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_DO_MMIO:
+			run->exit_reason = KVM_EXIT_MMIO;
+			r = RESUME_HOST_NV;
+			break;
+		default:
+			BUG();
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+		if (vcpu->arch.osi_enabled &&
+		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
+			u64 *gprs = run->osi.gprs;
+			int i;
+
+			run->exit_reason = KVM_EXIT_OSI;
+			for (i = 0; i < 32; i++)
+				gprs[i] = kvmppc_get_gpr(vcpu, i);
+			vcpu->arch.osi_needed = 1;
+			r = RESUME_HOST_NV;
+		} else if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			vcpu->stat.syscall_exits++;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	case BOOK3S_INTERRUPT_FP_UNAVAIL:
+	case BOOK3S_INTERRUPT_ALTIVEC:
+	case BOOK3S_INTERRUPT_VSX:
+	{
+		int ext_msr = 0;
+
+		switch (exit_nr) {
+		case BOOK3S_INTERRUPT_FP_UNAVAIL: ext_msr = MSR_FP;  break;
+		case BOOK3S_INTERRUPT_ALTIVEC:    ext_msr = MSR_VEC; break;
+		case BOOK3S_INTERRUPT_VSX:        ext_msr = MSR_VSX; break;
+		}
+
+		switch (kvmppc_check_ext(vcpu, exit_nr)) {
+		case EMULATE_DONE:
+			/* everything ok - let's enable the ext */
+			r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+			break;
+		case EMULATE_FAIL:
+			/* we need to emulate this instruction */
+			goto program_interrupt;
+			break;
+		default:
+			/* nothing to worry about - go again */
+			break;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_ALIGNMENT:
+		if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
+			vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
+				kvmppc_get_last_inst(vcpu));
+			vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
+				kvmppc_get_last_inst(vcpu));
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+	case BOOK3S_INTERRUPT_TRACE:
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		r = RESUME_GUEST;
+		break;
+	default:
+		/* Ugh - bork here! What did we get? */
+		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+			exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(current)) {
+#ifdef EXIT_DEBUG
+			printk(KERN_EMERG "KVM: Going back to host\n");
+#endif
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			/* In case an interrupt came in that was triggered
+			 * from userspace (like DEC), we need to check what
+			 * to inject now! */
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	trace_kvm_book3s_reenter(r, vcpu);
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+		}
+	} else {
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = vcpu_guest_state(vcpu)->sr[i];
+
+		for (i = 0; i < 8; i++) {
+			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+		}
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	vcpu3s->sdr1 = sregs->u.s.sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
+						    sregs->u.s.ppc64.slb[i].slbe);
+		}
+	} else {
+		for (i = 0; i < 16; i++) {
+			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
+		}
+		for (i = 0; i < 8; i++) {
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
+				       (u32)sregs->u.s.ppc32.ibat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
+				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
+				       (u32)sregs->u.s.ppc32.dbat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
+				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
+		}
+	}
+
+	/* Flush the MMU after messing with the segments */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	return 0;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s;
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long p;
+
+	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+	if (!vcpu_book3s)
+		goto out;
+
+	vcpu_book3s->shadow_vcpu = (struct kvmppc_book3s_shadow_vcpu *)
+		kzalloc(sizeof(*vcpu_book3s->shadow_vcpu), GFP_KERNEL);
+	if (!vcpu_book3s->shadow_vcpu)
+		goto free_vcpu;
+
+	vcpu = &vcpu_book3s->vcpu;
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_shadow_vcpu;
+
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	/* the real shared page fills the last 4k of our page */
+	vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
+	if (!p)
+		goto uninit_vcpu;
+
+	vcpu->arch.host_retip = kvm_return_point;
+	vcpu->arch.host_msr = mfmsr();
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* default to book3s_64 (970fx) */
+	vcpu->arch.pvr = 0x3C0301;
+#else
+	/* default to book3s_32 (750) */
+	vcpu->arch.pvr = 0x84202;
+#endif
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+	vcpu_book3s->slb_nr = 64;
+
+	/* remember where some real-mode handlers are */
+	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
+	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
+	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+#ifdef CONFIG_PPC_BOOK3S_64
+	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+#else
+	vcpu->arch.rmcall = (ulong)kvmppc_rmcall;
+#endif
+
+	vcpu->arch.shadow_msr = MSR_USER64;
+
+	err = kvmppc_mmu_init(vcpu);
+	if (err < 0)
+		goto uninit_vcpu;
+
+	return vcpu;
+
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+free_shadow_vcpu:
+	kfree(vcpu_book3s->shadow_vcpu);
+free_vcpu:
+	vfree(vcpu_book3s);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu_book3s->shadow_vcpu);
+	vfree(vcpu_book3s);
+}
+
+extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret;
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
+#ifdef CONFIG_ALTIVEC
+	vector128 vr[32];
+	vector128 vscr;
+	unsigned long uninitialized_var(vrsave);
+	int used_vr;
+#endif
+#ifdef CONFIG_VSX
+	int used_vsr;
+#endif
+	ulong ext_msr;
+
+	/* No need to go into the guest when all we do is going out */
+	if (signal_pending(current)) {
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	/* Save FPU state in stack */
+	if (current->thread.regs->msr & MSR_FP)
+		giveup_fpu(current);
+	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+	fpscr = current->thread.fpscr.val;
+	fpexc_mode = current->thread.fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+	/* Save Altivec state in stack */
+	used_vr = current->thread.used_vr;
+	if (used_vr) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+		vscr = current->thread.vscr;
+		vrsave = current->thread.vrsave;
+	}
+#endif
+
+#ifdef CONFIG_VSX
+	/* Save VSX state in stack */
+	used_vsr = current->thread.used_vsr;
+	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
+			__giveup_vsx(current);
+#endif
+
+	/* Remember the MSR with disabled extensions */
+	ext_msr = current->thread.regs->msr;
+
+	/* XXX we get called with irq disabled - change that! */
+	local_irq_enable();
+
+	/* Preload FPU if it's enabled */
+	if (vcpu->arch.shared->msr & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
+
+	local_irq_disable();
+
+	current->thread.regs->msr = ext_msr;
+
+	/* Make sure we save the guest FPU/Altivec/VSX state */
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	kvmppc_giveup_ext(vcpu, MSR_VEC);
+	kvmppc_giveup_ext(vcpu, MSR_VSX);
+
+	/* Restore FPU state from stack */
+	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = fpscr;
+	current->thread.fpexc_mode = fpexc_mode;
+
+#ifdef CONFIG_ALTIVEC
+	/* Restore Altivec state from stack */
+	if (used_vr && current->thread.used_vr) {
+		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+		current->thread.vscr = vscr;
+		current->thread.vrsave = vrsave;
+	}
+	current->thread.used_vr = used_vr;
+#endif
+
+#ifdef CONFIG_VSX
+	current->thread.used_vsr = used_vsr;
+#endif
+
+	return ret;
+}
+
+static int kvmppc_book3s_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+		     THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hpte_sysinit();
+
+	return r;
+}
+
+static void kvmppc_book3s_exit(void)
+{
+	kvmppc_mmu_hpte_sysexit();
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 05/13] powerpc, kvm: Rework KVM checks in first-level interrupt handlers
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (3 preceding siblings ...)
  2011-05-11 10:40 ` [PATCH 04/13] kvm/powerpc: Split out code from book3s.c into book3s_pr.c Paul Mackerras
@ 2011-05-11 10:41 ` Paul Mackerras
  2011-05-11 10:42 ` [PATCH 06/13] kvm/powerpc: Deliver program interrupts right away instead of queueing them Paul Mackerras
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:41 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

Instead of branching out-of-line with the DO_KVM macro to check if we
are in a KVM guest at the time of an interrupt, this moves the KVM
check inline in the first-level interrupt handlers.  This speeds up
the non-KVM case and makes sure that none of the interrupt handlers
are missing the check.

Because the first-level interrupt handlers are now larger, some things
had to be moved out of line in exceptions-64s.S.

This all necessitated some minor changes to the interrupt entry code
in KVM.  This also streamlines the book3s_32 KVM test.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/exception-64s.h   |  117 ++++++++++++------
 arch/powerpc/kernel/exceptions-64s.S       |  190 +++++++++++++++++-----------
 arch/powerpc/kvm/book3s_rmhandlers.S       |   78 ++++++------
 arch/powerpc/kvm/book3s_segment.S          |    7 +
 arch/powerpc/platforms/iseries/exception.S |    2 +-
 arch/powerpc/platforms/iseries/exception.h |    4 +-
 6 files changed, 246 insertions(+), 152 deletions(-)
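
Not part of the patch: below is a minimal, self-contained C sketch of the
idea described above, for readers who don't want to chase the assembler
macros.  Every name in it is illustrative; the real change is the
KVMTEST/KVM_HANDLER macro rework in the diff that follows.

#include <stdio.h>

/* Each first-level vector now tests an "in guest" flag inline instead of
 * branching to a common out-of-line DO_KVM stub; the non-KVM path stays
 * on the fast, straight-line route. */
struct paca { int kvm_in_guest; };
static struct paca this_cpu;

static void kvm_entry(int vec)   { printf("KVM handles vector 0x%x\n", vec); }
static void linux_entry(int vec) { printf("Linux handles vector 0x%x\n", vec); }

static void first_level_handler(int vec)
{
	if (this_cpu.kvm_in_guest)	/* inline check, taken only while a guest runs */
		kvm_entry(vec);
	else
		linux_entry(vec);	/* host path, no out-of-line detour any more */
}

int main(void)
{
	this_cpu.kvm_in_guest = 0;
	first_level_handler(0x300);	/* data storage interrupt, host case */
	this_cpu.kvm_in_guest = 1;
	first_level_handler(0x300);	/* same vector taken while a guest was running */
	return 0;
}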

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index f5dfe34..2a770d8 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -61,19 +61,22 @@
 #define EXC_HV	H
 #define EXC_STD
 
-#define EXCEPTION_PROLOG_1(area)					\
+#define __EXCEPTION_PROLOG_1(area, extra, vec)				\
 	GET_PACA(r13);							\
 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
 	std	r10,area+EX_R10(r13);					\
+	mfcr	r9;							\
+	extra(vec);							\
 	std	r11,area+EX_R11(r13);					\
 	std	r12,area+EX_R12(r13);					\
 	BEGIN_FTR_SECTION_NESTED(66);					\
 	mfspr	r10,SPRN_CFAR;						\
 	std	r10,area+EX_CFAR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
-	GET_SCRATCH0(r9);						\
-	std	r9,area+EX_R13(r13);					\
-	mfcr	r9
+	GET_SCRATCH0(r10);						\
+	std	r10,area+EX_R13(r13)
+#define EXCEPTION_PROLOG_1(area, extra, vec)				\
+	__EXCEPTION_PROLOG_1(area, extra, vec)
 
 #define __EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
@@ -85,13 +88,54 @@
 	mtspr	SPRN_##h##SRR1,r10;					\
 	h##rfid;							\
 	b	.	/* prevent speculative execution */
-#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
+#define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
 
-#define EXCEPTION_PROLOG_PSERIES(area, label, h)			\
-	EXCEPTION_PROLOG_1(area);					\
+#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)		\
+	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
+#define __KVMTEST(n)							\
+	lbz	r10,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13);			\
+	cmpwi	r10,0;							\
+	bne	do_kvm_##n
+
+#define __KVM_HANDLER(area, h, n)					\
+do_kvm_##n:								\
+	ld	r10,area+EX_R10(r13);					\
+	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	ld	r9,area+EX_R9(r13);					\
+	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	li	r12,n;							\
+	b	kvmppc_interrupt
+
+#define __KVM_HANDLER_SKIP(area, h, n)					\
+do_kvm_##n:								\
+	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
+	ld	r10,area+EX_R10(r13);					\
+	beq	89f;							\
+	stw	r9,PACA_KVM_SVCPU+SVCPU_SCRATCH1(r13);			\
+	ld	r9,area+EX_R9(r13);					\
+	std	r12,PACA_KVM_SVCPU+SVCPU_SCRATCH0(r13);			\
+	li	r12,n;							\
+	b	kvmppc_interrupt;					\
+89:	mtocrf	0x80,r9;						\
+	ld	r9,area+EX_R9(r13);					\
+	b	kvmppc_skip_##h##interrupt
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define KVMTEST(n)			__KVMTEST(n)
+#define KVM_HANDLER(area, h, n)		__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST(n)
+#define KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_SKIP(area, h, n)
+#endif
+
+#define NOTEST(n)
+
 /*
  * The common exception prolog is used for all except a few exceptions
  * such as a segment miss on a kernel address.  We have to be prepared
@@ -164,57 +208,54 @@
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	HMT_MEDIUM;					\
-	DO_KVM	vec;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
+				 EXC_STD, KVMTEST, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
 	.globl label##_hv;				\
 label##_hv:						\
 	HMT_MEDIUM;					\
-	DO_KVM	vec;					\
-	SET_SCRATCH0(r13);	/* save r13 */		\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common, EXC_HV)
+	SET_SCRATCH0(r13);	/* save r13 */			\
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
+				 EXC_HV, KVMTEST, vec)
 
-#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h)			\
-	HMT_MEDIUM;							\
-	DO_KVM	vec;							\
-	SET_SCRATCH0(r13);    /* save r13 */				\
-	GET_PACA(r13);							\
-	std	r9,PACA_EXGEN+EX_R9(r13);	/* save r9, r10 */	\
-	std	r10,PACA_EXGEN+EX_R10(r13);				\
+#define __SOFTEN_TEST(h)						\
 	lbz	r10,PACASOFTIRQEN(r13);					\
-	mfcr	r9;							\
 	cmpwi	r10,0;							\
-	beq	masked_##h##interrupt;					\
-	GET_SCRATCH0(r10);						\
-	std	r10,PACA_EXGEN+EX_R13(r13);				\
-	std	r11,PACA_EXGEN+EX_R11(r13);				\
-	std	r12,PACA_EXGEN+EX_R12(r13);				\
-	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
-	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
-	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
-	LOAD_HANDLER(r12,label##_common)				\
-	mtspr	SPRN_##h##SRR0,r12;					\
-	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
-	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
-	b	.	/* prevent speculative execution */
-#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h)			\
-	__MASKABLE_EXCEPTION_PSERIES(vec, label, h)
+	beq	masked_##h##interrupt
+#define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
+
+#define SOFTEN_TEST(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_STD)
+
+#define SOFTEN_TEST_HV(vec)						\
+	KVMTEST(vec);							\
+	_SOFTEN_TEST(EXC_HV)
+
+#define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
+	HMT_MEDIUM;							\
+	SET_SCRATCH0(r13);    /* save r13 */				\
+	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);		\
+	EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
+#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
+	__MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)			\
 	. = loc;							\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
-	_MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_STD)
+	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
+				    EXC_STD, SOFTEN_TEST)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
 	.globl label##_hv;						\
 label##_hv:								\
-	_MASKABLE_EXCEPTION_PSERIES(vec, label, EXC_HV)
+	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
+				    EXC_HV, SOFTEN_TEST_HV)
 
 #ifdef CONFIG_PPC_ISERIES
 #define DISABLE_INTS				\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 0ec3b42..cbdf374 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -40,7 +40,6 @@ __start_interrupts:
 	.globl system_reset_pSeries;
 system_reset_pSeries:
 	HMT_MEDIUM;
-	DO_KVM	0x100;
 	SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -65,67 +64,45 @@ BEGIN_FTR_SECTION
 	beq	.
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+				 NOTEST, 0x100)
 
 	. = 0x200
-_machine_check_pSeries:
-	HMT_MEDIUM
-	DO_KVM	0x200
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+machine_check_pSeries_1:
+	/* This is moved out of line as it can be patched by FW, but
+	 * some code path might still want to branch into the original
+	 * vector
+	 */
+	b	machine_check_pSeries
 
 	. = 0x300
 	.globl data_access_pSeries
 data_access_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x300
 	SET_SCRATCH0(r13)
+#ifndef CONFIG_POWER4_ONLY
 BEGIN_FTR_SECTION
-	GET_PACA(r13)
-	std	r9,PACA_EXSLB+EX_R9(r13)
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	mfspr	r10,SPRN_DAR
-	mfspr	r9,SPRN_DSISR
-	srdi	r10,r10,60
-	rlwimi	r10,r9,16,0x20
-	mfcr	r9
-	cmpwi	r10,0x2c
-	beq	do_stab_bolted_pSeries
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXGEN+EX_R11(r13)
-	ld	r11,PACA_EXSLB+EX_R9(r13)
-	std	r12,PACA_EXGEN+EX_R12(r13)
-	GET_SCRATCH0(r12)
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	std	r11,PACA_EXGEN+EX_R9(r13)
-	std	r12,PACA_EXGEN+EX_R13(r13)
-	EXCEPTION_PROLOG_PSERIES_1(data_access_common, EXC_STD)
-FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD)
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
+	b	data_access_check_stab
+data_access_not_stab:
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
+#endif
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
+				 KVMTEST, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x380
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
-	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
-	mfcr	r9
 #ifdef __DISABLED__
 	/* Keep that around for when we re-implement dynamic VSIDs */
 	cmpdi	r3,0
 	bge	slb_miss_user_pseries
 #endif /* __DISABLED__ */
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
 	b	.slb_miss_realmode
 #else
@@ -147,24 +124,16 @@ data_access_slb_pSeries:
 	.globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0x480
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
-	std	r9,PACA_EXSLB+EX_R9(r13)	/* save r9 - r12 */
-	mfcr	r9
 #ifdef __DISABLED__
 	/* Keep that around for when we re-implement dynamic VSIDs */
 	cmpdi	r3,0
 	bge	slb_miss_user_pseries
 #endif /* __DISABLED__ */
-	std	r10,PACA_EXSLB+EX_R10(r13)
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	mfspr	r12,SPRN_SRR1		/* and SRR1 */
+	mfspr	r12,SPRN_SRR1
 #ifndef CONFIG_RELOCATABLE
 	b	.slb_miss_realmode
 #else
@@ -184,26 +153,46 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD)
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
 	FTR_SECTION_ELSE
-		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV)
+		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
+					    EXC_HV, SOFTEN_TEST_HV)
+		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
 	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
-	MASKABLE_EXCEPTION_HV(0x980, 0x980, decrementer)
+	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
 system_call_pSeries:
 	HMT_MEDIUM
-	DO_KVM	0xc00
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9,PACA_EXGEN+EX_R9(r13)
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	mfcr	r9
+	KVMTEST(0xc00)
+	GET_SCRATCH0(r13)
+#endif
 BEGIN_FTR_SECTION
 	cmpdi	r0,0x1ebe
 	beq-	1f
@@ -220,6 +209,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	rfid
 	b	.	/* prevent speculative execution */
 
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+
 /* Fast LE/BE switch system call */
 1:	mfspr	r12,SPRN_SRR1
 	xori	r12,r12,MSR_LE
@@ -228,6 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -262,30 +254,94 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
+
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
+
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
 
 /*** Out of line interrupts support ***/
 
+	/* moved from 0x200 */
+machine_check_pSeries:
+	.globl machine_check_fwnmi
+machine_check_fwnmi:
+	HMT_MEDIUM
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common,
+				 EXC_STD, KVMTEST, 0x200)
+	KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
+
+#ifndef CONFIG_POWER4_ONLY
+	/* moved from 0x300 */
+data_access_check_stab:
+	GET_PACA(r13)
+	std	r9,PACA_EXSLB+EX_R9(r13)
+	std	r10,PACA_EXSLB+EX_R10(r13)
+	mfspr	r10,SPRN_DAR
+	mfspr	r9,SPRN_DSISR
+	srdi	r10,r10,60
+	rlwimi	r10,r9,16,0x20
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
+	rlwimi	r10,r9,8,0x300
+#endif
+	mfcr	r9
+	cmpwi	r10,0x2c
+	beq	do_stab_bolted_pSeries
+	mtcrf	0x80,r9
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	b	data_access_not_stab
+do_stab_bolted_pSeries:
+	std	r11,PACA_EXSLB+EX_R11(r13)
+	std	r12,PACA_EXSLB+EX_R12(r13)
+	GET_SCRATCH0(r10)
+	std	r10,PACA_EXSLB+EX_R13(r13)
+	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
+#endif /* CONFIG_POWER4_ONLY */
+
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
+
+	.align	7
+
 	/* moved from 0xe00 */
-	STD_EXCEPTION_HV(., 0xe00, h_data_storage)
-	STD_EXCEPTION_HV(., 0xe20, h_instr_storage)
-	STD_EXCEPTION_HV(., 0xe40, emulation_assist)
-	STD_EXCEPTION_HV(., 0xe60, hmi_exception) /* need to flush cache ? */
+	STD_EXCEPTION_HV(., 0xe02, h_data_storage)
+	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
+	STD_EXCEPTION_HV(., 0xe22, h_instr_storage)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
+	STD_EXCEPTION_HV(., 0xe42, emulation_assist)
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
+	STD_EXCEPTION_HV(., 0xe62, hmi_exception) /* need to flush cache ? */
+	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
+	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -317,14 +373,6 @@ masked_Hinterrupt:
 	hrfid
 	b	.
 
-	.align	7
-do_stab_bolted_pSeries:
-	std	r11,PACA_EXSLB+EX_R11(r13)
-	std	r12,PACA_EXSLB+EX_R12(r13)
-	GET_SCRATCH0(r10)
-	std	r10,PACA_EXSLB+EX_R13(r13)
-	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
-
 #ifdef CONFIG_PPC_PSERIES
 /*
  * Vectors for the FWNMI option.  Share common code.
@@ -334,14 +382,8 @@ do_stab_bolted_pSeries:
 system_reset_fwnmi:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD)
-
-	.globl machine_check_fwnmi
-      .align 7
-machine_check_fwnmi:
-	HMT_MEDIUM
-	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common, EXC_STD)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+				 NOTEST, 0x100)
 
 #endif /* CONFIG_PPC_PSERIES */
 
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 1a1b344..feb6545 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -40,37 +40,42 @@
 #define MSR_NOIRQ		MSR_KERNEL & ~(MSR_IR | MSR_DR)
 #define FUNC(name) 		GLUE(.,name)
 
-#elif defined(CONFIG_PPC_BOOK3S_32)
+kvmppc_skip_interrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_SRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_SRR0, r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
 
-#define LOAD_SHADOW_VCPU(reg)						\
-	mfspr	reg, SPRN_SPRG_THREAD;					\
-	lwz	reg, THREAD_KVM_SVCPU(reg);				\
-	/* PPC32 can have a NULL pointer - let's check for that */	\
-	mtspr   SPRN_SPRG_SCRATCH1, r12;	/* Save r12 */		\
-	mfcr	r12;							\
-	cmpwi	reg, 0;							\
-	bne	1f;							\
-	mfspr	reg, SPRN_SPRG_SCRATCH0;				\
-	mtcr	r12;							\
-	mfspr	r12, SPRN_SPRG_SCRATCH1;				\
-	b	kvmppc_resume_\intno;					\
-1:;									\
-	mtcr	r12;							\
-	mfspr	r12, SPRN_SPRG_SCRATCH1;				\
-	tophys(reg, reg)
+kvmppc_skip_Hinterrupt:
+	/*
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 */
+	mfspr	r13, SPRN_HSRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_HSRR0, r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+	
+#elif defined(CONFIG_PPC_BOOK3S_32)
 
 #define SHADOW_VCPU_OFF		0
 #define MSR_NOIRQ		MSR_KERNEL
 #define FUNC(name)		name
 
-#endif
-
 .macro INTERRUPT_TRAMPOLINE intno
 
 .global kvmppc_trampoline_\intno
 kvmppc_trampoline_\intno:
 
-	SET_SCRATCH0(r13)		/* Save r13 */
+	mtspr	SPRN_SPRG_SCRATCH0, r13		/* Save r13 */
 
 	/*
 	 * First thing to do is to find out if we're coming
@@ -78,19 +83,28 @@ kvmppc_trampoline_\intno:
 	 *
 	 * To distinguish, we check a magic byte in the PACA/current
 	 */
-	LOAD_SHADOW_VCPU(r13)
-	PPC_STL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+	mfspr	r13, SPRN_SPRG_THREAD
+	lwz	r13, THREAD_KVM_SVCPU(r13)
+	/* PPC32 can have a NULL pointer - let's check for that */
+	mtspr   SPRN_SPRG_SCRATCH1, r12		/* Save r12 */
 	mfcr	r12
+	cmpwi	r13, 0
+	bne	1f
+2:	mtcr	r12
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */
+	b	kvmppc_resume_\intno		/* Get back original handler */
+
+1:	tophys(r13, r13)
 	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	stw	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
 	lbz	r12, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
 	cmpwi	r12, KVM_GUEST_MODE_NONE
 	bne	..kvmppc_handler_hasmagic_\intno
 	/* No KVM guest? Then jump back to the Linux handler! */
 	lwz	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
-	mtcr	r12
-	PPC_LL	r12, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
-	GET_SCRATCH0(r13)			/* r13 = original r13 */
-	b	kvmppc_resume_\intno		/* Get back original handler */
+	b	2b
 
 	/* Now we know we're handling a KVM guest */
 ..kvmppc_handler_hasmagic_\intno:
@@ -112,9 +126,6 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_MACHINE_CHECK
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_STORAGE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL_HV
-#endif
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALIGNMENT
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PROGRAM
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_FP_UNAVAIL
@@ -124,14 +135,6 @@ INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_TRACE
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PERFMON
 INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
 
-/* Those are only available on 64 bit machines */
-
-#ifdef CONFIG_PPC_BOOK3S_64
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_SEGMENT
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_SEGMENT
-INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_VSX
-#endif
-
 /*
  * Bring us back to the faulting code, but skip the
  * faulting instruction.
@@ -163,6 +166,7 @@ kvmppc_handler_skip_ins:
 
 	/* And get back into the code */
 	RFI
+#endif
 
 /*
  * This trampoline brings us back to a real mode handler
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 4512642..4a623eb 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -125,6 +125,9 @@ kvmppc_handler_trampoline_enter_end:
 .global kvmppc_handler_trampoline_exit
 kvmppc_handler_trampoline_exit:
 
+.global kvmppc_interrupt
+kvmppc_interrupt:
+
 	/* Register usage at this point:
 	 *
 	 * SPRG_SCRATCH0  = guest R13
@@ -155,12 +158,16 @@ kvmppc_handler_trampoline_exit:
 	PPC_LL	r2, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
 
 	/* Save guest PC and MSR */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
 	andi.	r0,r12,0x2
 	beq	1f
 	mfspr	r3,SPRN_HSRR0
 	mfspr	r4,SPRN_HSRR1
 	andi.	r12,r12,0x3ffd
 	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
+#endif
 1:	mfsrr0	r3
 	mfsrr1	r4
 2:
diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S
index a67984c..fcec56a 100644
--- a/arch/powerpc/platforms/iseries/exception.S
+++ b/arch/powerpc/platforms/iseries/exception.S
@@ -156,7 +156,7 @@ BEGIN_FTR_SECTION
 	std	r12,PACA_EXGEN+EX_R13(r13)
 	EXCEPTION_PROLOG_ISERIES_1
 FTR_SECTION_ELSE
-	EXCEPTION_PROLOG_1(PACA_EXGEN)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0)
 	EXCEPTION_PROLOG_ISERIES_1
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_SLB)
 	b	data_access_common
diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h
index bae3fba..50271b5 100644
--- a/arch/powerpc/platforms/iseries/exception.h
+++ b/arch/powerpc/platforms/iseries/exception.h
@@ -39,7 +39,7 @@
 label##_iSeries:							\
 	HMT_MEDIUM;							\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */			\
-	EXCEPTION_PROLOG_1(area);					\
+	EXCEPTION_PROLOG_1(area, NOTEST, 0);				\
 	EXCEPTION_PROLOG_ISERIES_1;					\
 	b	label##_common
 
@@ -48,7 +48,7 @@ label##_iSeries:							\
 label##_iSeries:							\
 	HMT_MEDIUM;							\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */			\
-	EXCEPTION_PROLOG_1(PACA_EXGEN);					\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0);			\
 	lbz	r10,PACASOFTIRQEN(r13);					\
 	cmpwi	0,r10,0;						\
 	beq-	label##_iSeries_masked;					\
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 06/13] kvm/powerpc: Deliver program interrupts right away instead of queueing them
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (4 preceding siblings ...)
  2011-05-11 10:41 ` [PATCH 05/13] powerpc, kvm: Rework KVM checks in first-level interrupt handlers Paul Mackerras
@ 2011-05-11 10:42 ` Paul Mackerras
  2011-05-11 10:42 ` [PATCH 07/13] kvm/powerpc: Pass init/destroy vm and prepare/commit memory region ops down Paul Mackerras
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:42 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

Doing so means that we don't have to save the flags anywhere and gets
rid of the last reference to to_book3s(vcpu) in arch/powerpc/kvm/book3s.c.

Doing so is OK because a program interrupt won't be generated at the
same time as any other synchronous interrupt.  If a program interrupt
and an asynchronous interrupt (external or decrementer) are generated
at the same time, the program interrupt will be delivered, which is
correct because it has a higher priority, and then the asynchronous
interrupt will be masked.

We don't ever generate system reset or machine check interrupts to the
guest, but if we did, then we would need to make sure they got delivered
rather than the program interrupt.  The current code would be wrong in
this situation anyway since it would deliver the program interrupt as
well as the reset/machine check interrupt.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kvm/book3s.c |    8 +++-----
 1 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 224e6a8..729f799 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -128,8 +128,8 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
 
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
 {
-	to_book3s(vcpu)->prog_flags = flags;
-	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
 }
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
@@ -169,7 +169,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 {
 	int deliver = 1;
 	int vec = 0;
-	ulong flags = 0ULL;
 	bool crit = kvmppc_critical_section(vcpu);
 
 	switch (priority) {
@@ -205,7 +204,6 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 		break;
 	case BOOK3S_IRQPRIO_PROGRAM:
 		vec = BOOK3S_INTERRUPT_PROGRAM;
-		flags = to_book3s(vcpu)->prog_flags;
 		break;
 	case BOOK3S_IRQPRIO_VSX:
 		vec = BOOK3S_INTERRUPT_VSX;
@@ -236,7 +234,7 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
 #endif
 
 	if (deliver)
-		kvmppc_inject_interrupt(vcpu, vec, flags);
+		kvmppc_inject_interrupt(vcpu, vec, 0);
 
 	return deliver;
 }
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 07/13] kvm/powerpc: Pass init/destroy vm and prepare/commit memory region ops down
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (5 preceding siblings ...)
  2011-05-11 10:42 ` [PATCH 06/13] kvm/powerpc: Deliver program interrupts right away instead of queueing them Paul Mackerras
@ 2011-05-11 10:42 ` Paul Mackerras
  2011-05-11 10:43 ` [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code Paul Mackerras
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:42 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

This arranges for the top-level arch/powerpc/kvm/powerpc.c file to
pass down some of the calls it gets to the lower-level subarchitecture-
specific code.  The lower-level implementations (in booke.c and
book3s_pr.c) are no-ops.  The coming book3s_hv.c will need this.
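
For context, these hooks end up being driven by the generic KVM call
paths (not part of this diff), roughly:

	KVM_CREATE_VM              -> kvm_arch_init_vm()
	                              -> kvmppc_core_init_vm()
	KVM_SET_USER_MEMORY_REGION -> kvm_arch_prepare_memory_region()
	                              -> kvmppc_core_prepare_memory_region()
	                           then kvm_arch_commit_memory_region()
	                              -> kvmppc_core_commit_memory_region()
	VM teardown                -> kvm_arch_destroy_vm()
	                              -> kvmppc_core_destroy_vm()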

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_ppc.h |    7 +++++++
 arch/powerpc/kvm/book3s_pr.c       |   20 ++++++++++++++++++++
 arch/powerpc/kvm/booke.c           |   20 ++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c         |    9 ++++++---
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ecb3bc7..f3c218a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -109,6 +109,13 @@ extern void kvmppc_booke_exit(void);
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 
+extern int kvmppc_core_init_vm(struct kvm *kvm);
+extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 0eb66e5..08cedf0 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -984,6 +984,26 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int kvmppc_book3s_init(void)
 {
 	int r;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ef76acb..ee65ec4 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -620,6 +620,26 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	return -ENOTSUPP;
 }
 
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 int __init kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9975846..b6a2a04 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -147,7 +147,7 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm)
 {
-	return 0;
+	return kvmppc_core_init_vm(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -163,6 +163,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		kvm->vcpus[i] = NULL;
 
 	atomic_set(&kvm->online_vcpus, 0);
+
+	kvmppc_core_destroy_vm(kvm);
+
 	mutex_unlock(&kvm->lock);
 }
 
@@ -207,7 +210,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return 0;
+	return kvmppc_core_prepare_memory_region(kvm, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -215,7 +218,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_memory_slot old,
                int user_alloc)
 {
-       return;
+	kvmppc_core_commit_memory_region(kvm, mem);
 }
 
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (6 preceding siblings ...)
  2011-05-11 10:42 ` [PATCH 07/13] kvm/powerpc: Pass init/destroy vm and prepare/commit memory region ops down Paul Mackerras
@ 2011-05-11 10:43 ` Paul Mackerras
  2011-05-17 18:05     ` Marcelo Tosatti
  2011-05-11 10:44 ` [PATCH 09/13] powerpc: Set up LPCR for running guest partitions Paul Mackerras
                   ` (5 subsequent siblings)
  13 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:43 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

Instead of doing the kvm_guest_enter/exit() and local_irq_dis/enable()
calls in powerpc.c, this moves them down into the subarch-specific
book3s_pr.c and booke.c.  This eliminates an extra local_irq_enable()
call in book3s_pr.c, and will be needed when we add SMT4 guest support
in the book3s hypervisor-mode code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_ppc.h   |    1 +
 arch/powerpc/kvm/book3s_interrupts.S |    2 +-
 arch/powerpc/kvm/book3s_pr.c         |   12 ++++++------
 arch/powerpc/kvm/booke.c             |   13 +++++++++++++
 arch/powerpc/kvm/powerpc.c           |    6 +-----
 5 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index f3c218a..3210911 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -42,6 +42,7 @@ enum emulation_result {
 	EMULATE_AGAIN,        /* something went wrong. go again */
 };
 
+extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 2f0bc92..8c5e0e1 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -85,7 +85,7 @@
  *  r3: kvm_run pointer
  *  r4: vcpu pointer
  */
-_GLOBAL(__kvmppc_vcpu_entry)
+_GLOBAL(__kvmppc_vcpu_run)
 
 kvm_start_entry:
 	/* Write correct stack frame */
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 08cedf0..f769915 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -891,8 +891,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	vfree(vcpu_book3s);
 }
 
-extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int ret;
 	double fpr[32][TS_FPRWIDTH];
@@ -944,14 +943,15 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	/* Remember the MSR with disabled extensions */
 	ext_msr = current->thread.regs->msr;
 
-	/* XXX we get called with irq disabled - change that! */
-	local_irq_enable();
-
 	/* Preload FPU if it's enabled */
 	if (vcpu->arch.shared->msr & MSR_FP)
 		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
 
-	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
+	kvm_guest_enter();
+
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+	kvm_guest_exit();
 
 	local_irq_disable();
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee65ec4..403e17c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -257,6 +257,19 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 		vcpu->arch.shared->int_pending = 0;
 }
 
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	local_irq_disable();
+	kvm_guest_enter();
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+	kvm_guest_exit();
+	local_irq_enable();
+
+	return ret;
+}
+
 /**
  * kvmppc_handle_exit
  *
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b6a2a04..4b54148 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -478,11 +478,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	kvmppc_core_deliver_interrupts(vcpu);
 
-	local_irq_disable();
-	kvm_guest_enter();
-	r = __kvmppc_vcpu_run(run, vcpu);
-	kvm_guest_exit();
-	local_irq_enable();
+	r = kvmppc_vcpu_run(run, vcpu);
 
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 09/13] powerpc: Set up LPCR for running guest partitions
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (7 preceding siblings ...)
  2011-05-11 10:43 ` [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code Paul Mackerras
@ 2011-05-11 10:44 ` Paul Mackerras
  2011-05-11 10:44 ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:44 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

In hypervisor mode, the LPCR controls several aspects of guest
partitions, including the virtual partition memory mode, and also
controls whether hypervisor decrementer interrupts are enabled.  This
sets up the LPCR at boot time so that guest partitions will use a
virtual real memory area (VRMA) composed of 16MB large pages, and so
that hypervisor decrementer interrupts are disabled.
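
For reference, the rldimi sequence in the diff below just inserts small
fields into LPCR at fixed bit positions; a rough C equivalent
(illustrative only, not part of the patch -- the field masks other than
LPCR_LPES and LPCR_HDICE are written out by hand here):

	unsigned long lpcr = mfspr(SPRN_LPCR);

	lpcr = (lpcr & ~(unsigned long)LPCR_LPES) | (1ul << LPCR_LPES_SH); /* LPES = 0b01 */
	lpcr |= LPCR_PECE0 | LPCR_PECE1 | LPCR_PECE2;                      /* PECE = 0b111 */
	lpcr = (lpcr & ~(7ul << LPCR_DPFD_SH)) | (4ul << LPCR_DPFD_SH);    /* DPFD = 4 */
	lpcr &= ~(unsigned long)LPCR_HDICE;                                /* HDICE = 0 */
	lpcr = (lpcr & ~(7ul << LPCR_VC_SH)) | (4ul << LPCR_VC_SH);        /* VPM0=1, VPM1=0, ISL=0 */
	lpcr = (lpcr & ~(0x1ful << LPCR_VRMASD_SH))
	       | (0x10ul << LPCR_VRMASD_SH);                               /* VRMASD: L=1, LP=00 */
	mtspr(SPRN_LPCR, lpcr);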

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/reg.h         |    4 ++++
 arch/powerpc/kernel/cpu_setup_power7.S |   18 +++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 05658b7..c07b7be 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -231,10 +231,12 @@
 #define   LPCR_VPM0	(1ul << (63-0))
 #define   LPCR_VPM1	(1ul << (63-1))
 #define   LPCR_ISL	(1ul << (63-2))
+#define   LPCR_VC_SH	(63-2)
 #define   LPCR_DPFD_SH	(63-11)
 #define   LPCR_VRMA_L	(1ul << (63-12))
 #define   LPCR_VRMA_LP0	(1ul << (63-15))
 #define   LPCR_VRMA_LP1	(1ul << (63-16))
+#define   LPCR_VRMASD_SH (63-16)
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
@@ -242,8 +244,10 @@
 #define     LPCR_PECE1	0x00002000	/* decrementer can cause exit */
 #define     LPCR_PECE2	0x00001000	/* machine check etc can cause exit */
 #define   LPCR_MER	0x00000800	/* Mediated External Exception */
+#define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
+#define   LPCR_LPES_SH	2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S
index 4f9a93f..2ef6749 100644
--- a/arch/powerpc/kernel/cpu_setup_power7.S
+++ b/arch/powerpc/kernel/cpu_setup_power7.S
@@ -61,19 +61,23 @@ __init_LPCR:
 	 *   LPES = 0b01 (HSRR0/1 used for 0x500)
 	 *   PECE = 0b111
 	 *   DPFD = 4
+	 *   HDICE = 0
+	 *   VC = 0b100 (VPM0=1, VPM1=0, ISL=0)
+	 *   VRMASD = 0b10000 (L=1, LP=00)
 	 *
 	 * Other bits untouched for now
 	 */
 	mfspr	r3,SPRN_LPCR
-	ori	r3,r3,(LPCR_LPES0|LPCR_LPES1)
-	xori	r3,r3, LPCR_LPES0
+	li	r5,1
+	rldimi	r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
 	ori	r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
-	li	r5,7
-	sldi	r5,r5,LPCR_DPFD_SH
-	andc	r3,r3,r5
 	li	r5,4
-	sldi	r5,r5,LPCR_DPFD_SH
-	or	r3,r3,r5
+	rldimi	r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
+	clrrdi	r3,r3,1		/* clear HDICE */
+	li	r5,4
+	rldimi	r3,r5, LPCR_VC_SH, 0
+	li	r5,0x10
+	rldimi	r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
 	mtspr	SPRN_LPCR,r3
 	isync
 	blr
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (8 preceding siblings ...)
  2011-05-11 10:44 ` [PATCH 09/13] powerpc: Set up LPCR for running guest partitions Paul Mackerras
@ 2011-05-11 10:44 ` Paul Mackerras
  2011-05-12  9:07     ` Avi Kivity
  2011-05-15 21:58     ` Alexander Graf
  2011-05-11 10:45 ` [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel Paul Mackerras
                   ` (3 subsequent siblings)
  13 siblings, 2 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:44 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode.  Using hypervisor mode means
that the guest can use the processor's supervisor mode.  That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host.  This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.

This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses.  That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification.  In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
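
As a purely hypothetical sketch of the userspace side (the field names
on kvm_run here are my assumption for illustration; the authoritative
layout is whatever the include/linux/kvm.h hunk of this patch defines):

	ioctl(vcpu_fd, KVM_RUN, 0);
	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
		/* emulate the hcall identified by run->papr_hcall.nr,
		 * using run->papr_hcall.args[], then store the return
		 * code in run->papr_hcall.ret before the next KVM_RUN */
	}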

Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
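
Concretely, with the Kconfig changes below one configures either

	CONFIG_KVM_BOOK3S_64_HV=y

or

	CONFIG_KVM_BOOK3S_64_NONHV=m	(or =y)

but not both, since KVM_BOOK3S_64_NONHV depends on !KVM_BOOK3S_64_HV.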

This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.

With the guest running in supervisor mode, most exceptions go straight
to the guest.  We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.

We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.

In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount.  Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.

The POWER7 processor has a restriction that all threads in a core have
to be in the same partition.  MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest.  At present we require the host and guest to run
in single-thread mode because of this hardware restriction.

This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management.  This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
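
To make the sizing in kvmppc_alloc_hpt() below concrete (my arithmetic,
not part of the patch text): HPT_ORDER = 24 gives a 16MB hash table,
i.e. 2^(24-7) = 128K HPTEGs of 128 bytes each, and since SDR1's
HTABSIZE field encodes log2(table size) - 18, the code stores

	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);	/* HTABSIZE = 6 */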

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/exception-64s.h  |   27 +-
 arch/powerpc/include/asm/kvm_asm.h        |    4 +
 arch/powerpc/include/asm/kvm_book3s.h     |  148 ++++++-
 arch/powerpc/include/asm/kvm_book3s_asm.h |    4 +-
 arch/powerpc/include/asm/kvm_booke.h      |    4 +
 arch/powerpc/include/asm/kvm_host.h       |   60 +++-
 arch/powerpc/include/asm/kvm_ppc.h        |    6 +
 arch/powerpc/include/asm/mmu-hash64.h     |   10 +-
 arch/powerpc/include/asm/paca.h           |   10 +
 arch/powerpc/include/asm/reg.h            |    4 +
 arch/powerpc/kernel/asm-offsets.c         |   95 ++++-
 arch/powerpc/kernel/exceptions-64s.S      |   60 ++--
 arch/powerpc/kvm/Kconfig                  |   40 ++-
 arch/powerpc/kvm/Makefile                 |   16 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c       |  258 +++++++++++
 arch/powerpc/kvm/book3s_hv.c              |  413 ++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  326 ++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  663 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c                |   23 +-
 arch/powerpc/kvm/trace.h                  |    2 +-
 include/linux/kvm.h                       |    6 +
 21 files changed, 2094 insertions(+), 85 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv.c
 create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
 create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 2a770d8..d32e1ef 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -65,14 +65,14 @@
 	GET_PACA(r13);							\
 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
 	std	r10,area+EX_R10(r13);					\
-	mfcr	r9;							\
-	extra(vec);							\
-	std	r11,area+EX_R11(r13);					\
-	std	r12,area+EX_R12(r13);					\
 	BEGIN_FTR_SECTION_NESTED(66);					\
 	mfspr	r10,SPRN_CFAR;						\
 	std	r10,area+EX_CFAR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
+	mfcr	r9;							\
+	extra(vec);							\
+	std	r11,area+EX_R11(r13);					\
+	std	r12,area+EX_R12(r13);					\
 	GET_SCRATCH0(r10);						\
 	std	r10,area+EX_R13(r13)
 #define EXCEPTION_PROLOG_1(area, extra, vec)				\
@@ -134,6 +134,17 @@ do_kvm_##n:								\
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_NONHV
+#define KVMTEST_NONHV(n)		__KVMTEST(n)
+#define KVM_HANDLER_NONHV(area, h, n)	__KVM_HANDLER(area, h, n)
+#define KVM_HANDLER_NONHV_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
+
+#else
+#define KVMTEST_NONHV(n)
+#define KVM_HANDLER_NONHV(area, h, n)
+#define KVM_HANDLER_NONHV_SKIP(area, h, n)
+#endif
+
 #define NOTEST(n)
 
 /*
@@ -210,7 +221,7 @@ label##_pSeries:					\
 	HMT_MEDIUM;					\
 	SET_SCRATCH0(r13);		/* save r13 */		\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST, vec)
+				 EXC_STD, KVMTEST_NONHV, vec)
 
 #define STD_EXCEPTION_HV(loc, vec, label)		\
 	. = loc;					\
@@ -227,8 +238,8 @@ label##_hv:						\
 	beq	masked_##h##interrupt
 #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
 
-#define SOFTEN_TEST(vec)						\
-	KVMTEST(vec);							\
+#define SOFTEN_TEST_NONHV(vec)						\
+	KVMTEST_NONHV(vec);						\
 	_SOFTEN_TEST(EXC_STD)
 
 #define SOFTEN_TEST_HV(vec)						\
@@ -248,7 +259,7 @@ label##_hv:						\
 	.globl label##_pSeries;						\
 label##_pSeries:							\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
-				    EXC_STD, SOFTEN_TEST)
+				    EXC_STD, SOFTEN_TEST_NONHV)
 
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
 	. = loc;							\
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 0951b17..7b1f0e0 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -64,8 +64,12 @@
 #define BOOK3S_INTERRUPT_PROGRAM	0x700
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
 #define BOOK3S_INTERRUPT_DECREMENTER	0x900
+#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
 #define BOOK3S_INTERRUPT_SYSCALL	0xc00
 #define BOOK3S_INTERRUPT_TRACE		0xd00
+#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
+#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
+#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 12829bb..5b76073 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -117,6 +117,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
 extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -128,10 +129,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
 extern int kvmppc_mmu_hpte_sysinit(void);
 extern void kvmppc_mmu_hpte_sysexit(void);
+extern int kvmppc_mmu_hv_init(void);
 
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
 			   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
@@ -152,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
 }
 
+extern void kvm_return_point(void);
+
+/* Also add subarch specific defines */
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+#include <asm/kvm_book3s_32.h>
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/kvm_book3s_64.h>
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_NONHV
+
 #define vcpu_guest_state(vcpu)	((vcpu)->arch.shared)
 
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
@@ -168,16 +184,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
 		vcpu_guest_state(vcpu)->int_pending = 0;
 }
 
-static inline ulong dsisr(void)
-{
-	ulong r;
-	asm ( "mfdsisr %0 " : "=r" (r) );
-	return r;
-}
-
-extern void kvm_return_point(void);
-static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
-
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	if ( num < 14 ) {
@@ -265,6 +271,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return to_svcpu(vcpu)->fault_dar;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->msr;
+}
+
 static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 {
 	ulong crit_raw = vcpu_guest_state(vcpu)->critical;
@@ -284,6 +295,115 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 	return crit;
 }
+#else /* CONFIG_KVM_BOOK3S_NONHV */
+
+#define vcpu_guest_state(vcpu)	(&(vcpu)->arch)
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	/* Recalculate LPCR:MER based on the presence of
+	 * a pending external interrupt
+	 */
+	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions) ||
+	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions))
+		vcpu->arch.lpcr |= LPCR_MER;
+	else
+		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
+}
+
+static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
+{
+	vcpu->arch.gpr[num] = val;
+}
+
+static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
+{
+	return vcpu->arch.gpr[num];
+}
+
+static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.cr = val;
+}
+
+static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr;
+}
+
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+{
+	vcpu->arch.xer = val;
+}
+
+static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.xer;
+}
+
+static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.ctr = val;
+}
+
+static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.ctr;
+}
+
+static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.lr = val;
+}
+
+static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.lr;
+}
+
+static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
+{
+	vcpu->arch.pc = val;
+}
+
+static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pc;
+}
+
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* Load the instruction manually if it failed to do so in the
+	 * exit path */
+	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
+		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
+
+	return vcpu->arch.last_inst;
+}
+
+static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.fault_dar;
+}
+
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.msr;
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+#endif
 
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
@@ -292,12 +412,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
-/* Also add subarch specific defines */
-
-#ifdef CONFIG_PPC_BOOK3S_32
-#include <asm/kvm_book3s_32.h>
-#else
-#include <asm/kvm_book3s_64.h>
-#endif
-
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d5a8a38..d7279f5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -61,6 +61,7 @@ kvmppc_resume_\intno:
 #else  /*__ASSEMBLY__ */
 
 struct kvmppc_book3s_shadow_vcpu {
+#ifdef CONFIG_KVM_BOOK3S_NONHV
 	ulong gpr[14];
 	u32 cr;
 	u32 xer;
@@ -72,6 +73,7 @@ struct kvmppc_book3s_shadow_vcpu {
 	ulong pc;
 	ulong shadow_srr1;
 	ulong fault_dar;
+#endif
 
 	ulong host_r1;
 	ulong host_r2;
@@ -84,7 +86,7 @@ struct kvmppc_book3s_shadow_vcpu {
 #ifdef CONFIG_PPC_BOOK3S_32
 	u32     sr[16];			/* Guest SRs */
 #endif
-#ifdef CONFIG_PPC_BOOK3S_64
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_NONHV)
 	u8 slb_max;			/* highest used guest slb entry */
 	struct  {
 		u64     esid;
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index 9c9ba3d..a90e091 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 	return vcpu->arch.fault_dear;
 }
 
+static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->msr;
+}
 #endif /* __ASM_KVM_BOOKE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3ebe51b..ec62365 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -33,7 +33,9 @@
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 
+#ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#endif
 
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
@@ -133,7 +135,24 @@ struct kvmppc_exit_timing {
 	};
 };
 
+struct kvmppc_pginfo {
+	unsigned long pfn;
+	atomic_t refcnt;
+};
+
 struct kvm_arch {
+	unsigned long hpt_virt;
+	unsigned long ram_npages;
+	unsigned long ram_psize;
+	unsigned long ram_porder;
+	struct kvmppc_pginfo *ram_pginfo;
+	unsigned int lpid;
+	unsigned int host_lpid;
+	unsigned long host_lpcr;
+	unsigned long sdr1;
+	unsigned long host_sdr1;
+	int tlbie_lock;
+	unsigned short last_vcpu[NR_CPUS];
 };
 
 struct kvmppc_pte {
@@ -190,7 +209,7 @@ struct kvm_vcpu_arch {
 	ulong rmcall;
 	ulong host_paca_phys;
 	struct kvmppc_slb slb[64];
-	int slb_max;		/* # valid entries in slb[] */
+	int slb_max;		/* 1 + index of last valid entry in slb[] */
 	int slb_nr;		/* total number of entries in SLB */
 	struct kvmppc_mmu mmu;
 #endif
@@ -204,9 +223,10 @@ struct kvm_vcpu_arch {
 	vector128 vr[32];
 	vector128 vscr;
 #endif
+	u32 vrsave;
 
 #ifdef CONFIG_VSX
-	u64 vsr[32];
+	u64 vsr[64];
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S
@@ -214,29 +234,45 @@ struct kvm_vcpu_arch {
 	u32 qpr[32];
 #endif
 
-#ifdef CONFIG_BOOKE
-	ulong pc;
 	ulong ctr;
 	ulong lr;
 
 	ulong xer;
 	u32 cr;
-#endif
+
+	ulong pc;
+	ulong msr;
 
 #ifdef CONFIG_PPC_BOOK3S
 	ulong shadow_msr;
 	ulong hflags;
 	ulong guest_owned_ext;
+	ulong purr;
+	ulong spurr;
+	ulong lpcr;
+	ulong dscr;
+	ulong amr;
+	ulong uamor;
+	u32 ctrl;
+	u32 dsisr;
+	ulong dabr;
 #endif
 	u32 mmucr;
+	ulong sprg0;
+	ulong sprg1;
+	ulong sprg2;
+	ulong sprg3;
 	ulong sprg4;
 	ulong sprg5;
 	ulong sprg6;
 	ulong sprg7;
+	ulong srr0;
+	ulong srr1;
 	ulong csrr0;
 	ulong csrr1;
 	ulong dsrr0;
 	ulong dsrr1;
+	ulong dear;
 	ulong esr;
 	u32 dec;
 	u32 decar;
@@ -259,6 +295,9 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 	u32 dbsr;
 
+	u64 mmcr[3];
+	u32 pmc[6];
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct kvmppc_exit_timing timing_exit;
 	struct kvmppc_exit_timing timing_last_enter;
@@ -272,8 +311,12 @@ struct kvm_vcpu_arch {
 	struct dentry *debugfs_exit_timing;
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S
+	ulong fault_dar;
+	u32 fault_dsisr;
+#endif
+
 #ifdef CONFIG_BOOKE
-	u32 last_inst;
 	ulong fault_dear;
 	ulong fault_esr;
 	ulong queued_dear;
@@ -288,13 +331,18 @@ struct kvm_vcpu_arch {
 	u8 dcr_is_write;
 	u8 osi_needed;
 	u8 osi_enabled;
+	u8 hcall_needed;
 
 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
 	struct hrtimer dec_timer;
 	struct tasklet_struct tasklet;
 	u64 dec_jiffies;
+	u64 dec_expires;
 	unsigned long pending_exceptions;
+	u16 last_cpu;
+	u32 last_inst;
+	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 3210911..cd9ad96 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -110,6 +110,12 @@ extern void kvmppc_booke_exit(void);
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
 
+extern long kvmppc_alloc_hpt(struct kvm *kvm);
+extern void kvmppc_free_hpt(struct kvm *kvm);
+extern long kvmppc_prepare_vrma(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem);
+extern void kvmppc_map_vrma(struct kvm *kvm,
+			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index ae7b3ef..0bb3fc1 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -90,13 +90,19 @@ extern char initial_stab[];
 
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
+#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT	12
-#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
-#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
+#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
 #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
 #define HPTE_R_N		ASM_CONST(0x0000000000000004)
+#define HPTE_R_G		ASM_CONST(0x0000000000000008)
+#define HPTE_R_M		ASM_CONST(0x0000000000000010)
+#define HPTE_R_I		ASM_CONST(0x0000000000000020)
+#define HPTE_R_W		ASM_CONST(0x0000000000000040)
+#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
 #define HPTE_R_C		ASM_CONST(0x0000000000000080)
 #define HPTE_R_R		ASM_CONST(0x0000000000000100)
+#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
 
 #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
 #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7412676..8dba5f6 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -149,6 +149,16 @@ struct paca_struct {
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
 	/* We use this to store guest state in */
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	struct kvm_vcpu *kvm_vcpu;
+	u64 dabr;
+	u64 host_mmcr[3];
+	u32 host_pmc[6];
+	u64 host_purr;
+	u64 host_spurr;
+	u64 host_dscr;
+	u64 dec_expires;
+#endif
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c07b7be..0036977 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -189,6 +189,10 @@
 #define SPRN_CTR	0x009	/* Count Register */
 #define SPRN_DSCR	0x11
 #define SPRN_CFAR	0x1c	/* Come From Address Register */
+#define SPRN_AMR	0x1d	/* Authority Mask Register */
+#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
+#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
+#define SPRN_RWMR	885
 #define SPRN_CTRLF	0x088
 #define SPRN_CTRLT	0x098
 #define   CTRL_CT	0xc0000000	/* current thread */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6887661..49e97fd 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -187,6 +187,7 @@ int main(void)
 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
+	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
@@ -200,9 +201,17 @@ int main(void)
 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
-	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
-	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
+	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
+	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
+	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
+	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, host_spurr));
+	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
+	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
+	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, dec_expires));
 #endif
+#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
 #endif /* CONFIG_PPC64 */
 
 	/* RTAS */
@@ -396,6 +405,28 @@ int main(void)
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
+	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+#ifdef CONFIG_ALTIVEC
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
+	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
+#endif
+#ifdef CONFIG_VSX
+	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+#endif
+	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
+	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
+	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
+	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
+	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
+	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
+	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.srr0));
+	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.srr1));
+	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.sprg0));
+	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.sprg1));
+	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.sprg2));
+	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.sprg3));
 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
@@ -406,16 +437,65 @@ int main(void)
 
 	/* book3s */
 #ifdef CONFIG_PPC_BOOK3S
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
+	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
+	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
+	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
+	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
+	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
+	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
+	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
+	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
+	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
+	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.dsisr));
+	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.dear));
+	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
+	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
+	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
+	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
+	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
+	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
+	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
+	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
+	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
+	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
+	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
+					scratch0));
+	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
+					scratch1));
+	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
+					in_guest));
+	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
+	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
+	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_KVM_BOOK3S_NONHV
+#ifdef CONFIG_PPC64
+	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
+	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
+#endif
+	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
+					 vmhandler));
 	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
 	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
 	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
@@ -435,16 +515,6 @@ int main(void)
 	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
 	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
 	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
-	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
-	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
-	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					 vmhandler));
-	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch0));
-	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					scratch1));
-	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
-					in_guest));
 	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
 					   fault_dsisr));
 	DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
@@ -453,6 +523,7 @@ int main(void)
 					 last_inst));
 	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
 					   shadow_srr1));
+#endif /* CONFIG_KVM_BOOK3S_NONHV */
 #ifdef CONFIG_PPC_BOOK3S_32
 	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
 #endif
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index cbdf374..80c6456 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -87,14 +87,14 @@ data_access_not_stab:
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
 #endif
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST, 0x300)
+				 KVMTEST_NONHV, 0x300)
 
 	. = 0x380
 	.globl data_access_slb_pSeries
 data_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 #ifdef __DISABLED__
@@ -125,7 +125,7 @@ data_access_slb_pSeries:
 instruction_access_slb_pSeries:
 	HMT_MEDIUM
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x480)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
 #ifdef __DISABLED__
@@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
 hardware_interrupt_pSeries:
 hardware_interrupt_hv:
 	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST)
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	FTR_SECTION_ELSE
 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
 					    EXC_HV, SOFTEN_TEST_HV)
 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
-	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
+	FTR_SECTION_ELSE
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
+					    EXC_STD, SOFTEN_TEST_NONHV)
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
 
 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x600)
 
 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x700)
 
 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x800)
 
 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
 
 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xa00)
 
 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xb00)
 
 	. = 0xc00
 	.globl	system_call_pSeries
@@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 	b	.
 
 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xd00)
 
 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
 	 * out of line to handle them
@@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
 #endif /* CONFIG_CBE_RAS */
 
 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x1700)
 
 #ifdef CONFIG_CBE_RAS
 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
 #endif /* CONFIG_CBE_RAS */
 
 	. = 0x3000
@@ -297,7 +297,7 @@ data_access_check_stab:
 	mfspr	r9,SPRN_DSISR
 	srdi	r10,r10,60
 	rlwimi	r10,r9,16,0x20
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_NONHV
 	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
 	rlwimi	r10,r9,8,0x300
 #endif
@@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
 #endif /* CONFIG_POWER4_ONLY */
 
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x300)
+	KVM_HANDLER_NONHV_SKIP(PACA_EXSLB, EXC_STD, 0x380)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x400)
+	KVM_HANDLER_NONHV(PACA_EXSLB, EXC_STD, 0x480)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x900)
 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
 
 	.align	7
@@ -337,11 +337,11 @@ do_stab_bolted_pSeries:
 
 	/* moved from 0xf00 */
 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf00)
 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf20)
 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
+	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf40)
 
 /*
  * An interrupt came in while soft-disabled; clear EE in SRR1,
@@ -418,7 +418,11 @@ slb_miss_user_pseries:
 /* KVM's trampoline code needs to be close to the interrupt handlers */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#ifdef CONFIG_KVM_BOOK3S_NONHV
 #include "../kvm/book3s_rmhandlers.S"
+#else
+#include "../kvm/book3s_hv_rmhandlers.S"
+#endif
 #endif
 
 	.align	7
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b7baff7..6ff191b 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ config KVM
 	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
-	select KVM_MMIO
 
 config KVM_BOOK3S_HANDLER
 	bool
@@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
 config KVM_BOOK3S_32_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
 
 config KVM_BOOK3S_64_HANDLER
 	bool
 	select KVM_BOOK3S_HANDLER
 
+config KVM_BOOK3S_NONHV
+	bool
+	select KVM_MMIO
+
 config KVM_BOOK3S_32
 	tristate "KVM support for PowerPC book3s_32 processors"
 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
 	select KVM
 	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_NONHV
 	---help---
 	  Support running unmodified book3s_32 guest kernels
 	  in virtual machines on book3s_32 host processors.
@@ -48,10 +53,38 @@ config KVM_BOOK3S_32
 	  If unsure, say N.
 
 config KVM_BOOK3S_64
-	tristate "KVM support for PowerPC book3s_64 processors"
+	bool
+	select KVM_BOOK3S_64_HANDLER
+
+config KVM_BOOK3S_64_HV
+	bool "KVM support for POWER7 using hypervisor mode in host"
 	depends on EXPERIMENTAL && PPC_BOOK3S_64
 	select KVM
-	select KVM_BOOK3S_64_HANDLER
+	select KVM_BOOK3S_64
+	---help---
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 processors that have hypervisor
+	  mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 (or later) processors, and can only emulate
+	  POWER5+, POWER6 and POWER7 processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_NONHV
+	tristate "KVM support for PowerPC book3s_64 processors"
+	depends on EXPERIMENTAL && PPC_BOOK3S_64 && !KVM_BOOK3S_64_HV
+	select KVM
+	select KVM_BOOK3S_64
+	select KVM_BOOK3S_NONHV
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
@@ -65,6 +98,7 @@ config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
 	select KVM
+	select KVM_MMIO
 	---help---
 	  Support running unmodified 440 guest kernels in virtual machines on
 	  440 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index bf9854f..37c1a60 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -14,7 +14,7 @@ CFLAGS_emulate.o  := -I.
 
 common-objs-y += powerpc.o emulate.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
-obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
+obj-$(CONFIG_KVM_BOOK3S_NONHV) += book3s_exports.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
@@ -38,7 +38,7 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
-kvm-book3s_64-objs := \
+kvm-book3s_64_nonhv-objs := \
 	$(common-objs-y) \
 	fpu.o \
 	book3s_paired_singles.o \
@@ -50,7 +50,17 @@ kvm-book3s_64-objs := \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
-kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+kvm-objs-$(CONFIG_KVM_BOOK3S_64_NONHV) := $(kvm-book3s_64_nonhv-objs)
+
+kvm-book3s_64_hv-objs := \
+	../../../virt/kvm/kvm_main.o \
+	powerpc.o \
+	emulate.o \
+	book3s.o \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 0000000..52d1be1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,258 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+/* Pages in the VRMA are 16MB pages */
+#define VRMA_PAGE_ORDER	24
+#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
+
+#define NR_LPIDS	1024
+unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
+
+long kvmppc_alloc_hpt(struct kvm *kvm)
+{
+	unsigned long hpt;
+	unsigned long lpid;
+
+	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
+			       HPT_ORDER - PAGE_SHIFT);
+	if (!hpt) {
+		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
+		return -ENOMEM;
+	}
+	kvm->arch.hpt_virt = hpt;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
+		if (lpid >= NR_LPIDS) {
+			pr_err("kvm_alloc_hpt: No LPIDs free\n");
+			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
+	kvm->arch.lpid = lpid;
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	kvm->arch.host_lpid = mfspr(SPRN_LPID);
+	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
+
+	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
+	return 0;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvmppc_pginfo *pginfo;
+
+	clear_bit(kvm->arch.lpid, lpid_inuse);
+	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
+
+	if (kvm->arch.ram_pginfo) {
+		pginfo = kvm->arch.ram_pginfo;
+		kvm->arch.ram_pginfo = NULL;
+		for (i = 0; i < kvm->arch.ram_npages; ++i)
+			put_page(pfn_to_page(pginfo[i].pfn));
+		kfree(pginfo);
+	}
+}
+
+static unsigned long user_page_size(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long size = PAGE_SIZE;
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, addr);
+	if (vma)
+		size = vma_kernel_pagesize(vma);
+	up_read(&current->mm->mmap_sem);
+	return size;
+}
+
+static pfn_t hva_to_pfn(unsigned long addr)
+{
+	struct page *page[1];
+	int npages;
+
+	might_sleep();
+
+	npages = get_user_pages_fast(addr, 1, 1, page);
+
+	if (unlikely(npages != 1))
+		return 0;
+
+	return page_to_pfn(page[0]);
+}
+
+long kvmppc_prepare_vrma(struct kvm *kvm,
+			 struct kvm_userspace_memory_region *mem)
+{
+	unsigned long psize, porder;
+	unsigned long i, npages;
+	struct kvmppc_pginfo *pginfo;
+	pfn_t pfn;
+	unsigned long hva;
+
+	/* First see what page size we have */
+	psize = user_page_size(mem->userspace_addr);
+	/* For now, only allow 16MB pages */
+	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
+		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
+		       psize, mem->memory_size, mem->userspace_addr);
+		return -EINVAL;
+	}
+	porder = __ilog2(psize);
+
+	npages = mem->memory_size >> porder;
+	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
+	if (!pginfo) {
+		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
+		       npages * sizeof(struct kvmppc_pginfo));
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		hva = mem->userspace_addr + (i << porder);
+		if (user_page_size(hva) != psize)
+			goto err;
+		pfn = hva_to_pfn(hva);
+		if (pfn == 0) {
+			pr_err("oops, no pfn for hva %lx\n", hva);
+			goto err;
+		}
+		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
+			pr_err("oops, unaligned pfn %llx\n", pfn);
+			put_page(pfn_to_page(pfn));
+			goto err;
+		}
+		pginfo[i].pfn = pfn;
+	}
+
+	kvm->arch.ram_npages = npages;
+	kvm->arch.ram_psize = psize;
+	kvm->arch.ram_porder = porder;
+	kvm->arch.ram_pginfo = pginfo;
+
+	return 0;
+
+ err:
+	kfree(pginfo);
+	return -EINVAL;
+}
+
+void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
+{
+	unsigned long i;
+	unsigned long npages = kvm->arch.ram_npages;
+	unsigned long pfn;
+	unsigned long *hpte;
+	unsigned long hash;
+	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
+
+	if (!pginfo)
+		return;
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - kvm->arch.ram_porder))
+		npages = 1ul << (40 - kvm->arch.ram_porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > HPT_NPTEG)
+		npages = HPT_NPTEG;
+
+	for (i = 0; i < npages; ++i) {
+		pfn = pginfo[i].pfn;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
+		hpte += 7 * 2;
+		/* HPTE low word - RPN, protection, etc. */
+		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
+			HPTE_R_M | PP_RWXX;
+		wmb();
+		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
+			HPTE_V_LARGE | HPTE_V_VALID;
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+	set_bit(mfspr(SPRN_LPID), lpid_inuse);
+	set_bit(NR_LPIDS - 1, lpid_inuse);
+
+	return 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				struct kvmppc_pte *gpte, bool data)
+{
+	return -ENOENT;
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 0000000..f6b7cd1
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	local_paca->kvm_vcpu = vcpu;
+	vcpu->cpu = cpu;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->cpu = -1;
+}
+
+void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
+{
+	u64 now;
+	unsigned long dec_nsec;
+
+	now = get_tb();
+	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_queue_dec(vcpu);
+	if (vcpu->arch.pending_exceptions)
+		return;
+	if (vcpu->arch.dec_expires != ~(u64)0) {
+		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
+			tb_ticks_per_sec;
+		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+			      HRTIMER_MODE_REL);
+	}
+
+	kvm_vcpu_block(vcpu);
+	vcpu->stat.halt_wakeup++;
+
+	if (vcpu->arch.dec_expires != ~(u64)0)
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.msr = msr;
+}
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+	kvmppc_mmu_book3s_hv_init(vcpu);
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16lx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16lx srr1 = %.16lx\n",
+	       vcpu->arch.srr0, vcpu->arch.srr1);
+	pr_err("sprg0 = %.16lx sprg1 = %.16lx\n",
+	       vcpu->arch.sprg0, vcpu->arch.sprg1);
+	pr_err("sprg2 = %.16lx sprg3 = %.16lx\n",
+	       vcpu->arch.sprg2, vcpu->arch.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.dsisr);
+	pr_err("dar = %.16lx\n", vcpu->arch.dear);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
+		vcpu->arch.dear = vcpu->arch.fault_dar;
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
+					0x08000000);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		kvmppc_core_queue_program(vcpu, 0x80000);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%lx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.msr);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+		if (signal_pending(tsk)) {
+			vcpu->stat.signal_exits++;
+			run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+		} else {
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+
+	sregs->pvr = vcpu->arch.pvr;
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	kvmppc_set_pvr(vcpu, sregs->pvr);
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	if (cpu_has_feature(CPU_FTR_HVMODE_206))
+		return 0;
+	return -EIO;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long lpcr;
+
+	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.last_cpu = -1;
+	vcpu->arch.host_msr = mfmsr();
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to book3s_64 (power7) */
+	vcpu->arch.pvr = 0x3f0200;
+	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+
+	/* remember where some real-mode handlers are */
+	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
+	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
+	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
+
+	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
+	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
+	vcpu->arch.lpcr = lpcr;
+
+	return vcpu;
+
+free_vcpu:
+	kfree(vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	preempt_disable();
+
+	kvm_guest_enter();
+
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	/* cancel pending dec exception if dec is positive */
+	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
+		kvmppc_core_dequeue_dec(vcpu);
+
+	return kvmppc_handle_exit(run, vcpu, current);
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		return kvmppc_prepare_vrma(kvm, mem);
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem)
+{
+	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
+		kvmppc_map_vrma(kvm, mem);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	long r;
+
+	/* Allocate hashed page table */
+	r = kvmppc_alloc_hpt(kvm);
+
+	return r;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_hpt(kvm);
+}
+
+/* These are stubs for now */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	return EMULATE_FAIL;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_book3s_hv_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hv_init();
+
+	return r;
+}
+
+static void kvmppc_book3s_hv_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_hv_init);
+module_exit(kvmppc_book3s_hv_exit);
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 0000000..19d152d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,326 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+#define DISABLE_INTERRUPTS	\
+	mfmsr   r0;		\
+	rldicl  r0,r0,48,1;	\
+	rotldi  r0,r0,16;	\
+	mtmsrd  r0,1;		\
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save host PMU registers and load guest PMU registers */
+	/* R4 is live here (vcpu pointer) but not r3 or r5 */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	isync
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r5, LPPACA_PMCINUSE(r3)
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r7, PACA_HOST_MMCR(r13)
+	std	r5, PACA_HOST_MMCR + 8(r13)
+	std	r6, PACA_HOST_MMCR + 16(r13)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, PACA_HOST_PMC(r13)
+	stw	r5, PACA_HOST_PMC + 4(r13)
+	stw	r6, PACA_HOST_PMC + 8(r13)
+	stw	r7, PACA_HOST_PMC + 12(r13)
+	stw	r8, PACA_HOST_PMC + 16(r13)
+	stw	r9, PACA_HOST_PMC + 20(r13)
+31:
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, PACA_HOST_DSCR(r13)
+
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, PACA_DABR(r13)
+
+	DISABLE_INTERRUPTS
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 */
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	mtspr	SPRN_HDEC,r8
+	extsw	r8,r8
+	add	r8,r8,r7
+	std	r8,PACA_KVM_DECEXP(r13)
+
+	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
+	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+
+	/* Jump to segment patching handler and into our guest */
+	b	kvmppc_rmcall
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 * SVCPU.*  = guest *
+	 *
+	 */
+
+	/* R7 = vcpu */
+	ld	r7, PACA_KVM_VCPU(r13)
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, PACA_KVM_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r4, LPPACA_PMCINUSE(r3)
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+	lwz	r3, PACA_HOST_PMC(r13)
+	lwz	r4, PACA_HOST_PMC + 4(r13)
+	lwz	r5, PACA_HOST_PMC + 8(r13)
+	lwz	r6, PACA_HOST_PMC + 12(r13)
+	lwz	r8, PACA_HOST_PMC + 16(r13)
+	lwz	r9, PACA_HOST_PMC + 20(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, PACA_HOST_MMCR(r13)
+	ld	r4, PACA_HOST_MMCR + 8(r13)
+	ld	r5, PACA_HOST_MMCR + 16(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_MMCR0, r3
+	isync
+23:
+
+	/* Restore host msr -> SRR1 */
+	ld	r4, VCPU_HOST_MSR(r7)
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Call Linux for hardware interrupts/decrementer
+	 * r3 = address of interrupt handler (exit reason)
+	 */
+	/* Note: preemption is disabled at this point */
+
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	1f
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	1f
+
+	/* Back to EE=1 */
+	mtmsr	r4
+	sync
+	b	kvm_return_point
+
+1:	bl	call_linux_handler
+
+.global kvm_return_point
+kvm_return_point:
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
+
+call_linux_handler:
+	/* Restore host IP -> SRR0 */
+	mflr	r3
+	mtlr	r12
+
+	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r7)
+	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+	b	kvmppc_rmcall
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+*/
+_GLOBAL(kvmppc_save_fp)
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+#ifdef CONFIG_VSX
+	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
+#else
+	oris	r8,r8,MSR_VEC@h
+#endif
+#endif
+	mtmsrd	r8
+	isync
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VSRS
+	stxvd2x	reg,r6,r3
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	stfd	reg,reg*8+VCPU_FPRS(r3)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+	mffs	fr0
+	stfd	fr0,VCPU_FPSCR(r3)
+
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r6,reg*16+VCPU_VRS
+	stvx	reg,r6,r3
+	reg = reg + 1
+	.endr
+	mfvscr	vr0
+	li	r6,VCPU_VSCR
+	stvx	vr0,r6,r3
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r3)
+	mtmsrd	r9
+	isync
+	blr
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ */
+	.globl	kvmppc_load_fp
+kvmppc_load_fp:
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+#ifdef CONFIG_VSX
+	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
+#else
+	oris	r8,r8,MSR_VEC@h
+#endif
+#endif
+	mtmsrd	r8
+	isync
+	lfd	fr0,VCPU_FPSCR(r4)
+	MTFSF_L(fr0)
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VSRS
+	lxvd2x	reg,r7,r4
+	reg = reg + 1
+	.endr
+FTR_SECTION_ELSE
+#endif
+	reg = 0
+	.rept	32
+	lfd	reg,reg*8+VCPU_FPRS(r4)
+	reg = reg + 1
+	.endr
+#ifdef CONFIG_VSX
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	li	r7,VCPU_VSCR
+	lvx	vr0,r7,r4
+	mtvscr	vr0
+BEGIN_FTR_SECTION
+	reg = 0
+	.rept	32
+	li	r7,reg*16+VCPU_VRS
+	lvx	reg,r7,r4
+	reg = reg + 1
+	.endr
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r4)
+	mtspr	SPRN_VRSAVE,r7
+	blr
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 0000000..813b01c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,663 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in the linear mapping           *
+ *                                                                           *
+ ****************************************************************************/
+
+#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
+
+	.globl	kvmppc_skip_interrupt
+kvmppc_skip_interrupt:
+	mfspr	r13,SPRN_SRR0
+	addi	r13,r13,4
+	mtspr	SPRN_SRR0,r13
+	GET_SCRATCH0(r13)
+	rfid
+	b	.
+
+	.globl	kvmppc_skip_Hinterrupt
+kvmppc_skip_Hinterrupt:
+	mfspr	r13,SPRN_HSRR0
+	addi	r13,r13,4
+	mtspr	SPRN_HSRR0,r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
+
+/*
+ * This trampoline brings us back to a real mode handler
+ *
+ * Input Registers:
+ *
+ * R5 = SRR0
+ * R6 = SRR1
+ * R12 = real-mode IP
+ * LR = real-mode IP
+ *
+ */
+.global kvmppc_handler_lowmem_trampoline
+kvmppc_handler_lowmem_trampoline:
+	cmpwi	r12,0x500
+	beq	1f
+	cmpwi	r12,0x980
+	beq	1f
+	mtsrr0	r3
+	mtsrr1	r4
+	blr
+1:	mtspr	SPRN_HSRR0,r3
+	mtspr	SPRN_HSRR1,r4
+	blr
+
+/*
+ * Call a function in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * R5 = function
+ * R6 = MSR
+ * R7 = scratch register
+ *
+ */
+_GLOBAL(kvmppc_rmcall)
+	mfmsr	r7
+	li	r0,MSR_RI		/* clear RI in MSR */
+	andc	r7,r7,r0
+	mtmsrd	r7,1
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI
+
+.global kvmppc_trampoline_lowmem
+kvmppc_trampoline_lowmem:
+	PPC_LONG kvmppc_handler_lowmem_trampoline - _stext
+
+.global kvmppc_trampoline_enter
+kvmppc_trampoline_enter:
+	PPC_LONG kvmppc_handler_trampoline_enter - _stext
+
+#define ULONG_SIZE 		8
+#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * all other volatile GPRS = free
+	 */
+	ld	r14, VCPU_GPR(r14)(r4)
+	ld	r15, VCPU_GPR(r15)(r4)
+	ld	r16, VCPU_GPR(r16)(r4)
+	ld	r17, VCPU_GPR(r17)(r4)
+	ld	r18, VCPU_GPR(r18)(r4)
+	ld	r19, VCPU_GPR(r19)(r4)
+	ld	r20, VCPU_GPR(r20)(r4)
+	ld	r21, VCPU_GPR(r21)(r4)
+	ld	r22, VCPU_GPR(r22)(r4)
+	ld	r23, VCPU_GPR(r23)(r4)
+	ld	r24, VCPU_GPR(r24)(r4)
+	ld	r25, VCPU_GPR(r25)(r4)
+	ld	r26, VCPU_GPR(r26)(r4)
+	ld	r27, VCPU_GPR(r27)(r4)
+	ld	r28, VCPU_GPR(r28)(r4)
+	ld	r29, VCPU_GPR(r29)(r4)
+	ld	r30, VCPU_GPR(r30)(r4)
+	ld	r31, VCPU_GPR(r31)(r4)
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Save R1 in the PACA */
+	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Set partition DABR */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+
+	/* Clear out SLB */
+	li	r6,0
+	slbmte	r6,r6
+	slbia
+	ptesync
+
+	/* Switch to guest partition. */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	ld	r6,KVM_SDR1(r9)
+	lwz	r7,KVM_LPID(r9)
+	li	r0,0x3ff		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	ld	r8,VCPU_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,10
+	li	r12,0x980
+	mr	r9,r4
+	blt	hdec_soon
+
+	/*
+	 * Invalidate the TLB if we could possibly have stale TLB
+	 * entries for this partition on this core due to the use
+	 * of tlbiel.
+	 */
+	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+	lwz	r5,VCPU_VCPUID(r4)
+	lhz	r6,PACAPACAINDEX(r13)
+	lhz	r8,VCPU_LAST_CPU(r4)
+	sldi	r7,r6,1			/* see if this is the same vcpu */
+	add	r7,r7,r9		/* as last ran on this pcpu */
+	lhz	r0,KVM_LAST_VCPU(r7)
+	cmpw	r6,r8			/* on the same cpu core as last time? */
+	bne	3f
+	cmpw	r0,r5			/* same vcpu as this core last ran? */
+	beq	1f
+3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
+	sth	r5,KVM_LAST_VCPU(r7)
+	li	r6,128
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,PACA_HOST_PURR(r13)
+	std	r6,PACA_HOST_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+	/* Load up guest SLB entries */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
+	/* Restore state of CTRL run bit; assume 1 on entry */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	mfspr	r6,SPRN_CTRLF
+	clrrdi	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	ld	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_XER(r4)
+
+	mtctr	r6
+	mtxer	r7
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+	ld	r10, VCPU_PC(r4)
+
+	ld	r11, VCPU_MSR(r4)	/* r11 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+fast_guest_return:
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST
+	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+
+	/* Enter guest */
+
+	ld	r5, VCPU_LR(r4)
+	lwz	r6, VCPU_CR(r4)
+	mtlr	r5
+	mtcr	r6
+
+	ld	r0, VCPU_GPR(r0)(r4)
+	ld	r1, VCPU_GPR(r1)(r4)
+	ld	r2, VCPU_GPR(r2)(r4)
+	ld	r3, VCPU_GPR(r3)(r4)
+	ld	r5, VCPU_GPR(r5)(r4)
+	ld	r6, VCPU_GPR(r6)(r4)
+	ld	r7, VCPU_GPR(r7)(r4)
+	ld	r8, VCPU_GPR(r8)(r4)
+	ld	r9, VCPU_GPR(r9)(r4)
+	ld	r10, VCPU_GPR(r10)(r4)
+	ld	r11, VCPU_GPR(r11)(r4)
+	ld	r12, VCPU_GPR(r12)(r4)
+	ld	r13, VCPU_GPR(r13)(r4)
+
+	ld	r4, VCPU_GPR(r4)(r4)
+
+	hrfid
+	b	.
+kvmppc_handler_trampoline_enter_end:
+
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt
+kvmppc_interrupt:
+	/*
+	 * Register contents:
+	 * R12		= interrupt vector
+	 * R13		= PACA
+	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
+	std	r9, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+	ld	r9, PACA_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(r0)(r9)
+	std	r1, VCPU_GPR(r1)(r9)
+	std	r2, VCPU_GPR(r2)(r9)
+	std	r3, VCPU_GPR(r3)(r9)
+	std	r4, VCPU_GPR(r4)(r9)
+	std	r5, VCPU_GPR(r5)(r9)
+	std	r6, VCPU_GPR(r6)(r9)
+	std	r7, VCPU_GPR(r7)(r9)
+	std	r8, VCPU_GPR(r8)(r9)
+	ld	r0, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
+	std	r0, VCPU_GPR(r9)(r9)
+	std	r10, VCPU_GPR(r10)(r9)
+	std	r11, VCPU_GPR(r11)(r9)
+	ld	r3, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
+	lwz	r4, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
+	std	r3, VCPU_GPR(r12)(r9)
+	stw	r4, VCPU_CR(r9)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
+	ld	r2, PACATOC(r13)
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(r13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
+
+	stw	r12,VCPU_TRAP(r9)
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,0x980
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,0
+	bge	ignore_hdec
+2:
+
+	/* Check for mediated interrupts (could be done earlier really ...) */
+	cmpwi	r12,0x500
+	bne+	1f
+	ld	r5,VCPU_LPCR(r9)
+	andi.	r0,r11,MSR_EE
+	beq	1f
+	andi.	r0,r5,LPCR_MER
+	bne	bounce_ext_interrupt
+1:
+
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Save HEIR (in last_inst) if this is a HEI (e40) */
+	li	r3,-1
+	cmpwi	r12,0xe40
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	stw	r3,VCPU_LAST_INST(r9)
+
+	/* Save more register state  */
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+
+	stw	r5, VCPU_XER(r9)
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	std	r8, VCPU_CTR(r9)
+	cmpwi	r12,0xe00		/* grab HDAR & HDSISR if HDSI */
+	beq	6f
+7:	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Read the guest SLB and save it away */
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+	li	r5,0
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	cmpwi	r6,32
+	blt	1b
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,PACA_HOST_PURR(r13)
+	ld	r4,PACA_HOST_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+	/* Clear out SLB */
+	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+
+hdec_soon:
+	/* Switch back to host partition */
+	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r6,KVM_HOST_SDR1(r4)
+	lwz	r7,KVM_HOST_LPID(r4)
+	li	r8,0x3ff		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+	lis	r8,0x7fff
+	mtspr	SPRN_HDEC,r8
+
+	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r8)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+
+	/* Restore host DABR and DABRX */
+	ld	r5,PACA_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, PACA_HOST_DSCR(r13)
+	std	r8, VCPU_DSCR(r9)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(r14)(r9)
+	std	r15, VCPU_GPR(r15)(r9)
+	std	r16, VCPU_GPR(r16)(r9)
+	std	r17, VCPU_GPR(r17)(r9)
+	std	r18, VCPU_GPR(r18)(r9)
+	std	r19, VCPU_GPR(r19)(r9)
+	std	r20, VCPU_GPR(r20)(r9)
+	std	r21, VCPU_GPR(r21)(r9)
+	std	r22, VCPU_GPR(r22)(r9)
+	std	r23, VCPU_GPR(r23)(r9)
+	std	r24, VCPU_GPR(r24)(r9)
+	std	r25, VCPU_GPR(r25)(r9)
+	std	r26, VCPU_GPR(r26)(r9)
+	std	r27, VCPU_GPR(r27)(r9)
+	std	r28, VCPU_GPR(r28)(r9)
+	std	r29, VCPU_GPR(r29)(r9)
+	std	r30, VCPU_GPR(r30)(r9)
+	std	r31, VCPU_GPR(r31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* Save PMU registers */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r6, SPRN_MMCRA
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+22:
+	/* save FP state */
+	mr	r3, r9
+	bl	.kvmppc_save_fp
+
+	/* RFI into the highmem handler */
+	mfmsr	r7
+	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
+	mtsrr1	r7
+	/* Load highmem handler address */
+	ld	r8, VCPU_HIGHMEM_HANDLER(r3)
+	mtsrr0	r8
+
+	RFI
+kvmppc_handler_trampoline_exit_end:
+
+6:	mfspr	r6,SPRN_HDAR
+	mfspr	r7,SPRN_HDSISR
+	b	7b
+
+ignore_hdec:
+	mr	r4,r9
+	b	fast_guest_return
+
+bounce_ext_interrupt:
+	mr	r4,r9
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10,0x500
+	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
+	b	fast_guest_return
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4b54148..e5e3f92 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,8 +38,12 @@
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !(v->arch.shared->msr & MSR_WE) ||
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+	return !(kvmppc_get_msr(v) & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
+#else
+	return 1;
+#endif
 }
 
 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
@@ -52,7 +56,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
 	unsigned long r2 = 0;
 
-	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
 		/* 32 bit mode */
 		param1 &= 0xffffffff;
 		param2 &= 0xffffffff;
@@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_IRQ_LEVEL:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_PPC_OSI:
+#ifndef CONFIG_KVM_BOOK3S_64_HV
 	case KVM_CAP_PPC_GET_PVINFO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -286,6 +292,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+	vcpu->arch.dec_expires = ~(u64)0;
 
 	return 0;
 }
@@ -474,6 +481,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		for (i = 0; i < 32; i++)
 			kvmppc_set_gpr(vcpu, i, gprs[i]);
 		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 6; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
 	}
 
 	kvmppc_core_deliver_interrupts(vcpu);
@@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	} else if (vcpu->cpu != -1) {
+		smp_send_reschedule(vcpu->cpu);
+#endif
 	}
-
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index d62a14b..e5c99b8 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
  *                         Book3S trace points                           *
  *************************************************************************/
 
-#ifdef CONFIG_PPC_BOOK3S
+#ifdef CONFIG_KVM_BOOK3S_NONHV
 
 TRACE_EVENT(kvm_book3s_exit,
 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..a4447ce 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,11 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (9 preceding siblings ...)
  2011-05-11 10:44 ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
@ 2011-05-11 10:45 ` Paul Mackerras
  2011-05-17  7:54     ` Alexander Graf
  2011-05-11 10:46 ` [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode Paul Mackerras
                   ` (2 subsequent siblings)
  13 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:45 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

This adds the infrastructure for handling PAPR hcalls in the kernel,
either early in the guest exit path while we are still in real mode,
or later once the MMU has been turned back on and we are in the full
kernel context.  The advantage of handling hcalls in real mode if
possible is that we avoid two partition switches -- and this will
become more important when we support SMT4 guests, since a partition
switch means we have to pull all of the threads in the core out of
the guest.  The disadvantage is that we can only access the kernel
linear mapping, not anything vmalloced or ioremapped, since the MMU
is off.
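
In outline, the flow is as in the sketch below.  This is illustrative
only, not code from the patch: try_hcall_real_mode() is a made-up
stand-in for the real-mode handlers added to book3s_hv_rmhandlers.S,
while kvmppc_pseries_do_hcall() is the new kernel-context handler.

	/* Illustrative sketch only; try_hcall_real_mode() is hypothetical. */
	static int handle_guest_hcall(struct kvm_vcpu *vcpu)
	{
		long ret;

		/* First chance: still in real mode, straight after the guest
		 * exit, so only the kernel linear mapping is accessible. */
		ret = try_hcall_real_mode(vcpu);
		if (ret != H_TOO_HARD) {
			kvmppc_set_gpr(vcpu, 3, ret);	/* status back in guest r3 */
			return RESUME_GUEST;
		}

		/* Too hard for real mode: redo the call with the MMU on, in
		 * full kernel context; this path may still punt the hcall out
		 * to userspace as a KVM_EXIT_PAPR_HCALL exit. */
		return kvmppc_pseries_do_hcall(vcpu);
	}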

This also adds code to handle the following hcalls in real mode:

H_ENTER       Add an HPTE to the hashed page table
H_REMOVE      Remove an HPTE from the hashed page table
H_READ        Read HPTEs from the hashed page table
H_PROTECT     Change the protection bits in an HPTE
H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table
H_SET_DABR    Set the data address breakpoint register

Plus code to handle the following hcalls in the kernel:

H_CEDE        Idle the vcpu until an interrupt or H_PROD hcall arrives
H_PROD        Wake up a ceded vcpu
H_REGISTER_VPA Register a virtual processor area (VPA)
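
Hcalls that neither the real-mode path nor kvmppc_pseries_do_hcall()
recognises still come out to userspace as a KVM_EXIT_PAPR_HCALL exit
(added in the previous patch).  Below is a minimal sketch, not part of
this patch, of how a VMM might service such an exit; it assumes the
usual KVM_RUN loop with vcpu_fd and the mmap'd run area already set up,
uses the papr_hcall layout from include/linux/kvm.h, and the constant
-2 stands for H_FUNCTION from asm/hvcall.h.

	/* Minimal userspace sketch; setup of vcpu_fd and run is assumed. */
	static void run_vcpu_once(int vcpu_fd, struct kvm_run *run)
	{
		ioctl(vcpu_fd, KVM_RUN, 0);
		if (run->exit_reason != KVM_EXIT_PAPR_HCALL)
			return;

		switch (run->papr_hcall.nr) {
		default:
			/* Reject hcalls this VMM does not emulate */
			run->papr_hcall.ret = -2;	/* H_FUNCTION */
			break;
		}
		/* On the next KVM_RUN the kernel copies ret into guest r3 and
		 * args[] back into r4 onwards before re-entering the guest. */
	}
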

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hvcall.h       |    5 +
 arch/powerpc/include/asm/kvm_host.h     |   11 +
 arch/powerpc/include/asm/kvm_ppc.h      |    1 +
 arch/powerpc/kernel/asm-offsets.c       |    2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c     |  342 +++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c            |  170 +++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  150 +++++++++++++-
 arch/powerpc/kvm/powerpc.c              |    2 +-
 8 files changed, 679 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 8edec71..a226002 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -29,6 +29,10 @@
 #define H_LONG_BUSY_ORDER_100_SEC	9905  /* Long busy, hint that 100sec \
 						 is a good time to retry */
 #define H_LONG_BUSY_END_RANGE		9905  /* End of long busy range */
+
+/* Hacked in for HV-aware kvm support */
+#define H_TOO_HARD	9999
+
 #define H_HARDWARE	-1	/* Hardware error */
 #define H_FUNCTION	-2	/* Function not supported */
 #define H_PRIVILEGE	-3	/* Caller not privileged */
@@ -100,6 +104,7 @@
 #define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
 #define H_AVPN			(1UL<<(63-32))	/* An avpn is provided as a sanity test */
 #define H_ANDCOND		(1UL<<(63-33))
+#define H_LOCAL			(1UL<<(63-35))
 #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored for IO pages) */
 #define H_ICACHE_SYNCHRONIZE	(1UL<<(63-41))	/* dcbst, icbi, etc (ignored for IO pages */
 #define H_ZERO_PAGE		(1UL<<(63-48))	/* zero the page before mapping (ignored for IO pages) */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ec62365..af6703e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -59,6 +59,10 @@ struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
 
+struct lppaca;
+struct slb_shadow;
+struct dtl;
+
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
 };
@@ -341,7 +345,14 @@ struct kvm_vcpu_arch {
 	u64 dec_expires;
 	unsigned long pending_exceptions;
 	u16 last_cpu;
+	u8 ceded;
+	u8 prodded;
 	u32 last_inst;
+
+	struct lppaca *vpa;
+	struct slb_shadow *slb_shadow;
+	struct dtl *dtl;
+	struct dtl *dtl_end;
 	int trap;
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index cd9ad96..b4ee11a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -116,6 +116,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
+extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 49e97fd..fd56f14 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -189,6 +189,7 @@ int main(void)
 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
+	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
 #endif /* CONFIG_PPC_STD_MMU_64 */
 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
@@ -467,6 +468,7 @@ int main(void)
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
 	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
+	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 52d1be1..623caae 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -219,6 +219,348 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	}
 }
 
+#define HPTE_V_HVLOCK	0x40UL
+
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile("	ldarx	%0,0,%2\n"
+		     "	and.	%1,%0,%3\n"
+		     "	bne	2f\n"
+		     "	ori	%0,%0,%4\n"
+		     "  stdcx.	%0,0,%2\n"
+		     "	beq+	2f\n"
+		     "	li	%1,%3\n"
+		     "2:	isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	unsigned long porder;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long i, lpn, pa;
+	unsigned long *hpte;
+
+	/* only handle 4k, 64k and 16M pages for now */
+	porder = 12;
+	if (pteh & HPTE_V_LARGE) {
+		if ((ptel & 0xf000) == 0x1000) {
+			/* 64k page */
+			porder = 16;
+		} else if ((ptel & 0xff000) == 0) {
+			/* 16M page */
+			porder = 24;
+			/* lowest AVA bit must be 0 for 16M pages */
+			if (pteh & 0x80)
+				return H_PARAMETER;
+		} else
+			return H_PARAMETER;
+	}
+	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+		return H_PARAMETER;
+	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+	if (!pa)
+		return H_PARAMETER;
+	/* Check WIMG */
+	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+		return H_PARAMETER;
+	pteh &= ~0x60UL;
+	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+	ptel |= pa;
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				return H_PTEG_FULL;
+			if ((*hpte & HPTE_V_VALID) == 0 &&
+			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+				break;
+			hpte += 2;
+		}
+	} else {
+		i = 0;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			return H_PTEG_FULL;
+	}
+	hpte[1] = ptel;
+	eieio();
+	hpte[0] = pteh;
+	asm volatile("ptesync" : : : "memory");
+	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	vcpu->arch.gpr[4] = pte_index + i;
+	return H_SUCCESS;
+}
+
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+				      unsigned long pte_index)
+{
+	unsigned long rb, va_low;
+
+	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	va_low = pte_index >> 3;
+	if (v & HPTE_V_SECONDARY)
+		va_low = ~va_low;
+	/* xor vsid from AVA */
+	if (!(v & HPTE_V_1TB_SEG))
+		va_low ^= v >> 12;
+	else
+		va_low ^= v >> 24;
+	va_low &= 0x7ff;
+	if (v & HPTE_V_LARGE) {
+		rb |= 1;			/* L field */
+		if (r & 0xff000) {
+			/* non-16MB large page, must be 64k */
+			/* (masks depend on page size) */
+			rb |= 0x1000;		/* page encoding in LP field */
+			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+		}
+	} else {
+		/* 4kB page */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+	}
+	rb |= (v >> 54) & 0x300;		/* B field */
+	return rb;
+}
+
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+	unsigned int tmp, old;
+	unsigned int token = LOCK_TOKEN;
+
+	asm volatile("1:lwarx	%1,0,%2\n"
+		     "	cmpwi	cr0,%1,0\n"
+		     "	bne	2f\n"
+		     "  stwcx.	%3,0,%2\n"
+		     "	bne-	1b\n"
+		     "  isync\n"
+		     "2:"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (lock), "r" (token)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn,
+		     unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+	vcpu->arch.gpr[5] = r = hpte[1];
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = 0;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return H_SUCCESS;
+}
+
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.gpr[4];
+	unsigned long *hp, tlbrb[4];
+	long int i, found;
+	long int n_inval = 0;
+	unsigned long flags, req, pte_index;
+	long int local = 0;
+	long int ret = H_SUCCESS;
+
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		local = 1;
+	for (i = 0; i < 4; ++i) {
+		pte_index = args[i * 2];
+		flags = pte_index >> 56;
+		pte_index &= ((1ul << 56) - 1);
+		req = flags >> 6;
+		flags &= 3;
+		if (req == 3)
+			break;
+		if (req != 1 || flags == 3 ||
+		    pte_index >= (HPT_NPTEG << 3)) {
+			/* parameter error */
+			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+			ret = H_PARAMETER;
+			break;
+		}
+		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+			cpu_relax();
+		found = 0;
+		if (hp[0] & HPTE_V_VALID) {
+			switch (flags & 3) {
+			case 0:		/* absolute */
+				found = 1;
+				break;
+			case 1:		/* andcond */
+				if (!(hp[0] & args[i * 2 + 1]))
+					found = 1;
+				break;
+			case 2:		/* AVPN */
+				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+					found = 1;
+				break;
+			}
+		}
+		if (!found) {
+			hp[0] &= ~HPTE_V_HVLOCK;
+			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+			continue;
+		}
+		/* insert R and C bits from PTE */
+		flags |= (hp[1] >> 5) & 0x0c;
+		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		hp[0] = 0;
+	}
+	if (n_inval == 0)
+		return ret;
+
+	if (!local) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile(PPC_TLBIE(%1,%0)
+				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return ret;
+}
+
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn,
+		      unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	v = hpte[0];
+	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = v & ~HPTE_V_VALID;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	hpte[1] = r;
+	eieio();
+	hpte[0] = v & ~HPTE_V_HVLOCK;
+	asm volatile("ptesync" : : : "memory");
+	return H_SUCCESS;
+}
+
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+	long int i;
+	unsigned long offset, rpn;
+
+	offset = realaddr & (kvm->arch.ram_psize - 1);
+	rpn = (realaddr - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << PAGE_SHIFT) + offset;
+	return HPTE_R_RPN;	/* all 1s in the RPN field */
+}
+
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte, r;
+	int i, n = 1;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		r = hpte[1];
+		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
+			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
+				(r & ~HPTE_R_RPN);
+		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		vcpu->arch.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
+
 int kvmppc_mmu_hv_init(void)
 {
 	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f6b7cd1..377a35a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -126,6 +126,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 	       vcpu->arch.last_inst);
 }
 
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+	int r;
+	struct kvm_vcpu *v, *ret = NULL;
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(r, v, kvm) {
+		if (v->vcpu_id == id) {
+			ret = v;
+			break;
+		}
+	}
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+	vpa->shared_proc = 1;
+	vpa->yield_count = 1;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+				       unsigned long flags,
+				       unsigned long vcpuid, unsigned long vpa)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long pg_index, ra, len;
+	unsigned long pg_offset;
+	void *va;
+	struct kvm_vcpu *tvcpu;
+
+	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+	if (!tvcpu)
+		return H_PARAMETER;
+
+	flags >>= 63 - 18;
+	flags &= 7;
+	if (flags == 0 || flags == 4)
+		return H_PARAMETER;
+	if (flags < 4) {
+		if (vpa & 0x7f)
+			return H_PARAMETER;
+		/* registering new area; convert logical addr to real */
+		pg_index = vpa >> kvm->arch.ram_porder;
+		pg_offset = vpa & (kvm->arch.ram_psize - 1);
+		if (pg_index >= kvm->arch.ram_npages)
+			return H_PARAMETER;
+		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
+			return H_PARAMETER;
+		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
+		ra |= pg_offset;
+		va = __va(ra);
+		if (flags <= 1)
+			len = *(unsigned short *)(va + 4);
+		else
+			len = *(unsigned int *)(va + 4);
+		if (pg_offset + len > kvm->arch.ram_psize)
+			return H_PARAMETER;
+		switch (flags) {
+		case 1:		/* register VPA */
+			if (len < 640)
+				return H_PARAMETER;
+			tvcpu->arch.vpa = va;
+			init_vpa(vcpu, va);
+			break;
+		case 2:		/* register DTL */
+			if (len < 48)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			len -= len % 48;
+			tvcpu->arch.dtl = va;
+			tvcpu->arch.dtl_end = va + len;
+			break;
+		case 3:		/* register SLB shadow buffer */
+			if (len < 8)
+				return H_PARAMETER;
+			if (!tvcpu->arch.vpa)
+				return H_RESOURCE;
+			tvcpu->arch.slb_shadow = va;
+			len = (len - 16) / 16;
+			tvcpu->arch.slb_shadow = va;
+			break;
+		}
+	} else {
+		switch (flags) {
+		case 5:		/* unregister VPA */
+			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
+				return H_RESOURCE;
+			tvcpu->arch.vpa = NULL;
+			break;
+		case 6:		/* unregister DTL */
+			tvcpu->arch.dtl = NULL;
+			break;
+		case 7:		/* unregister SLB shadow buffer */
+			tvcpu->arch.slb_shadow = NULL;
+			break;
+		}
+	}
+	return H_SUCCESS;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long req = kvmppc_get_gpr(vcpu, 3);
+	unsigned long target, ret = H_SUCCESS;
+	struct kvm_vcpu *tvcpu;
+
+	switch (req) {
+	case H_CEDE:
+		vcpu->arch.msr |= MSR_EE;
+		vcpu->arch.ceded = 1;
+		smp_mb();
+		if (!vcpu->arch.prodded)
+			kvmppc_vcpu_block(vcpu);
+		else
+			vcpu->arch.prodded = 0;
+		smp_mb();
+		vcpu->arch.ceded = 0;
+		break;
+	case H_PROD:
+		target = kvmppc_get_gpr(vcpu, 4);
+		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		tvcpu->arch.prodded = 1;
+		smp_mb();
+		if (vcpu->arch.ceded) {
+			if (waitqueue_active(&vcpu->wq)) {
+				wake_up_interruptible(&vcpu->wq);
+				vcpu->stat.halt_wakeup++;
+			}
+		}
+		break;
+	case H_CONFER:
+		break;
+	case H_REGISTER_VPA:
+		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		break;
+	default:
+		return RESUME_HOST;
+	}
+	kvmppc_set_gpr(vcpu, 3, ret);
+	vcpu->arch.hcall_needed = 0;
+	return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			      struct task_struct *tsk)
 {
@@ -307,7 +459,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 
-int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u64 now;
 
@@ -338,6 +490,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return kvmppc_handle_exit(run, vcpu, current);
 }
 
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	do {
+		r = kvmppc_run_vcpu(run, vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+		    !(vcpu->arch.msr & MSR_PR)) {
+			r = kvmppc_pseries_do_hcall(vcpu);
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	} while (r == RESUME_GUEST);
+	return r;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 813b01c..e8a8f3c 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -195,6 +195,14 @@ kvmppc_handler_trampoline_enter:
 	/* Save R1 in the PACA */
 	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
 
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	lwz	r5, LPPACA_YIELDCOUNT(r3)
+	addi	r5, r5, 1
+	stw	r5, LPPACA_YIELDCOUNT(r3)
+25:
 	/* Load up DAR and DSISR */
 	ld	r5, VCPU_DAR(r4)
 	lwz	r6, VCPU_DSISR(r4)
@@ -432,6 +440,10 @@ kvmppc_interrupt:
 	cmpwi	r3,0
 	bge	ignore_hdec
 2:
+	/* See if this is something we can handle in real mode */
+	cmpwi	r12,0xc00
+	beq	hcall_real_mode
+hcall_real_cont:
 
 	/* Check for mediated interrupts (could be done earlier really ...) */
 	cmpwi	r12,0x500
@@ -607,13 +619,28 @@ hdec_soon:
 	std	r5, VCPU_SPRG2(r9)
 	std	r6, VCPU_SPRG3(r9)
 
-	/* Save PMU registers */
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	lwz	r3, LPPACA_YIELDCOUNT(r8)
+	addi	r3, r3, 1
+	stw	r3, LPPACA_YIELDCOUNT(r8)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
 	isync
-	mfspr	r5, SPRN_MMCR1
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r7, LPPACA_PMCINUSE(r8)
+	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
 	mfspr	r6, SPRN_MMCRA
 	std	r4, VCPU_MMCR(r9)
 	std	r5, VCPU_MMCR + 8(r9)
@@ -650,6 +677,119 @@ kvmppc_handler_trampoline_exit_end:
 	mfspr	r7,SPRN_HDSISR
 	b	7b
 
+	.globl	hcall_real_mode
+hcall_real_mode:
+	ld	r3,VCPU_GPR(r3)(r9)
+	andi.	r0,r11,MSR_PR
+	bne	hcall_real_cont
+	clrrdi	r3,r3,2
+	cmpldi	r3,hcall_real_table_end - hcall_real_table
+	bge	hcall_real_cont
+	LOAD_REG_ADDR(r4, hcall_real_table)
+	lwzx	r3,r3,r4
+	cmpwi	r3,0
+	beq	hcall_real_cont
+	add	r3,r3,r4
+	mtctr	r3
+	mr	r3,r9		/* get vcpu pointer */
+	ld	r4,VCPU_GPR(r4)(r9)
+	bctrl
+	cmpdi	r3,H_TOO_HARD
+	beq	hcall_real_fallback
+	ld	r4,PACA_KVM_VCPU(r13)
+	std	r3,VCPU_GPR(r3)(r4)
+	ld	r10,VCPU_PC(r4)
+	ld	r11,VCPU_MSR(r4)
+	b	fast_guest_return
+
+	/* We've attempted a real mode hcall, but it's punted it back
+	 * to userspace.  We need to restore some clobbered volatiles
+	 * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+	li	r12,0xc00
+	ld	r9, PACA_KVM_VCPU(r13)
+	ld	r11, VCPU_MSR(r9)
+	
+	b	hcall_real_cont
+	
+	.globl	hcall_real_table
+hcall_real_table:
+	.long	0		/* 0 - unused */
+	.long	.kvmppc_h_remove - hcall_real_table
+	.long	.kvmppc_h_enter - hcall_real_table
+	.long	.kvmppc_h_read - hcall_real_table
+	.long	0		/* 0x10 - H_CLEAR_MOD */
+	.long	0		/* 0x14 - H_CLEAR_REF */
+	.long	.kvmppc_h_protect - hcall_real_table
+	.long	0		/* 0x1c - H_GET_TCE */
+	.long	0		/* 0x20 - H_SET_TCE */
+	.long	0		/* 0x24 - H_SET_SPRG0 */
+	.long	.kvmppc_h_set_dabr - hcall_real_table
+	.long	0		/* 0x2c */
+	.long	0		/* 0x30 */
+	.long	0		/* 0x34 */
+	.long	0		/* 0x38 */
+	.long	0		/* 0x3c */
+	.long	0		/* 0x40 */
+	.long	0		/* 0x44 */
+	.long	0		/* 0x48 */
+	.long	0		/* 0x4c */
+	.long	0		/* 0x50 */
+	.long	0		/* 0x54 */
+	.long	0		/* 0x58 */
+	.long	0		/* 0x5c */
+	.long	0		/* 0x60 */
+	.long	0		/* 0x64 */
+	.long	0		/* 0x68 */
+	.long	0		/* 0x6c */
+	.long	0		/* 0x70 */
+	.long	0		/* 0x74 */
+	.long	0		/* 0x78 */
+	.long	0		/* 0x7c */
+	.long	0		/* 0x80 */
+	.long	0		/* 0x84 */
+	.long	0		/* 0x88 */
+	.long	0		/* 0x8c */
+	.long	0		/* 0x90 */
+	.long	0		/* 0x94 */
+	.long	0		/* 0x98 */
+	.long	0		/* 0x9c */
+	.long	0		/* 0xa0 */
+	.long	0		/* 0xa4 */
+	.long	0		/* 0xa8 */
+	.long	0		/* 0xac */
+	.long	0		/* 0xb0 */
+	.long	0		/* 0xb4 */
+	.long	0		/* 0xb8 */
+	.long	0		/* 0xbc */
+	.long	0		/* 0xc0 */
+	.long	0		/* 0xc4 */
+	.long	0		/* 0xc8 */
+	.long	0		/* 0xcc */
+	.long	0		/* 0xd0 */
+	.long	0		/* 0xd4 */
+	.long	0		/* 0xd8 */
+	.long	0		/* 0xdc */
+	.long	0		/* 0xe0 */
+	.long	0		/* 0xe4 */
+	.long	0		/* 0xe8 */
+	.long	0		/* 0xec */
+	.long	0		/* 0xf0 */
+	.long	0		/* 0xf4 */
+	.long	0		/* 0xf8 */
+	.long	0		/* 0xfc */
+	.long	0		/* 0x100 */
+	.long	0		/* 0x104 */
+	.long	0		/* 0x108 */
+	.long	0		/* 0x10c */
+	.long	0		/* 0x110 */
+	.long	0		/* 0x114 */
+	.long	0		/* 0x118 */
+	.long	0		/* 0x11c */
+	.long	0		/* 0x120 */
+	.long	.kvmppc_h_bulk_remove - hcall_real_table
+hcall_real_table_end:
+
 ignore_hdec:
 	mr	r4,r9
 	b	fast_guest_return
@@ -661,3 +801,9 @@ bounce_ext_interrupt:
 	li	r10,0x500
 	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
 	b	fast_guest_return
+
+_GLOBAL(kvmppc_h_set_dabr)
+	std	r4,VCPU_DABR(r3)
+	mtspr	SPRN_DABR,r4
+	li	r3,0
+	blr
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e5e3f92..7bfe413 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -42,7 +42,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 	return !(kvmppc_get_msr(v) & MSR_WE) ||
 	       !!(v->arch.pending_exceptions);
 #else
-	return 1;
+	return !(v->arch.ceded) || !!(v->arch.pending_exceptions);
 #endif
 }
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (10 preceding siblings ...)
  2011-05-11 10:45 ` [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel Paul Mackerras
@ 2011-05-11 10:46 ` Paul Mackerras
  2011-05-17  8:01     ` Alexander Graf
  2011-05-11 10:46 ` [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes Paul Mackerras
  2011-05-17  9:46   ` Alexander Graf
  13 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:46 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

From: David Gibson <dwg@au1.ibm.com>

This improves I/O performance for guests using the PAPR paravirtualization
interface by making the H_PUT_TCE hcall faster: it is now handled in the
kernel in real mode.  H_PUT_TCE is used for updating virtual IOMMU tables,
and is used both for virtual I/O and for real I/O in the PAPR interface.

Since this moves the IOMMU tables into the kernel, we define a new
KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables.
The ioctl returns a file descriptor which can be used to mmap the
newly created table.
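
As a rough sketch of how qemu might drive this (illustration only, not code
from this series; vm_fd and the LIOBN value are made up):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	/* vm_fd is an already-open KVM VM file descriptor */
	struct kvm_create_spapr_tce args = {
		.liobn       = 0x71000000,	/* hypothetical LIOBN */
		.window_size = 16 << 20,	/* 16MB DMA window = 4096 TCEs */
	};
	int tce_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE, &args);
	/* one u64 TCE per 4k of window, rounded up to whole pages */
	uint64_t *tces = mmap(NULL, (args.window_size >> 12) * sizeof(uint64_t),
			      PROT_READ, MAP_SHARED, tce_fd, 0);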

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm.h           |    9 +++
 arch/powerpc/include/asm/kvm_book3s_64.h |    2 +
 arch/powerpc/include/asm/kvm_host.h      |    9 +++
 arch/powerpc/include/asm/kvm_ppc.h       |    2 +
 arch/powerpc/kvm/Makefile                |    3 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c      |   73 +++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c             |  116 +++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |    2 +-
 arch/powerpc/kvm/powerpc.c               |   18 +++++
 include/linux/kvm.h                      |    5 ++
 10 files changed, 236 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 18ea696..a9e641b 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -22,6 +22,9 @@
 
 #include <linux/types.h>
 
+/* Select powerpc specific features in <linux/kvm.h> */
+#define __KVM_HAVE_SPAPR_TCE
+
 struct kvm_regs {
 	__u64 pc;
 	__u64 cr;
@@ -88,4 +91,10 @@ struct kvm_guest_debug_arch {
 #define KVM_INTERRUPT_UNSET	-2U
 #define KVM_INTERRUPT_SET_LEVEL	-3U
 
+/* for KVM_CAP_SPAPR_TCE */
+struct kvm_create_spapr_tce {
+	__u64 liobn;
+	__u32 window_size;
+};
+
 #endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 4cadd61..e1a096b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -25,4 +25,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
 	return &get_paca()->shadow_vcpu;
 }
 
+#define SPAPR_TCE_SHIFT		12
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af6703e..cda183e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -144,6 +144,14 @@ struct kvmppc_pginfo {
 	atomic_t refcnt;
 };
 
+struct kvmppc_spapr_tce_table {
+	struct list_head list;
+	struct kvm *kvm;
+	u64 liobn;
+	u32 window_size;
+	struct page *pages[0];
+};
+
 struct kvm_arch {
 	unsigned long hpt_virt;
 	unsigned long ram_npages;
@@ -157,6 +165,7 @@ struct kvm_arch {
 	unsigned long host_sdr1;
 	int tlbie_lock;
 	unsigned short last_vcpu[NR_CPUS];
+	struct list_head spapr_tce_tables;
 };
 
 struct kvmppc_pte {
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b4ee11a..de683fa 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -117,6 +117,8 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				struct kvm_create_spapr_tce *args);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 37c1a60..8ba062f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -59,7 +59,8 @@ kvm-book3s_64_hv-objs := \
 	book3s.o \
 	book3s_hv.o \
 	book3s_hv_interrupts.o \
-	book3s_64_mmu_hv.o
+	book3s_64_mmu_hv.o \
+	book3s_64_vio_hv.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
 
 kvm-book3s_32-objs := \
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 0000000..ea0f8c5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,73 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+			/* 	    liobn, stt, stt->window_size); */
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 377a35a..eed2c10 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -506,6 +506,116 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDONLY);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
@@ -527,13 +637,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
+	if (r)
+		return r;
 
-	return r;
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_hpt(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
 /* These are stubs for now */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index e8a8f3c..95f6386 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -722,7 +722,7 @@ hcall_real_table:
 	.long	0		/* 0x14 - H_CLEAR_REF */
 	.long	.kvmppc_h_protect - hcall_real_table
 	.long	0		/* 0x1c - H_GET_TCE */
-	.long	0		/* 0x20 - H_SET_TCE */
+	.long	.kvmppc_h_put_tce - hcall_real_table
 	.long	0		/* 0x24 - H_SET_SPRG0 */
 	.long	.kvmppc_h_set_dabr - hcall_real_table
 	.long	0		/* 0x2c */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7bfe413..10f777a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -196,6 +196,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_SPAPR_TCE:
+		r = 1;
+		break;
+#endif
 	default:
 		r = 0;
 		break;
@@ -628,6 +633,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		break;
 	}
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a4447ce..3d3cdf1 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -547,6 +547,9 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#ifdef __KVM_HAVE_SPAPR_TCE
+#define KVM_CAP_SPAPR_TCE 60
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -746,6 +749,8 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_XCRS */
 #define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
 #define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
+/* Available with KVM_CAP_SPAPR_TCE */
+#define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
                   ` (11 preceding siblings ...)
  2011-05-11 10:46 ` [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode Paul Mackerras
@ 2011-05-11 10:46 ` Paul Mackerras
  2011-05-11 13:44   ` Christoph Hellwig
  2011-05-17  8:21     ` Alexander Graf
  2011-05-17  9:46   ` Alexander Graf
  13 siblings, 2 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 10:46 UTC (permalink / raw)
  To: linuxppc-dev, kvm; +Cc: Alexander Graf

This lifts the restriction that book3s_hv guests can only run one
hardware thread per core, and allows them to use up to 4 threads
per core on POWER7.  The host still has to run single-threaded.

This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
capability.
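
As an illustration (assuming kvm_fd is an open /dev/kvm file descriptor),
qemu can test for this with the usual capability check and fall back to
one thread per core if it is absent:

	int have_smt = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);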

To use this, the host kernel should be booted with all threads
active, and then all the secondary threads should be offlined.
This will put the secondary threads into nap mode.  KVM will then
wake them from nap mode and use them for running guest code (while
they are still offline).  To wake the secondary threads, we send
them an IPI using a new xics_wake_cpu() function, implemented in
arch/powerpc/sysdev/xics/icp-native.c.  In other words, at this stage
we assume that the platform has a XICS interrupt controller and
we are using icp-native.c to drive it.  Since the woken thread will
need to acknowledge and clear the IPI, we also export the base
physical address of the XICS registers using kvmppc_set_xics_phys()
for use in the low-level KVM book3s code.

When a vcpu is created, it is assigned to a virtual core (vcore).
The vcore number is obtained by dividing the vcpu number by 4,
since we assume at most 4 threads per core.  Thus, if qemu wishes
to run the guest in single-threaded mode, it should make all vcpu
numbers be multiples of 4.
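
For example (an illustration, using the 4-threads-per-core value assumed
in this patch): vcpu ids 0, 1, 2, 3 all land in vcore 0 and can run as
SMT4 siblings, whereas ids 0, 4, 8, ... each get a vcore to themselves:

	core = id / KVM_THREADS_PER_CORE;  /* ids 0-3 -> vcore 0, 4-7 -> vcore 1, ... */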

We distinguish three states of a vcpu: runnable (i.e., ready to execute
the guest), blocked (that is, idle), and busy in host.  We currently
implement a policy that the vcore can run only when all its threads
are runnable or blocked.  This way, if a vcpu needs to execute elsewhere
in the kernel or in qemu, it can do so without being starved of CPU
by the other vcpus.

When a vcore starts to run, it executes in the context of one of the
vcpu threads.  The other vcpu threads all go to sleep and stay asleep
until something happens requiring the vcpu thread to return to qemu,
or to wake up to run the vcore (this can happen when another vcpu
thread goes from busy in host state to blocked).

It can happen that a vcpu goes from blocked to runnable state (e.g.
because of an interrupt), and the vcore it belongs to is already
running.  In that case it can start to run immediately as long as
none of the vcpus in the vcore have started to exit the guest.
We send the next free thread in the vcore an IPI to get it to start
to execute the guest.  It synchronizes with the other threads via
the vcore->entry_exit_count field to make sure that it doesn't go
into the guest if the other vcpus are exiting by the time that it
is ready to actually enter the guest.
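
In C terms the entry side of that protocol looks roughly like the following
(only a sketch; the real code is the lwarx/stwcx. loop on
vcore->entry_exit_count in book3s_hv_rmhandlers.S, which branches to
secondary_too_late rather than returning an error):

	/* atomically bump the entry count (bits 0-7), but only if no
	 * thread has started to exit (exit count in bits 8-15 is 0) */
	do {
		old = vc->entry_exit_count;
		if (old >= 0x100)
			return -EBUSY;	/* too late, a thread is exiting */
	} while (cmpxchg(&vc->entry_exit_count, old, old + 1) != old);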

Note that there is no fixed relationship between the hardware thread
number and the vcpu number.  Hardware threads are assigned to vcpus
as they become runnable, so when not all the vcpus in the vcore are
runnable we always fill the lower-numbered hardware threads first,
regardless of which particular vcpus are runnable.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm.h          |    1 +
 arch/powerpc/include/asm/kvm_host.h     |   45 +++++-
 arch/powerpc/include/asm/kvm_ppc.h      |   13 ++
 arch/powerpc/include/asm/paca.h         |    2 +
 arch/powerpc/kernel/asm-offsets.c       |    6 +
 arch/powerpc/kernel/exceptions-64s.S    |   31 +++-
 arch/powerpc/kernel/idle_power7.S       |    2 -
 arch/powerpc/kvm/Kconfig                |    2 +-
 arch/powerpc/kvm/book3s_hv.c            |  266 +++++++++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  157 ++++++++++++++++++-
 arch/powerpc/sysdev/xics/icp-native.c   |    7 +
 include/linux/kvm.h                     |    3 +
 12 files changed, 506 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index a9e641b..624c872 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -24,6 +24,7 @@
 
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
+#define __KVM_HAVE_PPC_SMT
 
 struct kvm_regs {
 	__u64 pc;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index cda183e..a2085da 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,10 +25,15 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
+#include <linux/threads.h>
+#include <linux/spinlock.h>
 #include <linux/kvm_para.h>
 #include <asm/kvm_asm.h>
+#include <asm/processor.h>
 
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS		NR_CPUS
+#define KVM_THREADS_PER_CORE	4
+#define KVM_MAX_VCORES		(KVM_MAX_VCPUS / KVM_THREADS_PER_CORE)
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -165,9 +170,36 @@ struct kvm_arch {
 	unsigned long host_sdr1;
 	int tlbie_lock;
 	unsigned short last_vcpu[NR_CPUS];
+	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
 	struct list_head spapr_tce_tables;
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_count combines an entry count in the bottom 8 bits
+ * and an exit count in the next 8 bits.  This is so that we can
+ * atomically increment the entry count iff the exit count is 0
+ * without taking the lock.
+ */
+struct kvmppc_vcore {
+	int n_runnable;
+	int n_blocked;
+	int num_threads;
+	int entry_exit_count;
+	int n_woken;
+	int nap_count;
+	u16 pcpu;
+	u8 vcore_running;
+	u8 in_guest;
+	struct kvm_vcpu *runnable_threads[KVM_THREADS_PER_CORE];
+	struct task_struct *run_task[KVM_THREADS_PER_CORE];
+	struct kvm_run *kvm_run[KVM_THREADS_PER_CORE];
+	spinlock_t lock;
+};
+
+#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
+#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
+
 struct kvmppc_pte {
 	ulong eaddr;
 	u64 vpage;
@@ -362,10 +394,21 @@ struct kvm_vcpu_arch {
 	struct slb_shadow *slb_shadow;
 	struct dtl *dtl;
 	struct dtl *dtl_end;
+
+	struct kvmppc_vcore *vcore;
+	int ret;
 	int trap;
+	int state;
+	int ptid;
+	wait_queue_head_t cpu_run;
+
 	struct kvm_vcpu_arch_shared *shared;
 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 };
 
+#define KVMPPC_VCPU_BUSY_IN_HOST	0
+#define KVMPPC_VCPU_BLOCKED		1
+#define KVMPPC_VCPU_RUNNABLE		2
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index de683fa..ed75975 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -33,6 +33,9 @@
 #else
 #include <asm/kvm_booke.h>
 #endif
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include <asm/paca.h>
+#endif
 
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
@@ -159,4 +162,14 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
 	return r;
 }
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{
+	paca[cpu].xics_phys = addr;
+}
+#else
+static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
+{}
+#endif
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 8dba5f6..8b6628c 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -151,6 +151,8 @@ struct paca_struct {
 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	struct kvm_vcpu *kvm_vcpu;
+	struct kvmppc_vcore *kvm_vcore;
+	unsigned long xics_phys;
 	u64 dabr;
 	u64 host_mmcr[3];
 	u32 host_pmc[6];
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index fd56f14..1fee2cf 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -204,12 +204,14 @@ int main(void)
 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
+	DEFINE(PACA_KVM_VCORE, offsetof(struct paca_struct, kvm_vcore));
 	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
 	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
 	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
 	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, host_spurr));
 	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
 	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
+	DEFINE(PACA_XICS_PHYS, offsetof(struct paca_struct, xics_phys));
 	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, dec_expires));
 #endif
 #endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
@@ -478,6 +480,10 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
+	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
+	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
+	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
 	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 80c6456..803dcff 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -49,19 +49,32 @@ BEGIN_FTR_SECTION
 	 * state loss at this time.
 	 */
 	mfspr	r13,SPRN_SRR1
-	rlwinm	r13,r13,47-31,30,31
-	cmpwi	cr0,r13,1
-	bne	1f
-	b	.power7_wakeup_noloss
-1:	cmpwi	cr0,r13,2
-	bne	1f
-	b	.power7_wakeup_loss
+	rlwinm.	r13,r13,47-31,30,31
+	beq	9f
+
+	/* waking up from powersave (nap) state */
+	cmpwi	cr1,r13,2
 	/* Total loss of HV state is fatal, we could try to use the
 	 * PIR to locate a PACA, then use an emergency stack etc...
 	 * but for now, let's just stay stuck here
 	 */
-1:	cmpwi	cr0,r13,3
-	beq	.
+	bgt	cr1,.
+	GET_PACA(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	lbz	r0,PACAPROCSTART(r13)
+	cmpwi	r0,0x80
+	bne	1f
+	li	r0,0
+	stb	r0,PACAPROCSTART(r13)
+	b	kvm_start_guest
+1:
+#endif
+
+	beq	cr1,2f
+	b	.power7_wakeup_noloss
+2:	b	.power7_wakeup_loss
+9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index f8f0bc7..3a70845 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
 	b	.
 
 _GLOBAL(power7_wakeup_loss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	REST_NVGPRS(r1)
 	REST_GPR(2, r1)
@@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
 	rfid
 
 _GLOBAL(power7_wakeup_noloss)
-	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	ld	r4,_MSR(r1)
 	ld	r5,_NIP(r1)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 6ff191b..27d8e36 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -58,7 +58,7 @@ config KVM_BOOK3S_64
 
 config KVM_BOOK3S_64_HV
 	bool "KVM support for POWER7 using hypervisor mode in host"
-	depends on EXPERIMENTAL && PPC_BOOK3S_64
+	depends on EXPERIMENTAL && PPC_BOOK3S_64 && PPC_PSERIES
 	select KVM
 	select KVM_BOOK3S_64
 	---help---
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index eed2c10..ce006c0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -50,6 +50,7 @@
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	local_paca->kvm_vcpu = vcpu;
+	local_paca->kvm_vcore = vcpu->arch.vcore;
 	vcpu->cpu = cpu;
 }
 
@@ -58,6 +59,9 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
+
 void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	u64 now;
@@ -75,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
 			      HRTIMER_MODE_REL);
 	}
 
+	kvmppc_vcpu_blocked(vcpu);
+
 	kvm_vcpu_block(vcpu);
 	vcpu->stat.halt_wakeup++;
 
 	if (vcpu->arch.dec_expires != ~(u64)0)
 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+	kvmppc_vcpu_unblocked(vcpu);
 }
 
 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -415,6 +423,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
 	struct kvm_vcpu *vcpu;
 	int err = -ENOMEM;
+	int core;
+	struct kvmppc_vcore *vcore;
 	unsigned long lpcr;
 
 	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -443,6 +453,37 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
 	vcpu->arch.lpcr = lpcr;
 
+	/*
+	 * Some vcpus may start out in stopped state.  If we initialize
+	 * them to busy-in-host state they will stop other vcpus in the
+	 * vcore from running.  Instead we initialize them to blocked
+	 * state, effectively considering them to be stopped until we
+	 * see the first run ioctl for them.
+	 */
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+	core = id / KVM_THREADS_PER_CORE;
+
+	mutex_lock(&kvm->lock);
+	vcore = kvm->arch.vcores[core];
+	if (!vcore) {
+		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+		if (vcore)
+			spin_lock_init(&vcore->lock);
+		kvm->arch.vcores[core] = vcore;
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		goto free_vcpu;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	++vcore->n_blocked;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+
 	return vcpu;
 
 free_vcpu:
@@ -457,37 +498,238 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kfree(vcpu);
 }
 
+static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
+	++vc->n_blocked;
+	if (vc->n_runnable > 0 &&
+	    vc->n_runnable + vc->n_blocked == vc->num_threads)
+		wake_up(&vc->runnable_threads[0]->arch.cpu_run);
+	spin_unlock(&vc->lock);
+}
+
+static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	spin_lock(&vc->lock);
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_blocked;
+	spin_unlock(&vc->lock);
+}
+
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void xics_wake_cpu(int cpu);
+
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, int ptid)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	vcpu = vc->runnable_threads[ptid];
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	--vc->n_runnable;
+	for (i = ptid; i < vc->n_runnable; ++i) {
+		vcpu = vc->runnable_threads[i+1];
+		vc->runnable_threads[i] = vcpu;
+		vcpu->arch.ptid = i;
+		vc->run_task[i] = vc->run_task[i+1];
+		vc->kvm_run[i] = vc->kvm_run[i+1];
+	}
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	cpu = vc->pcpu + vcpu->arch.ptid;
+	tpaca = &paca[cpu];
+	tpaca->kvm_vcpu = vcpu;
+	tpaca->kvm_vcore = vc;
+	smp_wmb();
+	if (vcpu->arch.ptid) {
+		tpaca->cpu_start = 0x80;
+		tpaca->shadow_vcpu.in_guest = KVM_GUEST_MODE_GUEST;
+		wmb();
+		xics_wake_cpu(cpu);
+		++vc->n_woken;
+	}
+}
+
+static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+{
+	int i;
+
+	HMT_low();
+	i = 0;
+	while (vc->nap_count < vc->n_woken) {
+		if (++i >= 1000000) {
+			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
+			       vc->nap_count, vc->n_woken);
+			break;
+		}
+		cpu_relax();
+	}
+	HMT_medium();
+}
 
-static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
+static int kvmppc_run_core(struct kvmppc_vcore *vc)
 {
+	struct kvm_vcpu *vcpu;
+	long i, ret;
 	u64 now;
 
+	/* don't start if any threads have a signal pending */
+	for (i = 0; i < vc->n_runnable; ++i) {
+		if (signal_pending(vc->run_task[i]))
+			return 0;
+	}
+
+	vc->n_woken = 0;
+	vc->nap_count = 0;
+	vc->entry_exit_count = 0;
+	vc->vcore_running = 1;
+	vc->in_guest = 0;
+	vc->pcpu = smp_processor_id();
+	for (i = 0; i < vc->n_runnable; ++i)
+		kvmppc_start_thread(vc->runnable_threads[i]);
+	vcpu = vc->runnable_threads[0];
+
+	spin_unlock(&vc->lock);
+
+	kvm_guest_enter();
+	__kvmppc_vcore_entry(NULL, vcpu);
+
+	/* wait for secondary threads to get back to nap mode */
+	spin_lock(&vc->lock);
+	if (vc->nap_count < vc->n_woken)
+		kvmppc_wait_for_nap(vc);
+	vc->vcore_running = 2;
+	spin_unlock(&vc->lock);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
+	kvm_guest_exit();
+
+	preempt_enable();
+	kvm_resched(vcpu);
+
+	now = get_tb();
+	for (i = 0; i < vc->n_runnable; ++i) {
+		vcpu = vc->runnable_threads[i];
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+		if (!vcpu->arch.trap) {
+			if (signal_pending(vc->run_task[i])) {
+				vc->kvm_run[i]->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+			}
+			continue;		/* didn't get to run */
+		}
+		ret = kvmppc_handle_exit(vc->kvm_run[i], vcpu, vc->run_task[i]);
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+	}
+
+	preempt_disable();
+	spin_lock(&vc->lock);
+
+	vc->vcore_running = 0;
+	for (i = 0; i < vc->n_runnable; ) {
+		vcpu = vc->runnable_threads[i];
+		if (vcpu->arch.ret != RESUME_GUEST) {
+			kvmppc_remove_runnable(vc, i);
+			wake_up(&vcpu->arch.cpu_run);
+		} else
+			++i;
+	}
+
+	return 1;
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ptid;
+	int wait_state;
+	struct kvmppc_vcore *vc;
+	DEFINE_WAIT(wait);
+
+	/* No need to go into the guest when all we do is going out */
 	if (signal_pending(current)) {
-		run->exit_reason = KVM_EXIT_INTR;
+		kvm_run->exit_reason = KVM_EXIT_INTR;
 		return -EINTR;
 	}
 
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+
 	flush_fp_to_thread(current);
 	flush_altivec_to_thread(current);
 	flush_vsx_to_thread(current);
 	preempt_disable();
 
-	kvm_guest_enter();
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	/* This happens the first time this is called for a vcpu */
+	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
+		--vc->n_blocked;
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	ptid = vc->n_runnable;
+	vc->runnable_threads[ptid] = vcpu;
+	vc->run_task[ptid] = current;
+	vc->kvm_run[ptid] = kvm_run;
+	vcpu->arch.ptid = ptid;
+	++vc->n_runnable;
+
+	wait_state = TASK_INTERRUPTIBLE;
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		if (signal_pending(current)) {
+			if (!vc->vcore_running) {
+				kvm_run->exit_reason = KVM_EXIT_INTR;
+				vcpu->arch.ret = -EINTR;
+				break;
+			}
+			/* have to wait for vcore to stop executing guest */
+			wait_state = TASK_UNINTERRUPTIBLE;
+			smp_send_reschedule(vc->pcpu);
+		}
 
-	__kvmppc_vcore_entry(NULL, vcpu);
+		if (!vc->vcore_running &&
+		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
+			/* we can run now */
+			if (kvmppc_run_core(vc))
+				continue;
+		}
 
-	kvm_guest_exit();
+		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
+			kvmppc_start_thread(vcpu);
 
-	preempt_enable();
-	kvm_resched(vcpu);
+		/* wait for other threads to come in, or wait for vcore */
+		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+		spin_unlock(&vc->lock);
+		schedule();
+		finish_wait(&vcpu->arch.cpu_run, &wait);
+		spin_lock(&vc->lock);
+	}
 
-	now = get_tb();
-	/* cancel pending dec exception if dec is positive */
-	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
-		kvmppc_core_dequeue_dec(vcpu);
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		kvmppc_remove_runnable(vc, vcpu->arch.ptid);
+	spin_unlock(&vc->lock);
 
-	return kvmppc_handle_exit(run, vcpu, current);
+	return vcpu->arch.ret;
 }
 
 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 95f6386..f1d3779 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -111,6 +111,32 @@ kvmppc_trampoline_enter:
  *                                                                            *
  *****************************************************************************/
 
+#define XICS_XIRR		4
+#define XICS_QIRR		0xc
+
+/*
+ * We come in here when wakened from nap mode on a secondary hw thread.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+	.globl	kvm_start_guest
+kvm_start_guest:
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/* get vcpu pointer */
+	ld	r4, PACA_KVM_VCPU(r13)
+
+	/* We got here with an IPI; clear it */
+	ld	r5, PACA_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	li	r7, XICS_XIRR
+	lwzcix	r8, r5, r7		/* ack the interrupt */
+	sync
+	stbcix	r0, r5, r6		/* clear it */
+	stwcix	r8, r5, r7		/* EOI it */
+
 .global kvmppc_handler_trampoline_enter
 kvmppc_handler_trampoline_enter:
 
@@ -229,7 +255,20 @@ kvmppc_handler_trampoline_enter:
 	slbia
 	ptesync
 
-	/* Switch to guest partition. */
+	/* Increment entry count iff exit count is zero. */
+ 	ld	r5,PACA_KVM_VCORE(r13)
+	addi	r9,r5,VCORE_ENTRY_EXIT
+21:	lwarx	r3,0,r9
+	cmpwi	r3,0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	addi	r3,r3,1
+	stwcx.	r3,0,r9
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	lwz	r6,VCPU_PTID(r4)
+	cmpwi	r6,0
+	bne	20f
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
@@ -239,7 +278,15 @@ kvmppc_handler_trampoline_enter:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
-	ld	r8,VCPU_LPCR(r4)
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+	b	10f
+
+	/* Secondary threads wait for primary to have done partition switch */
+20:	lbz	r0,VCORE_IN_GUEST(r5)
+	cmpwi	r0,0
+	beq	20b
+10:	ld	r8,VCPU_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -254,10 +301,12 @@ kvmppc_handler_trampoline_enter:
 	 * Invalidate the TLB if we could possibly have stale TLB
 	 * entries for this partition on this core due to the use
 	 * of tlbiel.
+	 * XXX maybe only need this on primary thread?
 	 */
 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
 	lwz	r5,VCPU_VCPUID(r4)
 	lhz	r6,PACAPACAINDEX(r13)
+	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
 	lhz	r8,VCPU_LAST_CPU(r4)
 	sldi	r7,r6,1			/* see if this is the same vcpu */
 	add	r7,r7,r9		/* as last ran on this pcpu */
@@ -540,8 +589,51 @@ hcall_real_cont:
 	ptesync
 
 hdec_soon:
-	/* Switch back to host partition */
+	/* Increment the threads-exiting-guest count in the 0xff00
+	   bits of vcore->entry_exit_count */
+	lwsync
+	ld	r5,PACA_KVM_VCORE(r13)
+	addi	r6,r5,VCORE_ENTRY_EXIT
+41:	lwarx	r3,0,r6
+	addi	r0,r3,0x100
+	stwcx.	r0,0,r6
+	bne	41b
+
+	/* If this is not a HDEC interrupt and there are other threads,
+	   and we were the first thread to take an interrupt,
+	   set HDEC to 0 to pull the other threads out of the guest. */
+	cmpwi	r12,0x980
+	beq	40f
+	cmpwi	r3,0x100
+	bge	40f
+	cmpwi	r3,1
+	ble	40f
+	li	r0,0
+	mtspr	SPRN_HDEC,r0
+40:
+
+	/* Secondary threads wait for primary to do partition switch */
 	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+	ld	r5,PACA_KVM_VCORE(r13)
+	lwz	r3,VCPU_PTID(r9)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	srwi	r0,r3,8
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Primary thread switches back to host partition */
 	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
 	li	r8,0x3ff		/* switch to reserved LPID */
@@ -550,10 +642,12 @@ hdec_soon:
 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
 	mtspr	SPRN_LPID,r7
 	isync
+	li	r0,0
+	stb	r0,VCORE_IN_GUEST(r5)
 	lis	r8,0x7fff
 	mtspr	SPRN_HDEC,r8
 
-	ld	r8,KVM_HOST_LPCR(r4)
+16:	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
 
@@ -662,6 +756,11 @@ hdec_soon:
 	mr	r3, r9
 	bl	.kvmppc_save_fp
 
+	/* Secondary threads go off to take a nap */
+	lwz	r0,VCPU_PTID(r3)
+	cmpwi	r0,0
+	bne	secondary_nap
+
 	/* RFI into the highmem handler */
 	mfmsr	r7
 	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
@@ -807,3 +906,53 @@ _GLOBAL(kvmppc_h_set_dabr)
 	mtspr	SPRN_DABR,r4
 	li	r3,0
 	blr
+
+secondary_too_late:
+	ld	r5,PACA_KVM_VCORE(r13)
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	ld	r11,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	ld	r5,SLBSHADOW_SAVEAREA(r11)
+	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r11,r11,16
+	.endr
+	b	50f
+
+secondary_nap:
+	/* Clear any pending IPI */
+50:	ld	r5, PACA_XICS_PHYS(r13)
+	li	r0, 0xff
+	li	r6, XICS_QIRR
+	stbcix	r0, r5, r6
+
+	/* increment the nap count and then go to nap mode */
+	ld	r4, PACA_KVM_VCORE(r13)
+	addi	r4, r4, VCORE_NAP_COUNT
+	lwsync				/* make previous updates visible */
+51:	lwarx	r3, 0, r4
+	addi	r3, r3, 1
+	stwcx.	r3, 0, r4
+	bne	51b
+	isync
+
+	mfspr	r4, SPRN_LPCR
+	li	r0, LPCR_PECE
+	andc	r4, r4, r0
+	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
+	mtspr	SPRN_LPCR, r4
+	li	r0, 0
+	std	r0, SHADOW_VCPU_OFF + SVCPU_SCRATCH0(r13)
+	ptesync
+	ld	r0, SHADOW_VCPU_OFF + SVCPU_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index be5e3d7..01149c0 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -23,6 +23,7 @@
 #include <asm/irq.h>
 #include <asm/errno.h>
 #include <asm/xics.h>
+#include <asm/kvm_ppc.h>
 
 struct icp_ipl {
 	union {
@@ -142,6 +143,11 @@ static inline void icp_native_do_message(int cpu, int msg)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+void xics_wake_cpu(int cpu)
+{
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
+}
+
 static void icp_native_message_pass(int target, int msg)
 {
 	unsigned int i;
@@ -204,6 +210,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
 	}
 
 	icp_native_regs[cpu] = ioremap(addr, size);
+	kvmppc_set_xics_phys(cpu, addr);
 	if (!icp_native_regs[cpu]) {
 		pr_warning("icp_native: Failed ioremap for CPU %d, "
 			   "interrupt server #0x%x, addr %#lx\n",
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3d3cdf1..952d556 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -550,6 +550,9 @@ struct kvm_ppc_pvinfo {
 #ifdef __KVM_HAVE_SPAPR_TCE
 #define KVM_CAP_SPAPR_TCE 60
 #endif
+#ifdef __KVM_HAVE_PPC_SMT
+#define KVM_CAP_PPC_SMT 61
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-11 10:46 ` [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes Paul Mackerras
@ 2011-05-11 13:44   ` Christoph Hellwig
  2011-05-11 21:17       ` Paul Mackerras
  2011-05-17  8:21     ` Alexander Graf
  1 sibling, 1 reply; 103+ messages in thread
From: Christoph Hellwig @ 2011-05-11 13:44 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Alexander Graf, kvm

On Wed, May 11, 2011 at 08:46:56PM +1000, Paul Mackerras wrote:
> arch/powerpc/sysdev/xics/icp-native.c.

What kernel tree do I need to actually have that file?

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-11 13:44   ` Christoph Hellwig
@ 2011-05-11 21:17       ` Paul Mackerras
  0 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-11 21:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linuxppc-dev, kvm, Alexander Graf

On Wed, May 11, 2011 at 03:44:15PM +0200, Christoph Hellwig wrote:
> On Wed, May 11, 2011 at 08:46:56PM +1000, Paul Mackerras wrote:
> > arch/powerpc/sysdev/xics/icp-native.c.
> 
> What kernel tree do I need to actually have that file?

The "next" branch of
git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-11 10:44 ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
@ 2011-05-12  9:07     ` Avi Kivity
  2011-05-15 21:58     ` Alexander Graf
  1 sibling, 0 replies; 103+ messages in thread
From: Avi Kivity @ 2011-05-12  9:07 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm, Alexander Graf

On 05/11/2011 01:44 PM, Paul Mackerras wrote:
> This adds support for KVM running on 64-bit Book 3S processors,
> specifically POWER7, in hypervisor mode.  Using hypervisor mode means
> that the guest can use the processor's supervisor mode.  That means
> that the guest can execute privileged instructions and access privileged
> registers itself without trapping to the host.  This gives excellent
> performance, but does mean that KVM cannot emulate a processor
> architecture other than the one that the hardware implements.
>
> This code assumes that the guest is running paravirtualized using the
> PAPR (Power Architecture Platform Requirements) interface, which is the
> interface that IBM's PowerVM hypervisor uses.  That means that existing
> Linux distributions that run on IBM pSeries machines will also run
> under KVM without modification.  In order to communicate the PAPR
> hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
> to include/linux/kvm.h.
>
> Currently the choice between book3s_hv support and book3s_pr support
> (i.e. the existing code, which runs the guest in user mode) has to be
> made at kernel configuration time, so a given kernel binary can only
> do one or the other.
>
> This new book3s_hv code doesn't support MMIO emulation at present.
> Since we are running paravirtualized guests, this isn't a serious
> restriction.
>
> With the guest running in supervisor mode, most exceptions go straight
> to the guest.  We will never get data or instruction storage or segment
> interrupts, alignment interrupts, decrementer interrupts, program
> interrupts, single-step interrupts, etc., coming to the hypervisor from
> the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
> exception entry path so that we don't have to do the KVM test on entry
> to those exception handlers.
>
> We do however get hypervisor decrementer, hypervisor data storage,
> hypervisor instruction storage, and hypervisor emulation assist
> interrupts, so we have to handle those.
>
> In hypervisor mode, real-mode accesses can access all of RAM, not just
> a limited amount.  Therefore we put all the guest state in the vcpu.arch
> and use the shadow_vcpu in the PACA only for temporary scratch space.
> We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
> anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
>
> The POWER7 processor has a restriction that all threads in a core have
> to be in the same partition.  MMU-on kernel code counts as a partition
> (partition 0), so we have to do a partition switch on every entry to and
> exit from the guest.  At present we require the host and guest to run
> in single-thread mode because of this hardware restriction.
>
> This code allocates a hashed page table for the guest and initializes
> it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
> require that the guest memory is allocated using 16MB huge pages, in
> order to simplify the low-level memory management.  This also means that
> we can get away without tracking paging activity in the host for now,
> since huge pages can't be paged or swapped.
>
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..a4447ce 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
>   #define KVM_EXIT_NMI              16
>   #define KVM_EXIT_INTERNAL_ERROR   17
>   #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_PAPR_HCALL	  19
>
>   /* For KVM_EXIT_INTERNAL_ERROR */
>   #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,11 @@ struct kvm_run {
>   		struct {
>   			__u64 gprs[32];
>   		} osi;
> +		struct {
> +			__u64 nr;
> +			__u64 ret;
> +			__u64 args[9];
> +		} papr_hcall;
>   		/* Fix the size of the union. */
>   		char padding[256];
>   	};

Please document this in Documentation/kvm/api.txt.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 103+ messages in thread
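
For reference, a minimal sketch (not part of the patch series) of how a
userspace VMM such as qemu might service this new exit, based on the
papr_hcall layout quoted above.  The vcpu fd and the mmap'ed kvm_run area
are assumed to have been set up in the usual way, and handle_hcall() is a
purely illustrative dispatcher; presumably the hypercall status is handed
back to the guest via the ret field before re-entering KVM_RUN.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative dispatcher supplied by the VMM; returns the status that
 * should be reported back to the guest for this hypercall. */
extern __u64 handle_hcall(__u64 nr, __u64 *args);

static int run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	int ret = ioctl(vcpu_fd, KVM_RUN, 0);

	if (ret < 0)
		return ret;
	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
		/* nr selects the hypercall, args[] carries its arguments */
		run->papr_hcall.ret = handle_hcall(run->papr_hcall.nr,
						   run->papr_hcall.args);
	}
	return 0;
}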

* Re: [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors
  2011-05-11 10:39 ` [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors Paul Mackerras
@ 2011-05-12  9:33     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-12  9:33 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


Am 11.05.2011 um 12:39 schrieb Paul Mackerras <paulus@samba.org>:

> Commits a5d4f3ad3a ("powerpc: Base support for exceptions using
> HSRR0/1") and 673b189a2e ("powerpc: Always use SPRN_SPRG_HSCRATCH0
> when running in HV mode") cause compile and link errors for 32-bit
> classic Book 3S processors when KVM is enabled.  This fixes these
> errors.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/reg.h       |    5 +++++
> arch/powerpc/kvm/book3s_rmhandlers.S |    2 ++
> 2 files changed, 7 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 47e3416..05658b7 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -823,6 +823,11 @@
>    FTR_SECTION_ELSE_NESTED(66);            \
>    mtspr    SPRN_SPRG_HSCRATCH0,rX;            \
>    ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
> +
> +#else /* CONFIG_PPC_BOOK3S_64 */
> +#define GET_SCRATCH0(rX)    mfspr    rX,SPRN_SPRG_SCRATCH0
> +#define SET_SCRATCH0(rX)    mtspr    SPRN_SPRG_SCRATCH0,rX
> +
> #endif
> 
> #ifdef CONFIG_PPC_BOOK3E_64
> diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
> index ae99af6..1a1b344 100644
> --- a/arch/powerpc/kvm/book3s_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_rmhandlers.S
> @@ -112,7 +112,9 @@ INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_MACHINE_CHECK
> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_DATA_STORAGE
> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_INST_STORAGE
> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL
> +#ifdef CONFIG_PPC_BOOK3S_64
> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL_HV

Hrm - I don't remember putting this one here. When did it get into the tree and why wasn't I CC'ed?

Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors
  2011-05-12  9:33     ` Alexander Graf
@ 2011-05-12 11:15       ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-12 11:15 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm, benh

On Thu, May 12, 2011 at 11:33:00AM +0200, Alexander Graf wrote:
> 
> Am 11.05.2011 um 12:39 schrieb Paul Mackerras <paulus@samba.org>:
> 
> > diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
> > index ae99af6..1a1b344 100644
> > --- a/arch/powerpc/kvm/book3s_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_rmhandlers.S
> > @@ -112,7 +112,9 @@ INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_MACHINE_CHECK
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_DATA_STORAGE
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_INST_STORAGE
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL
> > +#ifdef CONFIG_PPC_BOOK3S_64
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL_HV
> 
> Hrm - I don't remember putting this one here. When did it get into
> the tree and why wasn't I CC'ed?

It comes from commit a5d4f3ad3a ("powerpc: Base support for exceptions
using HSRR0/1", author Ben H.) in Ben's next branch.  He committed it
on April 20.

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors
  2011-05-12  9:33     ` Alexander Graf
@ 2011-05-12 11:16       ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 103+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-12 11:16 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Paul Mackerras, linuxppc-dev, kvm

On Thu, 2011-05-12 at 11:33 +0200, Alexander Graf wrote:
> Am 11.05.2011 um 12:39 schrieb Paul Mackerras <paulus@samba.org>:
> 
> > Commits a5d4f3ad3a ("powerpc: Base support for exceptions using
> > HSRR0/1") and 673b189a2e ("powerpc: Always use SPRN_SPRG_HSCRATCH0
> > when running in HV mode") cause compile and link errors for 32-bit
> > classic Book 3S processors when KVM is enabled.  This fixes these
> > errors.
> > 
> > Signed-off-by: Paul Mackerras <paulus@samba.org>
> > ---
> > arch/powerpc/include/asm/reg.h       |    5 +++++
> > arch/powerpc/kvm/book3s_rmhandlers.S |    2 ++
> > 2 files changed, 7 insertions(+), 0 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> > index 47e3416..05658b7 100644
> > --- a/arch/powerpc/include/asm/reg.h
> > +++ b/arch/powerpc/include/asm/reg.h
> > @@ -823,6 +823,11 @@
> >    FTR_SECTION_ELSE_NESTED(66);            \
> >    mtspr    SPRN_SPRG_HSCRATCH0,rX;            \
> >    ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
> > +
> > +#else /* CONFIG_PPC_BOOK3S_64 */
> > +#define GET_SCRATCH0(rX)    mfspr    rX,SPRN_SPRG_SCRATCH0
> > +#define SET_SCRATCH0(rX)    mtspr    SPRN_SPRG_SCRATCH0,rX
> > +
> > #endif
> > 
> > #ifdef CONFIG_PPC_BOOK3E_64
> > diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
> > index ae99af6..1a1b344 100644
> > --- a/arch/powerpc/kvm/book3s_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_rmhandlers.S
> > @@ -112,7 +112,9 @@ INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_MACHINE_CHECK
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_DATA_STORAGE
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_INST_STORAGE
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL
> > +#ifdef CONFIG_PPC_BOOK3S_64
> > INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL_HV
> 
> Hrm - I don't remember putting this one here. When did it get into the tree and why wasn't I CC'ed?

Because I did and I forgot :-)

The patch in question only marginally touched kvm; it's one of a series
that reworks the ppc64 exception vectors to operate better on modern
CPUs running in HV mode (dealing with HSRR's vs SRR's etc...), and it needed
a small fixup to the KVM code due to 0x500 becoming an "H" interrupt
(using HSRR's) on these.

Unfortunately, it looks like I didn't have KVM enabled in any of my
32-bit test configs and missed that little breakage.

I should have CCed you I suppose, I simply forgot as it wasn't primarily
a KVM related patch.

Cheers,
Ben.


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors
  2011-05-12 11:16       ` Benjamin Herrenschmidt
@ 2011-05-12 11:57         ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-12 11:57 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Paul Mackerras, linuxppc-dev, kvm


Am 12.05.2011 um 13:16 schrieb Benjamin Herrenschmidt <benh@kernel.crashing.org>:

> On Thu, 2011-05-12 at 11:33 +0200, Alexander Graf wrote:
>> Am 11.05.2011 um 12:39 schrieb Paul Mackerras <paulus@samba.org>:
>> 
>>> Commits a5d4f3ad3a ("powerpc: Base support for exceptions using
>>> HSRR0/1") and 673b189a2e ("powerpc: Always use SPRN_SPRG_HSCRATCH0
>>> when running in HV mode") cause compile and link errors for 32-bit
>>> classic Book 3S processors when KVM is enabled.  This fixes these
>>> errors.
>>> 
>>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>>> ---
>>> arch/powerpc/include/asm/reg.h       |    5 +++++
>>> arch/powerpc/kvm/book3s_rmhandlers.S |    2 ++
>>> 2 files changed, 7 insertions(+), 0 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
>>> index 47e3416..05658b7 100644
>>> --- a/arch/powerpc/include/asm/reg.h
>>> +++ b/arch/powerpc/include/asm/reg.h
>>> @@ -823,6 +823,11 @@
>>>   FTR_SECTION_ELSE_NESTED(66);            \
>>>   mtspr    SPRN_SPRG_HSCRATCH0,rX;            \
>>>   ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66)
>>> +
>>> +#else /* CONFIG_PPC_BOOK3S_64 */
>>> +#define GET_SCRATCH0(rX)    mfspr    rX,SPRN_SPRG_SCRATCH0
>>> +#define SET_SCRATCH0(rX)    mtspr    SPRN_SPRG_SCRATCH0,rX
>>> +
>>> #endif
>>> 
>>> #ifdef CONFIG_PPC_BOOK3E_64
>>> diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
>>> index ae99af6..1a1b344 100644
>>> --- a/arch/powerpc/kvm/book3s_rmhandlers.S
>>> +++ b/arch/powerpc/kvm/book3s_rmhandlers.S
>>> @@ -112,7 +112,9 @@ INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_MACHINE_CHECK
>>> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_DATA_STORAGE
>>> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_INST_STORAGE
>>> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL
>>> +#ifdef CONFIG_PPC_BOOK3S_64
>>> INTERRUPT_TRAMPOLINE    BOOK3S_INTERRUPT_EXTERNAL_HV
>> 
>> Hrm - I don't remember putting this one here. When did it get into the tree and why wasn't I CC'ed?
> 
> Because I did and I forgot :-)
> 
> The patch in question only marginally touched kvm, it's one in a series
> that rework of the ppc64 exception vectors to better operate on modern
> CPUs running in HV mode (deal with HSRR's vs SRR's etc...) and it needed
> a small fixup to the KVM code due to 0x500 becoming "H" interrupts
> (using HSRR's) on these.
> 
> Unfortunately, it looks like I didn't have KVM enabled in any of my
> 32-bit test configs and missed that little breakage.
> 
> I should have CCed you I suppose, I simply forgot as it wasn't primarily
> a KVM related patch.

Alright :). Don't worry too much about it - just wanted to make sure you'll remember next time ;)

Alex

> 

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-11 10:44 ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
  2011-05-12  9:07     ` Avi Kivity
@ 2011-05-15 21:58     ` Alexander Graf
  1 sibling, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-15 21:58 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc


On 11.05.2011, at 12:44, Paul Mackerras wrote:

> This adds support for KVM running on 64-bit Book 3S processors,
> specifically POWER7, in hypervisor mode.  Using hypervisor mode means
> that the guest can use the processor's supervisor mode.  That means
> that the guest can execute privileged instructions and access privileged
> registers itself without trapping to the host.  This gives excellent
> performance, but does mean that KVM cannot emulate a processor
> architecture other than the one that the hardware implements.
> 
> This code assumes that the guest is running paravirtualized using the
> PAPR (Power Architecture Platform Requirements) interface, which is the
> interface that IBM's PowerVM hypervisor uses.  That means that existing
> Linux distributions that run on IBM pSeries machines will also run
> under KVM without modification.  In order to communicate the PAPR
> hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
> to include/linux/kvm.h.
> 
> Currently the choice between book3s_hv support and book3s_pr support
> (i.e. the existing code, which runs the guest in user mode) has to be
> made at kernel configuration time, so a given kernel binary can only
> do one or the other.
> 
> This new book3s_hv code doesn't support MMIO emulation at present.
> Since we are running paravirtualized guests, this isn't a serious
> restriction.
> 
> With the guest running in supervisor mode, most exceptions go straight
> to the guest.  We will never get data or instruction storage or segment
> interrupts, alignment interrupts, decrementer interrupts, program
> interrupts, single-step interrupts, etc., coming to the hypervisor from
> the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
> exception entry path so that we don't have to do the KVM test on entry
> to those exception handlers.
> 
> We do however get hypervisor decrementer, hypervisor data storage,
> hypervisor instruction storage, and hypervisor emulation assist
> interrupts, so we have to handle those.
> 
> In hypervisor mode, real-mode accesses can access all of RAM, not just
> a limited amount.  Therefore we put all the guest state in the vcpu.arch
> and use the shadow_vcpu in the PACA only for temporary scratch space.
> We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
> anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
> 
> The POWER7 processor has a restriction that all threads in a core have
> to be in the same partition.  MMU-on kernel code counts as a partition
> (partition 0), so we have to do a partition switch on every entry to and
> exit from the guest.  At present we require the host and guest to run
> in single-thread mode because of this hardware restriction.
> 
> This code allocates a hashed page table for the guest and initializes
> it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
> require that the guest memory is allocated using 16MB huge pages, in
> order to simplify the low-level memory management.  This also means that
> we can get away without tracking paging activity in the host for now,
> since huge pages can't be paged or swapped.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/exception-64s.h  |   27 +-
> arch/powerpc/include/asm/kvm_asm.h        |    4 +
> arch/powerpc/include/asm/kvm_book3s.h     |  148 ++++++-
> arch/powerpc/include/asm/kvm_book3s_asm.h |    4 +-
> arch/powerpc/include/asm/kvm_booke.h      |    4 +
> arch/powerpc/include/asm/kvm_host.h       |   60 +++-
> arch/powerpc/include/asm/kvm_ppc.h        |    6 +
> arch/powerpc/include/asm/mmu-hash64.h     |   10 +-
> arch/powerpc/include/asm/paca.h           |   10 +
> arch/powerpc/include/asm/reg.h            |    4 +
> arch/powerpc/kernel/asm-offsets.c         |   95 ++++-
> arch/powerpc/kernel/exceptions-64s.S      |   60 ++--
> arch/powerpc/kvm/Kconfig                  |   40 ++-
> arch/powerpc/kvm/Makefile                 |   16 +-
> arch/powerpc/kvm/book3s_64_mmu_hv.c       |  258 +++++++++++
> arch/powerpc/kvm/book3s_hv.c              |  413 ++++++++++++++++++
> arch/powerpc/kvm/book3s_hv_interrupts.S   |  326 ++++++++++++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  663 +++++++++++++++++++++++++++++
> arch/powerpc/kvm/powerpc.c                |   23 +-
> arch/powerpc/kvm/trace.h                  |    2 +-
> include/linux/kvm.h                       |    6 +
> 21 files changed, 2094 insertions(+), 85 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
> create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S
> 
> diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
> index 2a770d8..d32e1ef 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -65,14 +65,14 @@
> 	GET_PACA(r13);							\
> 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
> 	std	r10,area+EX_R10(r13);					\
> -	mfcr	r9;							\
> -	extra(vec);							\
> -	std	r11,area+EX_R11(r13);					\
> -	std	r12,area+EX_R12(r13);					\
> 	BEGIN_FTR_SECTION_NESTED(66);					\
> 	mfspr	r10,SPRN_CFAR;						\
> 	std	r10,area+EX_CFAR(r13);					\
> 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
> +	mfcr	r9;							\
> +	extra(vec);							\
> +	std	r11,area+EX_R11(r13);					\
> +	std	r12,area+EX_R12(r13);					\


I don't really understand that change :).

> 	GET_SCRATCH0(r10);						\
> 	std	r10,area+EX_R13(r13)
> #define EXCEPTION_PROLOG_1(area, extra, vec)				\
> @@ -134,6 +134,17 @@ do_kvm_##n:								\
> #define KVM_HANDLER_SKIP(area, h, n)
> #endif
> 
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

I really liked how you called the .c file _pr - why call it NONHV now?

> +#define KVMTEST_NONHV(n)		__KVMTEST(n)
> +#define KVM_HANDLER_NONHV(area, h, n)	__KVM_HANDLER(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
> +
> +#else
> +#define KVMTEST_NONHV(n)
> +#define KVM_HANDLER_NONHV(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)
> +#endif
> +
> #define NOTEST(n)
> 
> /*
> @@ -210,7 +221,7 @@ label##_pSeries:					\
> 	HMT_MEDIUM;					\
> 	SET_SCRATCH0(r13);		/* save r13 */		\
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
> -				 EXC_STD, KVMTEST, vec)
> +				 EXC_STD, KVMTEST_NONHV, vec)
> 
> #define STD_EXCEPTION_HV(loc, vec, label)		\
> 	. = loc;					\
> @@ -227,8 +238,8 @@ label##_hv:						\
> 	beq	masked_##h##interrupt
> #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
> 
> -#define SOFTEN_TEST(vec)						\
> -	KVMTEST(vec);							\
> +#define SOFTEN_TEST_NONHV(vec)						\
> +	KVMTEST_NONHV(vec);						\
> 	_SOFTEN_TEST(EXC_STD)
> 
> #define SOFTEN_TEST_HV(vec)						\
> @@ -248,7 +259,7 @@ label##_hv:						\
> 	.globl label##_pSeries;						\
> label##_pSeries:							\
> 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
> -				    EXC_STD, SOFTEN_TEST)
> +				    EXC_STD, SOFTEN_TEST_NONHV)
> 
> #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
> 	. = loc;							\
> diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
> index 0951b17..7b1f0e0 100644
> --- a/arch/powerpc/include/asm/kvm_asm.h
> +++ b/arch/powerpc/include/asm/kvm_asm.h
> @@ -64,8 +64,12 @@
> #define BOOK3S_INTERRUPT_PROGRAM	0x700
> #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
> #define BOOK3S_INTERRUPT_DECREMENTER	0x900
> +#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
> #define BOOK3S_INTERRUPT_SYSCALL	0xc00
> #define BOOK3S_INTERRUPT_TRACE		0xd00
> +#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
> +#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
> +#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
> #define BOOK3S_INTERRUPT_PERFMON	0xf00
> #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
> #define BOOK3S_INTERRUPT_VSX		0xf40
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 12829bb..5b76073 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -117,6 +117,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
> extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
> extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
> extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
> +extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
> extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
> extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
> extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
> @@ -128,10 +129,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
> extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
> extern int kvmppc_mmu_hpte_sysinit(void);
> extern void kvmppc_mmu_hpte_sysexit(void);
> +extern int kvmppc_mmu_hv_init(void);
> 
> extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
> extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
> extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
> +extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
> extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
> 			   bool upper, u32 val);
> extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
> @@ -152,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
> 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
> }
> 
> +extern void kvm_return_point(void);
> +
> +/* Also add subarch specific defines */
> +
> +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> +#include <asm/kvm_book3s_32.h>
> +#endif
> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#include <asm/kvm_book3s_64.h>
> +#endif
> +
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +
> #define vcpu_guest_state(vcpu)	((vcpu)->arch.shared)
> 
> static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
> @@ -168,16 +184,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> 		vcpu_guest_state(vcpu)->int_pending = 0;
> }
> 
> -static inline ulong dsisr(void)
> -{
> -	ulong r;
> -	asm ( "mfdsisr %0 " : "=r" (r) );
> -	return r;
> -}
> -
> -extern void kvm_return_point(void);
> -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
> -
> static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
> {
> 	if ( num < 14 ) {
> @@ -265,6 +271,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> 	return to_svcpu(vcpu)->fault_dar;
> }
> 
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> +
> static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> {
> 	ulong crit_raw = vcpu_guest_state(vcpu)->critical;
> @@ -284,6 +295,115 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> 
> 	return crit;
> }
> +#else /* CONFIG_KVM_BOOK3S_NONHV */
> +
> +#define vcpu_guest_state(vcpu)	(&(vcpu)->arch)
> +
> +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
> +{
> +	return 0;
> +}
> +
> +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> +			unsigned long pending_now, unsigned long old_pending)
> +{
> +	/* Recalculate LPCR:MER based on the presence of
> +	 * a pending external interrupt
> +	 */
> +	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions) ||
> +	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions))

Wasn't pending_now pending_exceptions?

> +		vcpu->arch.lpcr |= LPCR_MER;
> +	else
> +		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
> +}
> +
> +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
> +{
> +	vcpu->arch.gpr[num] = val;
> +}
> +
> +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
> +{
> +	return vcpu->arch.gpr[num];
> +}
> +
> +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.cr = val;
> +}
> +
> +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.cr;
> +}
> +
> +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.xer = val;
> +}
> +
> +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.xer;
> +}
> +
> +static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.ctr = val;
> +}
> +
> +static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.ctr;
> +}
> +
> +static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.lr = val;
> +}
> +
> +static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.lr;
> +}
> +
> +static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.pc = val;
> +}
> +
> +static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.pc;
> +}
> +
> +static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
> +{
> +	ulong pc = kvmppc_get_pc(vcpu);
> +
> +	/* Load the instruction manually if it failed to do so in the
> +	 * exit path */
> +	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
> +		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
> +
> +	return vcpu->arch.last_inst;
> +}
> +
> +static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.fault_dar;
> +}
> +
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.msr;
> +}
> +
> +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> +{
> +	return false;
> +}
> +#endif
> 
> /* Magic register values loaded into r3 and r4 before the 'sc' assembly
>  * instruction for the OSI hypercalls */
> @@ -292,12 +412,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> 
> #define INS_DCBZ			0x7c0007ec
> 
> -/* Also add subarch specific defines */
> -
> -#ifdef CONFIG_PPC_BOOK3S_32
> -#include <asm/kvm_book3s_32.h>
> -#else
> -#include <asm/kvm_book3s_64.h>
> -#endif
> -
> #endif /* __ASM_KVM_BOOK3S_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index d5a8a38..d7279f5 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -61,6 +61,7 @@ kvmppc_resume_\intno:
> #else  /*__ASSEMBLY__ */
> 
> struct kvmppc_book3s_shadow_vcpu {
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

Do you really want any shadow_vcpu in the paca at all?

> 	ulong gpr[14];
> 	u32 cr;
> 	u32 xer;
> @@ -72,6 +73,7 @@ struct kvmppc_book3s_shadow_vcpu {
> 	ulong pc;
> 	ulong shadow_srr1;
> 	ulong fault_dar;
> +#endif
> 
> 	ulong host_r1;
> 	ulong host_r2;
> @@ -84,7 +86,7 @@ struct kvmppc_book3s_shadow_vcpu {
> #ifdef CONFIG_PPC_BOOK3S_32
> 	u32     sr[16];			/* Guest SRs */
> #endif
> -#ifdef CONFIG_PPC_BOOK3S_64
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_NONHV)
> 	u8 slb_max;			/* highest used guest slb entry */
> 	struct  {
> 		u64     esid;
> diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
> index 9c9ba3d..a90e091 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> 	return vcpu->arch.fault_dear;
> }
> 
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> #endif /* __ASM_KVM_BOOKE_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 3ebe51b..ec62365 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -33,7 +33,9 @@
> /* memory slots that does not exposed to userspace */
> #define KVM_PRIVATE_MEM_SLOTS 4
> 
> +#ifdef CONFIG_KVM_MMIO
> #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
> +#endif
> 
> /* We don't currently support large pages. */
> #define KVM_HPAGE_GFN_SHIFT(x)	0
> @@ -133,7 +135,24 @@ struct kvmppc_exit_timing {
> 	};
> };
> 
> +struct kvmppc_pginfo {
> +	unsigned long pfn;
> +	atomic_t refcnt;
> +};
> +
> struct kvm_arch {
> +	unsigned long hpt_virt;
> +	unsigned long ram_npages;
> +	unsigned long ram_psize;
> +	unsigned long ram_porder;
> +	struct kvmppc_pginfo *ram_pginfo;
> +	unsigned int lpid;
> +	unsigned int host_lpid;
> +	unsigned long host_lpcr;
> +	unsigned long sdr1;
> +	unsigned long host_sdr1;
> +	int tlbie_lock;
> +	unsigned short last_vcpu[NR_CPUS];

This should all be #ifdef CONFIG_KVM_BOOK3S_HV

> };
> 
> struct kvmppc_pte {
> @@ -190,7 +209,7 @@ struct kvm_vcpu_arch {
> 	ulong rmcall;
> 	ulong host_paca_phys;
> 	struct kvmppc_slb slb[64];
> -	int slb_max;		/* # valid entries in slb[] */
> +	int slb_max;		/* 1 + index of last valid entry in slb[] */
> 	int slb_nr;		/* total number of entries in SLB */
> 	struct kvmppc_mmu mmu;
> #endif
> @@ -204,9 +223,10 @@ struct kvm_vcpu_arch {
> 	vector128 vr[32];
> 	vector128 vscr;
> #endif
> +	u32 vrsave;
> 
> #ifdef CONFIG_VSX
> -	u64 vsr[32];
> +	u64 vsr[64];
> #endif
> 
> #ifdef CONFIG_PPC_BOOK3S
> @@ -214,29 +234,45 @@ struct kvm_vcpu_arch {
> 	u32 qpr[32];
> #endif
> 
> -#ifdef CONFIG_BOOKE
> -	ulong pc;
> 	ulong ctr;
> 	ulong lr;
> 
> 	ulong xer;
> 	u32 cr;
> -#endif
> +
> +	ulong pc;
> +	ulong msr;
> 
> #ifdef CONFIG_PPC_BOOK3S
> 	ulong shadow_msr;
> 	ulong hflags;
> 	ulong guest_owned_ext;
> +	ulong purr;
> +	ulong spurr;
> +	ulong lpcr;
> +	ulong dscr;
> +	ulong amr;
> +	ulong uamor;
> +	u32 ctrl;
> +	u32 dsisr;
> +	ulong dabr;
> #endif
> 	u32 mmucr;
> +	ulong sprg0;
> +	ulong sprg1;
> +	ulong sprg2;
> +	ulong sprg3;
> 	ulong sprg4;
> 	ulong sprg5;
> 	ulong sprg6;
> 	ulong sprg7;
> +	ulong srr0;
> +	ulong srr1;
> 	ulong csrr0;
> 	ulong csrr1;
> 	ulong dsrr0;
> 	ulong dsrr1;
> +	ulong dear;
> 	ulong esr;
> 	u32 dec;
> 	u32 decar;
> @@ -259,6 +295,9 @@ struct kvm_vcpu_arch {
> 	u32 dbcr1;
> 	u32 dbsr;
> 
> +	u64 mmcr[3];
> +	u32 pmc[6];
> +
> #ifdef CONFIG_KVM_EXIT_TIMING
> 	struct kvmppc_exit_timing timing_exit;
> 	struct kvmppc_exit_timing timing_last_enter;
> @@ -272,8 +311,12 @@ struct kvm_vcpu_arch {
> 	struct dentry *debugfs_exit_timing;
> #endif
> 
> +#ifdef CONFIG_PPC_BOOK3S
> +	ulong fault_dar;
> +	u32 fault_dsisr;
> +#endif
> +
> #ifdef CONFIG_BOOKE
> -	u32 last_inst;
> 	ulong fault_dear;
> 	ulong fault_esr;
> 	ulong queued_dear;
> @@ -288,13 +331,18 @@ struct kvm_vcpu_arch {
> 	u8 dcr_is_write;
> 	u8 osi_needed;
> 	u8 osi_enabled;
> +	u8 hcall_needed;
> 
> 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
> 
> 	struct hrtimer dec_timer;
> 	struct tasklet_struct tasklet;
> 	u64 dec_jiffies;
> +	u64 dec_expires;
> 	unsigned long pending_exceptions;
> +	u16 last_cpu;
> +	u32 last_inst;
> +	int trap;
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
> 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 3210911..cd9ad96 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -110,6 +110,12 @@ extern void kvmppc_booke_exit(void);
> extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
> extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
> 
> +extern long kvmppc_alloc_hpt(struct kvm *kvm);
> +extern void kvmppc_free_hpt(struct kvm *kvm);
> +extern long kvmppc_prepare_vrma(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem);
> +extern void kvmppc_map_vrma(struct kvm *kvm,
> +			    struct kvm_userspace_memory_region *mem);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index ae7b3ef..0bb3fc1 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -90,13 +90,19 @@ extern char initial_stab[];
> 
> #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
> #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
> +#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
> #define HPTE_R_RPN_SHIFT	12
> -#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
> -#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
> +#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
> #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
> #define HPTE_R_N		ASM_CONST(0x0000000000000004)
> +#define HPTE_R_G		ASM_CONST(0x0000000000000008)
> +#define HPTE_R_M		ASM_CONST(0x0000000000000010)
> +#define HPTE_R_I		ASM_CONST(0x0000000000000020)
> +#define HPTE_R_W		ASM_CONST(0x0000000000000040)
> +#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
> #define HPTE_R_C		ASM_CONST(0x0000000000000080)
> #define HPTE_R_R		ASM_CONST(0x0000000000000100)
> +#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
> 
> #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
> #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 7412676..8dba5f6 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -149,6 +149,16 @@ struct paca_struct {
> #ifdef CONFIG_KVM_BOOK3S_HANDLER
> 	/* We use this to store guest state in */
> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	struct kvm_vcpu *kvm_vcpu;
> +	u64 dabr;
> +	u64 host_mmcr[3];
> +	u32 host_pmc[6];
> +	u64 host_purr;
> +	u64 host_spurr;
> +	u64 host_dscr;
> +	u64 dec_expires;

Hrm. I'd say either push those into shadow_vcpu for HV mode or get rid of the shadow_vcpu reference. I'd probably prefer the former.

> +#endif
> #endif
> };
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index c07b7be..0036977 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -189,6 +189,10 @@
> #define SPRN_CTR	0x009	/* Count Register */
> #define SPRN_DSCR	0x11
> #define SPRN_CFAR	0x1c	/* Come From Address Register */
> +#define SPRN_AMR	0x1d	/* Authority Mask Register */
> +#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
> +#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
> +#define SPRN_RWMR	885
> #define SPRN_CTRLF	0x088
> #define SPRN_CTRLT	0x098
> #define   CTRL_CT	0xc0000000	/* current thread */
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 6887661..49e97fd 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -187,6 +187,7 @@ int main(void)
> 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
> 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
> 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
> +	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
> 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
> 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
> #endif /* CONFIG_PPC_STD_MMU_64 */
> @@ -200,9 +201,17 @@ int main(void)
> 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
> -	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
> -	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
> +	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
> +	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
> +	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
> +	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, host_spurr));
> +	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
> +	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
> +	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, dec_expires));
> #endif
> +#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
> #endif /* CONFIG_PPC64 */
> 
> 	/* RTAS */
> @@ -396,6 +405,28 @@ int main(void)
> 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
> 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
> 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
> +	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
> +	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
> +#ifdef CONFIG_ALTIVEC
> +	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
> +	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
> +#endif
> +#ifdef CONFIG_VSX
> +	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
> +#endif
> +	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
> +	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
> +	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
> +	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
> +	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
> +	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
> +	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
> +	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.srr0));
> +	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.srr1));
> +	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.sprg0));
> +	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.sprg1));
> +	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.sprg2));
> +	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.sprg3));
> 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
> 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
> 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
> @@ -406,16 +437,65 @@ int main(void)
> 
> 	/* book3s */
> #ifdef CONFIG_PPC_BOOK3S
> +	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
> +	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
> +	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
> +	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
> +	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
> +	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
> +	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
> +	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
> +	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
> +	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
> 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
> 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
> 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
> +	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
> +	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
> +	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
> +	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
> +	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
> +	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
> +	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
> 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
> 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
> 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
> 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
> 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
> +	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.dsisr));
> +	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.dear));
> +	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
> +	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
> +	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
> +	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
> +	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
> +	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> +	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
> +	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
> +	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
> +	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
> +	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
> +	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
> 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
> 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
> +	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
> +	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
> +	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					scratch0));
> +	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					scratch1));
> +	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					in_guest));
> +	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
> +	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
> +	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +#ifdef CONFIG_PPC64
> +	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
> +	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
> +#endif
> +	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					 vmhandler));
> 	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
> 	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
> 	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
> @@ -435,16 +515,6 @@ int main(void)
> 	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
> 	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
> 	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
> -	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
> -	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
> -	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					 vmhandler));
> -	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					scratch0));
> -	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					scratch1));
> -	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					in_guest));
> 	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
> 					   fault_dsisr));
> 	DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
> @@ -453,6 +523,7 @@ int main(void)
> 					 last_inst));
> 	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> 					   shadow_srr1));
> +#endif /* CONFIG_KVM_BOOK3S_NONHV */
> #ifdef CONFIG_PPC_BOOK3S_32
> 	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
> #endif
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index cbdf374..80c6456 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -87,14 +87,14 @@ data_access_not_stab:
> END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
> #endif
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
> -				 KVMTEST, 0x300)
> +				 KVMTEST_NONHV, 0x300)
> 
> 	. = 0x380
> 	.globl data_access_slb_pSeries
> data_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x380)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_DAR
> #ifdef __DISABLED__
> @@ -125,7 +125,7 @@ data_access_slb_pSeries:
> instruction_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x480)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
> #ifdef __DISABLED__
> @@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
> hardware_interrupt_pSeries:
> hardware_interrupt_hv:
> 	BEGIN_FTR_SECTION
> -		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> -					    EXC_STD, SOFTEN_TEST)
> -		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> -	FTR_SECTION_ELSE
> 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
> 					    EXC_HV, SOFTEN_TEST_HV)
> 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
> -	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
> +	FTR_SECTION_ELSE
> +		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> +					    EXC_STD, SOFTEN_TEST_NONHV)
> +		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> +	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
> 
> 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x600)
> 
> 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x700)
> 
> 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x800)
> 
> 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
> 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
> 
> 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xa00)
> 
> 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xb00)
> 
> 	. = 0xc00
> 	.globl	system_call_pSeries
> @@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
> 	b	.
> 
> 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xd00)
> 
> 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
> 	 * out of line to handle them
> @@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> #endif /* CONFIG_CBE_RAS */
> 
> 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> #endif /* CONFIG_CBE_RAS */
> 
> 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x1700)
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> #endif /* CONFIG_CBE_RAS */
> 
> 	. = 0x3000
> @@ -297,7 +297,7 @@ data_access_check_stab:
> 	mfspr	r9,SPRN_DSISR
> 	srdi	r10,r10,60
> 	rlwimi	r10,r9,16,0x20
> -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
> 	rlwimi	r10,r9,8,0x300
> #endif
> @@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
> 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
> #endif /* CONFIG_POWER4_ONLY */
> 
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> -	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
> -	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x400)
> +	KVM_HANDLER_NONHV(PACA_EXSLB, EXC_STD, 0x480)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x900)
> 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
> 
> 	.align	7
> @@ -337,11 +337,11 @@ do_stab_bolted_pSeries:
> 
> 	/* moved from 0xf00 */
> 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf00)
> 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf20)
> 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf40)
> 
> /*
>  * An interrupt came in while soft-disabled; clear EE in SRR1,
> @@ -418,7 +418,11 @@ slb_miss_user_pseries:
> /* KVM's trampoline code needs to be close to the interrupt handlers */
> 
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> #include "../kvm/book3s_rmhandlers.S"
> +#else
> +#include "../kvm/book3s_hv_rmhandlers.S"
> +#endif
> #endif
> 
> 	.align	7
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index b7baff7..6ff191b 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -20,7 +20,6 @@ config KVM
> 	bool
> 	select PREEMPT_NOTIFIERS
> 	select ANON_INODES
> -	select KVM_MMIO
> 
> config KVM_BOOK3S_HANDLER
> 	bool
> @@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
> config KVM_BOOK3S_32_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> +	select KVM_MMIO
> 
> config KVM_BOOK3S_64_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> 
> +config KVM_BOOK3S_NONHV
> +	bool
> +	select KVM_MMIO
> +
> config KVM_BOOK3S_32
> 	tristate "KVM support for PowerPC book3s_32 processors"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
> 	select KVM
> 	select KVM_BOOK3S_32_HANDLER
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_32 guest kernels
> 	  in virtual machines on book3s_32 host processors.
> @@ -48,10 +53,38 @@ config KVM_BOOK3S_32
> 	  If unsure, say N.
> 
> config KVM_BOOK3S_64
> -	tristate "KVM support for PowerPC book3s_64 processors"
> +	bool
> +	select KVM_BOOK3S_64_HANDLER
> +
> +config KVM_BOOK3S_64_HV
> +	bool "KVM support for POWER7 using hypervisor mode in host"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_64
> 	select KVM
> -	select KVM_BOOK3S_64_HANDLER
> +	select KVM_BOOK3S_64
> +	---help---
> +	  Support running unmodified book3s_64 guest kernels in
> +	  virtual machines on POWER7 processors that have hypervisor
> +	  mode available to the host.
> +
> +	  If you say Y here, KVM will use the hardware virtualization
> +	  facilities of POWER7 (and later) processors, meaning that
> +	  guest operating systems will run at full hardware speed
> +	  using supervisor and user modes.  However, this also means
> +	  that KVM is not usable under PowerVM (pHyp), is only usable
> +	  on POWER7 (or later) processors, and can only emulate
> +	  POWER5+, POWER6 and POWER7 processors.
> +
> +	  This module provides access to the hardware capabilities through
> +	  a character device node named /dev/kvm.
> +
> +	  If unsure, say N.
> +
> +config KVM_BOOK3S_64_NONHV
> +	tristate "KVM support for PowerPC book3s_64 processors"
> +	depends on EXPERIMENTAL && PPC_BOOK3S_64 && !KVM_BOOK3S_64_HV
> +	select KVM
> +	select KVM_BOOK3S_64
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_64 and book3s_32 guest kernels
> 	  in virtual machines on book3s_64 host processors.
> @@ -65,6 +98,7 @@ config KVM_440
> 	bool "KVM support for PowerPC 440 processors"
> 	depends on EXPERIMENTAL && 44x
> 	select KVM
> +	select KVM_MMIO

e500 should also select MMIO, no?

> 	---help---
> 	  Support running unmodified 440 guest kernels in virtual machines on
> 	  440 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index bf9854f..37c1a60 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -14,7 +14,7 @@ CFLAGS_emulate.o  := -I.
> 
> common-objs-y += powerpc.o emulate.o
> obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
> -obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
> +obj-$(CONFIG_KVM_BOOK3S_NONHV) += book3s_exports.o
> 
> AFLAGS_booke_interrupts.o := -I$(obj)
> 
> @@ -38,7 +38,7 @@ kvm-e500-objs := \
> 	e500_emulate.o
> kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
> 
> -kvm-book3s_64-objs := \
> +kvm-book3s_64_nonhv-objs := \
> 	$(common-objs-y) \
> 	fpu.o \
> 	book3s_paired_singles.o \
> @@ -50,7 +50,17 @@ kvm-book3s_64-objs := \
> 	book3s_64_mmu_host.o \
> 	book3s_64_mmu.o \
> 	book3s_32_mmu.o
> -kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_NONHV) := $(kvm-book3s_64_nonhv-objs)
> +
> +kvm-book3s_64_hv-objs := \
> +	../../../virt/kvm/kvm_main.o \
> +	powerpc.o \
> +	emulate.o \
> +	book3s.o \
> +	book3s_hv.o \
> +	book3s_hv_interrupts.o \
> +	book3s_64_mmu_hv.o
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
> 
> kvm-book3s_32-objs := \
> 	$(common-objs-y) \
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> new file mode 100644
> index 0000000..52d1be1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -0,0 +1,258 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/highmem.h>
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +
> +#include <asm/tlbflush.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu-hash64.h>
> +#include <asm/hvcall.h>
> +#include <asm/synch.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/cputable.h>
> +
> +/* For now use fixed-size 16MB page table */
> +#define HPT_ORDER	24
> +#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
> +#define HPT_HASH_MASK	(HPT_NPTEG - 1)
> +
> +/* Pages in the VRMA are 16MB pages */
> +#define VRMA_PAGE_ORDER	24
> +#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
> +
> +#define NR_LPIDS	1024
> +unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
> +
> +long kvmppc_alloc_hpt(struct kvm *kvm)
> +{
> +	unsigned long hpt;
> +	unsigned long lpid;
> +
> +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> +			       HPT_ORDER - PAGE_SHIFT);

This would end up failing quite often, no?
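
If we keep the fixed 16MB allocation, one option would be to fall back to smaller hash table sizes before giving up - rough sketch only, and HPT_NPTEG/HPT_HASH_MASK would then have to follow the chosen order instead of being compile-time constants:

	long order;

	for (order = HPT_ORDER; order >= 18; --order) {
		/* 2^18 bytes (256kB) is the architectural minimum HPT size */
		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN,
				       order - PAGE_SHIFT);
		if (hpt)
			break;
	}
	if (!hpt)
		return -ENOMEM;
	kvm->arch.sdr1 = __pa(hpt) | (order - 18);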

> +	if (!hpt) {
> +		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
> +		return -ENOMEM;
> +	}
> +	kvm->arch.hpt_virt = hpt;
> +
> +	do {
> +		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
> +		if (lpid >= NR_LPIDS) {
> +			pr_err("kvm_alloc_hpt: No LPIDs free\n");
> +			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
> +			return -ENOMEM;
> +		}
> +	} while (test_and_set_bit(lpid, lpid_inuse));
> +
> +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
> +	kvm->arch.lpid = lpid;
> +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
> +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
> +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);

How do these correlate with the guest's hv mmu? I'd like to keep the code clean enough so we can potentially use it for PR mode as well :).

> +	
> +	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
> +	return 0;
> +}
> +
> +void kvmppc_free_hpt(struct kvm *kvm)
> +{
> +	unsigned long i;
> +	struct kvmppc_pginfo *pginfo;
> +
> +	clear_bit(kvm->arch.lpid, lpid_inuse);
> +	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
> +
> +	if (kvm->arch.ram_pginfo) {
> +		pginfo = kvm->arch.ram_pginfo;
> +		kvm->arch.ram_pginfo = NULL;
> +		for (i = 0; i < kvm->arch.ram_npages; ++i)
> +			put_page(pfn_to_page(pginfo[i].pfn));
> +		kfree(pginfo);
> +	}
> +}
> +
> +static unsigned long user_page_size(unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long size = PAGE_SIZE;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vma = find_vma(current->mm, addr);
> +	if (vma)
> +		size = vma_kernel_pagesize(vma);
> +	up_read(&current->mm->mmap_sem);
> +	return size;
> +}
> +
> +static pfn_t hva_to_pfn(unsigned long addr)
> +{
> +	struct page *page[1];
> +	int npages;
> +
> +	might_sleep();
> +
> +	npages = get_user_pages_fast(addr, 1, 1, page);
> +
> +	if (unlikely(npages != 1))
> +		return 0;
> +
> +	return page_to_pfn(page[0]);
> +}
> +
> +long kvmppc_prepare_vrma(struct kvm *kvm,
> +			 struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long psize, porder;
> +	unsigned long i, npages;
> +	struct kvmppc_pginfo *pginfo;
> +	pfn_t pfn;
> +	unsigned long hva;
> +
> +	/* First see what page size we have */
> +	psize = user_page_size(mem->userspace_addr);
> +	/* For now, only allow 16MB pages */

The reason to go for 16MB pages is because of the host mmu code, not the guest hv mmu. So please at least #ifdef the code to HV so we document that correlation.
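
I.e. something like this, purely to make the dependency visible (the guard is redundant as long as the file is only built for HV, but it documents where the restriction comes from):

#ifdef CONFIG_KVM_BOOK3S_64_HV
	/* the 16MB restriction comes from the host-side VRMA setup,
	 * not from anything the guest-visible MMU requires */
	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1)))
		return -EINVAL;
#endif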

> +	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
> +		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
> +		       psize, mem->memory_size, mem->userspace_addr);
> +		return -EINVAL;
> +	}
> +	porder = __ilog2(psize);
> +
> +	npages = mem->memory_size >> porder;
> +	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
> +	if (!pginfo) {
> +		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
> +		       npages * sizeof(struct kvmppc_pginfo));
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < npages; ++i) {
> +		hva = mem->userspace_addr + (i << porder);
> +		if (user_page_size(hva) != psize)
> +			goto err;
> +		pfn = hva_to_pfn(hva);
> +		if (pfn == 0) {
> +			pr_err("oops, no pfn for hva %lx\n", hva);
> +			goto err;
> +		}
> +		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
> +			pr_err("oops, unaligned pfn %llx\n", pfn);
> +			put_page(pfn_to_page(pfn));
> +			goto err;
> +		}
> +		pginfo[i].pfn = pfn;
> +	}
> +
> +	kvm->arch.ram_npages = npages;
> +	kvm->arch.ram_psize = psize;
> +	kvm->arch.ram_porder = porder;
> +	kvm->arch.ram_pginfo = pginfo;
> +
> +	return 0;
> +
> + err:
> +	kfree(pginfo);
> +	return -EINVAL;
> +}
> +
> +void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long i;
> +	unsigned long npages = kvm->arch.ram_npages;
> +	unsigned long pfn;
> +	unsigned long *hpte;
> +	unsigned long hash;
> +	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
> +
> +	if (!pginfo)
> +		return;
> +
> +	/* VRMA can't be > 1TB */
> +	if (npages > 1ul << (40 - kvm->arch.ram_porder))
> +		npages = 1ul << (40 - kvm->arch.ram_porder);
> +	/* Can't use more than 1 HPTE per HPTEG */
> +	if (npages > HPT_NPTEG)
> +		npages = HPT_NPTEG;
> +
> +	for (i = 0; i < npages; ++i) {
> +		pfn = pginfo[i].pfn;
> +		/* can't use hpt_hash since va > 64 bits */
> +		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
> +		/*
> +		 * We assume that the hash table is empty and no
> +		 * vcpus are using it at this stage.  Since we create
> +		 * at most one HPTE per HPTEG, we just assume entry 7
> +		 * is available and use it.
> +		 */
> +		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
> +		hpte += 7 * 2;
> +		/* HPTE low word - RPN, protection, etc. */
> +		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
> +			HPTE_R_M | PP_RWXX;
> +		wmb();
> +		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
> +			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
> +			HPTE_V_LARGE | HPTE_V_VALID;
> +	}
> +}
> +
> +int kvmppc_mmu_hv_init(void)
> +{
> +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;

Returning 0 for "it doesn't work" might not be the right exit code ;).
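
E.g. (the exact error value is just a suggestion):

	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
		return -ENODEV;		/* anything but "success" */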

> +	memset(lpid_inuse, 0, sizeof(lpid_inuse));
> +	set_bit(mfspr(SPRN_LPID), lpid_inuse);
> +	set_bit(NR_LPIDS - 1, lpid_inuse);
> +
> +	return 0;
> +}
> +
> +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
> +}
> +
> +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> +				struct kvmppc_pte *gpte, bool data)
> +{
> +	return -ENOENT;

Can't you just call the normal book3s_64 mmu code here? Without xlate, doing ppc_ld doesn't work, which means that reading out the faulting guest instruction breaks. We'd also need it for PR mode :).
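
If reusing the book3s_64 MMU code isn't practical here, even a minimal real-mode-only translation would let ppc_ld work for the MSR IR/DR=0 case - very rough sketch, and the KVM_PAM masking is an assumption borrowed from the PR code:

	if (!(vcpu->arch.msr & (data ? MSR_DR : MSR_IR))) {
		gpte->eaddr = eaddr;
		gpte->raddr = eaddr & KVM_PAM;
		gpte->may_read = true;
		gpte->may_write = true;
		gpte->may_execute = true;
		return 0;
	}
	return -ENOENT;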

> +}
> +
> +void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
> +
> +	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
> +
> +	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
> +	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
> +
> +	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> new file mode 100644
> index 0000000..f6b7cd1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -0,0 +1,413 @@
> +/*
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
> + *
> + * Authors:
> + *    Paul Mackerras <paulus@au1.ibm.com>
> + *    Alexander Graf <agraf@suse.de>
> + *    Kevin Wolf <mail@kevin-wolf.de>
> + *
> + * Description: KVM functions specific to running on Book 3S
> + * processors in hypervisor mode (specifically POWER7 and later).
> + *
> + * This file is derived from arch/powerpc/kvm/book3s.c,
> + * by Alexander Graf <agraf@suse.de>.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/preempt.h>
> +#include <linux/sched.h>
> +#include <linux/delay.h>
> +#include <linux/fs.h>
> +#include <linux/anon_inodes.h>
> +
> +#include <asm/reg.h>
> +#include <asm/cputable.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +#include <asm/uaccess.h>
> +#include <asm/io.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu_context.h>
> +#include <asm/lppaca.h>
> +#include <asm/processor.h>
> +#include <linux/gfp.h>
> +#include <linux/sched.h>
> +#include <linux/vmalloc.h>
> +#include <linux/highmem.h>
> +
> +/* #define EXIT_DEBUG */
> +/* #define EXIT_DEBUG_SIMPLE */
> +/* #define EXIT_DEBUG_INT */
> +
> +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	local_paca->kvm_vcpu = vcpu;
> +	vcpu->cpu = cpu;
> +}
> +
> +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->cpu = -1;
> +}
> +
> +void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +	unsigned long dec_nsec;
> +
> +	now = get_tb();
> +	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_queue_dec(vcpu);
> +	if (vcpu->arch.pending_exceptions)
> +		return;
> +	if (vcpu->arch.dec_expires != ~(u64)0) {
> +		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
> +			tb_ticks_per_sec;
> +		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
> +			      HRTIMER_MODE_REL);
> +	}
> +
> +	kvm_vcpu_block(vcpu);
> +	vcpu->stat.halt_wakeup++;
> +
> +	if (vcpu->arch.dec_expires != ~(u64)0)
> +		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
> +}
> +
> +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
> +{
> +	vcpu->arch.msr = msr;
> +}
> +
> +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> +{
> +	vcpu->arch.pvr = pvr;
> +	kvmppc_mmu_book3s_hv_init(vcpu);

No need for you to do it depending on pvr. You can just do the mmu initialization on normal init :).
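
I.e. reduce kvmppc_set_pvr() to the plain assignment and do the MMU setup once at creation time, roughly:

void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}

	/* ... and in kvmppc_core_vcpu_create(): */
	kvmppc_mmu_book3s_hv_init(vcpu);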

> +}
> +
> +void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
> +{
> +	int r;
> +
> +	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
> +	pr_err("pc  = %.16lx  msr = %.16lx  trap = %x\n",
> +	       vcpu->arch.pc, vcpu->arch.msr, vcpu->arch.trap);
> +	for (r = 0; r < 16; ++r)
> +		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
> +		       r, kvmppc_get_gpr(vcpu, r),
> +		       r+16, kvmppc_get_gpr(vcpu, r+16));
> +	pr_err("ctr = %.16lx  lr  = %.16lx\n",
> +	       vcpu->arch.ctr, vcpu->arch.lr);
> +	pr_err("srr0 = %.16lx srr1 = %.16lx\n",
> +	       vcpu->arch.srr0, vcpu->arch.srr1);
> +	pr_err("sprg0 = %.16lx sprg1 = %.16lx\n",
> +	       vcpu->arch.sprg0, vcpu->arch.sprg1);
> +	pr_err("sprg2 = %.16lx sprg3 = %.16lx\n",
> +	       vcpu->arch.sprg2, vcpu->arch.sprg3);
> +	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
> +	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.dsisr);
> +	pr_err("dar = %.16lx\n", vcpu->arch.dear);
> +	pr_err("fault dar = %.16lx dsisr = %.8x\n",
> +	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
> +	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
> +	for (r = 0; r < vcpu->arch.slb_max; ++r)
> +		pr_err("  ESID = %.16llx VSID = %.16llx\n",
> +		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
> +	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
> +	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
> +	       vcpu->arch.last_inst);
> +}
> +
> +static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			      struct task_struct *tsk)
> +{
> +	int r = RESUME_HOST;
> +
> +	vcpu->stat.sum_exits++;
> +
> +	run->exit_reason = KVM_EXIT_UNKNOWN;
> +	run->ready_for_interrupt_injection = 1;
> +	switch (vcpu->arch.trap) {
> +	/* We're good on these - the host merely wanted to get our attention */
> +	case BOOK3S_INTERRUPT_HV_DECREMENTER:
> +		vcpu->stat.dec_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_EXTERNAL:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PERFMON:
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PROGRAM:
> +	{
> +		ulong flags;
> +		/*
> +		 * Normally program interrupts are delivered directly
> +		 * to the guest by the hardware, but we can get here
> +		 * as a result of a hypervisor emulation interrupt
> +		 * (e40) getting turned into a 700 by BML RTAS.

Not sure I fully understand what's going on there. Mind explaining? :)

> +		 */
> +		flags = vcpu->arch.msr & 0x1f0000ull;
> +		kvmppc_core_queue_program(vcpu, flags);
> +		r = RESUME_GUEST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_SYSCALL:
> +	{
> +		/* hcall - punt to userspace */
> +		int i;
> +

Do we really want to accept sc from pr=1? I'd just reject them straight away.
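
E.g. something like this at the top of the case (sketch only - whether kvmppc_inject_interrupt is the right way to reflect it back is an open question):

		/* sc from guest userspace (PR=1) is not an hcall -
		 * bounce it back to the guest instead of exiting */
		if (vcpu->arch.msr & MSR_PR) {
			kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
			r = RESUME_GUEST;
			break;
		}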

> +		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
> +		for (i = 0; i < 9; ++i)
> +			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
> +		run->exit_reason = KVM_EXIT_PAPR_HCALL;
> +		vcpu->arch.hcall_needed = 1;
> +		r = RESUME_HOST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> +		vcpu->arch.dear = vcpu->arch.fault_dar;
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> +					0x08000000);

What do these do? I thought the guest gets DSI and ISI directly?

> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
> +		kvmppc_core_queue_program(vcpu, 0x80000);
> +		r = RESUME_GUEST;
> +		break;
> +	default:
> +		kvmppc_dump_regs(vcpu);
> +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%lx\n",
> +			vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.msr);
> +		r = RESUME_HOST;
> +		BUG();
> +		break;
> +	}
> +
> +
> +	if (!(r & RESUME_HOST)) {
> +		/* To avoid clobbering exit_reason, only check for signals if
> +		 * we aren't already exiting to userspace for some other
> +		 * reason. */
> +		if (signal_pending(tsk)) {
> +			vcpu->stat.signal_exits++;
> +			run->exit_reason = KVM_EXIT_INTR;
> +			r = -EINTR;
> +		} else {
> +			kvmppc_core_deliver_interrupts(vcpu);
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i;
> +
> +	sregs->pvr = vcpu->arch.pvr;
> +
> +	memset(sregs, 0, sizeof(struct kvm_sregs));
> +	for (i = 0; i < vcpu->arch.slb_max; i++) {
> +		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
> +		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
> +	}
> +
> +	return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i, j;
> +
> +	kvmppc_set_pvr(vcpu, sregs->pvr);
> +
> +	j = 0;
> +	for (i = 0; i < vcpu->arch.slb_nr; i++) {
> +		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
> +			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
> +			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
> +			++j;
> +		}
> +	}
> +	vcpu->arch.slb_max = j;
> +
> +	return 0;
> +}
> +
> +int kvmppc_core_check_processor_compat(void)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;
> +	return -EIO;
> +}
> +
> +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> +{
> +	struct kvm_vcpu *vcpu;
> +	int err = -ENOMEM;
> +	unsigned long lpcr;
> +
> +	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> +	if (!vcpu)
> +		goto out;
> +
> +	err = kvm_vcpu_init(vcpu, kvm, id);
> +	if (err)
> +		goto free_vcpu;
> +
> +	vcpu->arch.last_cpu = -1;
> +	vcpu->arch.host_msr = mfmsr();
> +	vcpu->arch.mmcr[0] = MMCR0_FC;
> +	vcpu->arch.ctrl = CTRL_RUNLATCH;
> +	/* default to book3s_64 (power7) */
> +	vcpu->arch.pvr = 0x3f0200;

Wouldn't it make sense to simply default to the host pvr? Not sure - maybe it wouldn't :).
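
I.e. (sketch):

	/* default to the host's PVR */
	vcpu->arch.pvr = mfspr(SPRN_PVR);
	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);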

> +	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
> +
> +	/* remember where some real-mode handlers are */
> +	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
> +	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
> +	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
> +	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
> +
> +	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
> +	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
> +	vcpu->arch.lpcr = lpcr;
> +
> +	return vcpu;
> +
> +free_vcpu:
> +	kfree(vcpu);
> +out:
> +	return ERR_PTR(err);
> +}
> +
> +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> +{
> +	kvm_vcpu_uninit(vcpu);
> +	kfree(vcpu);
> +}
> +
> +extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> +
> +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +
> +	if (signal_pending(current)) {
> +		run->exit_reason = KVM_EXIT_INTR;
> +		return -EINTR;
> +	}
> +
> +	flush_fp_to_thread(current);

Do those work fine with preemption enabled?

> +	flush_altivec_to_thread(current);
> +	flush_vsx_to_thread(current);
> +	preempt_disable();
> +
> +	kvm_guest_enter();
> +
> +	__kvmppc_vcore_entry(NULL, vcpu);
> +
> +	kvm_guest_exit();
> +
> +	preempt_enable();
> +	kvm_resched(vcpu);
> +
> +	now = get_tb();
> +	/* cancel pending dec exception if dec is positive */
> +	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_dequeue_dec(vcpu);
> +
> +	return kvmppc_handle_exit(run, vcpu, current);
> +}
> +
> +int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		return kvmppc_prepare_vrma(kvm, mem);
> +	return 0;
> +}
> +
> +void kvmppc_core_commit_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		kvmppc_map_vrma(kvm, mem);
> +}
> +
> +int kvmppc_core_init_vm(struct kvm *kvm)
> +{
> +	long r;
> +
> +	/* Allocate hashed page table */
> +	r = kvmppc_alloc_hpt(kvm);
> +
> +	return r;
> +}
> +
> +void kvmppc_core_destroy_vm(struct kvm *kvm)
> +{
> +	kvmppc_free_hpt(kvm);
> +}
> +
> +/* These are stubs for now */
> +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
> +{
> +}
> +
> +/* We don't need to emulate any privileged instructions or dcbz */
> +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +                           unsigned int inst, int *advance)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +static int kvmppc_book3s_hv_init(void)
> +{
> +	int r;
> +
> +	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
> +
> +	if (r)
> +		return r;
> +
> +	r = kvmppc_mmu_hv_init();
> +
> +	return r;
> +}
> +
> +static void kvmppc_book3s_hv_exit(void)
> +{
> +	kvm_exit();
> +}
> +
> +module_init(kvmppc_book3s_hv_init);
> +module_exit(kvmppc_book3s_hv_exit);
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
> new file mode 100644
> index 0000000..19d152d
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
> @@ -0,0 +1,326 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_interrupts.S, which is:
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +#include <asm/ppc-opcode.h>
> +
> +#define DISABLE_INTERRUPTS	\
> +	mfmsr   r0;		\
> +	rldicl  r0,r0,48,1;	\
> +	rotldi  r0,r0,16;	\
> +	mtmsrd  r0,1;		\
> +
> +/*****************************************************************************
> + *                                                                           *
> + *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
> + *                                                                           *
> + ****************************************************************************/
> +
> +/* Registers:
> + *  r4: vcpu pointer
> + */
> +_GLOBAL(__kvmppc_vcore_entry)
> +
> +	/* Write correct stack frame */
> +	mflr	r0
> +	std	r0,PPC_LR_STKOFF(r1)
> +
> +	/* Save host state to the stack */
> +	stdu	r1, -SWITCH_FRAME_SIZE(r1)
> +
> +	/* Save non-volatile registers (r14 - r31) */
> +	SAVE_NVGPRS(r1)
> +
> +	/* Save host PMU registers and load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) but not r3 or r5 */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
> +	isync
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r5, LPPACA_PMCINUSE(r3)
> +	cmpwi	r5, 0
> +	beq	31f			/* skip if not */
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r7, PACA_HOST_MMCR(r13)
> +	std	r5, PACA_HOST_MMCR + 8(r13)
> +	std	r6, PACA_HOST_MMCR + 16(r13)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r5, SPRN_PMC2
> +	mfspr	r6, SPRN_PMC3
> +	mfspr	r7, SPRN_PMC4
> +	mfspr	r8, SPRN_PMC5
> +	mfspr	r9, SPRN_PMC6
> +	stw	r3, PACA_HOST_PMC(r13)
> +	stw	r5, PACA_HOST_PMC + 4(r13)
> +	stw	r6, PACA_HOST_PMC + 8(r13)
> +	stw	r7, PACA_HOST_PMC + 12(r13)
> +	stw	r8, PACA_HOST_PMC + 16(r13)
> +	stw	r9, PACA_HOST_PMC + 20(r13)
> +31:
> +
> +	/* Save host DSCR */
> +	mfspr	r3, SPRN_DSCR
> +	std	r3, PACA_HOST_DSCR(r13)
> +
> +	/* Save host DABR */
> +	mfspr	r3, SPRN_DABR
> +	std	r3, PACA_DABR(r13)
> +
> +	DISABLE_INTERRUPTS
> +
> +	/*
> +	 * Put whatever is in the decrementer into the
> +	 * hypervisor decrementer.
> +	 */
> +	mfspr	r8,SPRN_DEC
> +	mftb	r7
> +	mtspr	SPRN_HDEC,r8

Can't we just always use HDEC on the host? That'd save us from all the swapping here.

> +	extsw	r8,r8
> +	add	r8,r8,r7
> +	std	r8,PACA_KVM_DECEXP(r13)

Why is dec+hdec on vcpu_run decexp? What exactly does this store?

> +
> +	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +
> +	/* Jump to segment patching handler and into our guest */
> +	b	kvmppc_rmcall
> +
> +/*
> + * This is the handler in module memory. It gets jumped at from the
> + * lowmem trampoline code, so it's basically the guest exit code.
> + *
> + */
> +
> +.global kvmppc_handler_highmem
> +kvmppc_handler_highmem:
> +
> +	/*
> +	 * Register usage at this point:
> +	 *
> +	 * R1       = host R1
> +	 * R2       = host R2
> +	 * R12      = exit handler id
> +	 * R13      = PACA
> +	 * SVCPU.*  = guest *
> +	 *
> +	 */
> +
> +	/* R7 = vcpu */
> +	ld	r7, PACA_KVM_VCPU(r13)
> +
> +	/*
> +	 * Reload DEC.  HDEC interrupts were disabled when
> +	 * we reloaded the host's LPCR value.
> +	 */
> +	ld	r3, PACA_KVM_DECEXP(r13)
> +	mftb	r4
> +	subf	r4, r4, r3
> +	mtspr	SPRN_DEC, r4
> +
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r4, LPPACA_PMCINUSE(r3)
> +	cmpwi	r4, 0
> +	beq	23f			/* skip if not */
> +	lwz	r3, PACA_HOST_PMC(r13)
> +	lwz	r4, PACA_HOST_PMC + 4(r13)
> +	lwz	r5, PACA_HOST_PMC + 4(r13)

copy&paste error?

> +	lwz	r6, PACA_HOST_PMC + 4(r13)
> +	lwz	r8, PACA_HOST_PMC + 4(r13)
> +	lwz	r9, PACA_HOST_PMC + 4(r13)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r4
> +	mtspr	SPRN_PMC3, r5
> +	mtspr	SPRN_PMC4, r6
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, PACA_HOST_MMCR(r13)
> +	ld	r4, PACA_HOST_MMCR + 8(r13)
> +	ld	r5, PACA_HOST_MMCR + 16(r13)
> +	mtspr	SPRN_MMCR1, r4
> +	mtspr	SPRN_MMCRA, r5
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +23:
> +
> +	/* Restore host msr -> SRR1 */
> +	ld	r4, VCPU_HOST_MSR(r7)
> +
> +	/*
> +	 * For some interrupts, we need to call the real Linux
> +	 * handler, so it can do work for us. This has to happen
> +	 * as if the interrupt arrived from the kernel though,
> +	 * so let's fake it here where most state is restored.
> +	 *
> +	 * Call Linux for hardware interrupts/decrementer
> +	 * r3 = address of interrupt handler (exit reason)
> +	 */
> +	/* Note: preemption is disabled at this point */
> +
> +	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
> +	beq	1f
> +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
> +	beq	1f
> +
> +	/* Back to EE=1 */
> +	mtmsr	r4
> +	sync
> +	b	kvm_return_point
> +
> +1:	bl	call_linux_handler
> +
> +.global kvm_return_point
> +kvm_return_point:
> +	/* Restore non-volatile host registers (r14 - r31) */
> +	REST_NVGPRS(r1)
> +
> +	addi    r1, r1, SWITCH_FRAME_SIZE
> +	ld	r0, PPC_LR_STKOFF(r1)
> +	mtlr	r0
> +	blr
> +
> +call_linux_handler:
> +	/* Restore host IP -> SRR0 */
> +	mflr	r3
> +	mtlr	r12
> +
> +	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r7)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +	b	kvmppc_rmcall
> +
> +/*
> + * Save away FP, VMX and VSX registers.
> + * r3 = vcpu pointer
> +*/
> +_GLOBAL(kvmppc_save_fp)
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VSRS
> +	stxvd2x	reg,r6,r3
> +	reg = reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg = 0
> +	.rept	32
> +	stfd	reg,reg*8+VCPU_FPRS(r3)
> +	reg = reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +	mffs	fr0
> +	stfd	fr0,VCPU_FPSCR(r3)
> +
> +#ifdef CONFIG_ALTIVEC
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VRS
> +	stvx	reg,r6,r3
> +	reg = reg + 1
> +	.endr
> +	mfvscr	vr0
> +	li	r6,VCPU_VSCR
> +	stvx	vr0,r6,r3
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	mfspr	r6,SPRN_VRSAVE
> +	stw	r6,VCPU_VRSAVE(r3)
> +	mtmsrd	r9
> +	isync
> +	blr
> +
> +/*
> + * Load up FP, VMX and VSX registers
> + * r4 = vcpu pointer
> + */
> +	.globl	kvmppc_load_fp
> +kvmppc_load_fp:
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +	lfd	fr0,VCPU_FPSCR(r4)
> +	MTFSF_L(fr0)
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VSRS
> +	lxvd2x	reg,r7,r4
> +	reg = reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg = 0
> +	.rept	32
> +	lfd	reg,reg*8+VCPU_FPRS(r4)
> +	reg = reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +
> +#ifdef CONFIG_ALTIVEC
> +	li	r7,VCPU_VSCR
> +	lvx	vr0,r7,r4
> +	mtvscr	vr0
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VRS
> +	lvx	reg,r7,r4
> +	reg = reg + 1
> +	.endr
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	lwz	r7,VCPU_VRSAVE(r4)
> +	mtspr	SPRN_VRSAVE,r7
> +	blr
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> new file mode 100644
> index 0000000..813b01c
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -0,0 +1,663 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_rmhandlers.S and other files, which are:
> + *
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +
> +/*****************************************************************************
> + *                                                                           *
> + *        Real Mode handlers that need to be in the linear mapping           *
> + *                                                                           *
> + ****************************************************************************/
> +
> +#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
> +
> +	.globl	kvmppc_skip_interrupt
> +kvmppc_skip_interrupt:
> +	mfspr	r13,SPRN_SRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_SRR0,r13
> +	GET_SCRATCH0(r13)
> +	rfid
> +	b	.
> +
> +	.globl	kvmppc_skip_Hinterrupt
> +kvmppc_skip_Hinterrupt:
> +	mfspr	r13,SPRN_HSRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_HSRR0,r13
> +	GET_SCRATCH0(r13)
> +	hrfid
> +	b	.
> +
> +/*
> + * This trampoline brings us back to a real mode handler
> + *
> + * Input Registers:
> + *
> + * R5 = SRR0
> + * R6 = SRR1
> + * R12 = real-mode IP
> + * LR = real-mode IP
> + *
> + */
> +.global kvmppc_handler_lowmem_trampoline
> +kvmppc_handler_lowmem_trampoline:
> +	cmpwi	r12,0x500
> +	beq	1f
> +	cmpwi	r12,0x980
> +	beq	1f

What?

1) use the macros please
2) why?

> +	mtsrr0	r3
> +	mtsrr1	r4
> +	blr
> +1:	mtspr	SPRN_HSRR0,r3
> +	mtspr	SPRN_HSRR1,r4
> +	blr
> +
> +/*
> + * Call a function in real mode.
> + * Must be called with interrupts hard-disabled.
> + *
> + * Input Registers:
> + *
> + * R5 = function
> + * R6 = MSR
> + * R7 = scratch register
> + *
> + */
> +_GLOBAL(kvmppc_rmcall)
> +	mfmsr	r7
> +	li	r0,MSR_RI		/* clear RI in MSR */
> +	andc	r7,r7,r0
> +	mtmsrd	r7,1
> +	mtsrr0	r5
> +	mtsrr1	r6
> +	RFI
> +
> +.global kvmppc_trampoline_lowmem
> +kvmppc_trampoline_lowmem:
> +	PPC_LONG kvmppc_handler_lowmem_trampoline - _stext
> +
> +.global kvmppc_trampoline_enter
> +kvmppc_trampoline_enter:
> +	PPC_LONG kvmppc_handler_trampoline_enter - _stext
> +
> +#define ULONG_SIZE 		8
> +#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
> +
> +/******************************************************************************
> + *                                                                            *
> + *                               Entry code                                   *
> + *                                                                            *
> + *****************************************************************************/
> +
> +.global kvmppc_handler_trampoline_enter
> +kvmppc_handler_trampoline_enter:
> +
> +	/* Required state:
> +	 *
> +	 * R4 = vcpu pointer
> +	 * MSR = ~IR|DR
> +	 * R13 = PACA
> +	 * R1 = host R1
> +	 * all other volatile GPRS = free
> +	 */
> +	ld	r14, VCPU_GPR(r14)(r4)
> +	ld	r15, VCPU_GPR(r15)(r4)
> +	ld	r16, VCPU_GPR(r16)(r4)
> +	ld	r17, VCPU_GPR(r17)(r4)
> +	ld	r18, VCPU_GPR(r18)(r4)
> +	ld	r19, VCPU_GPR(r19)(r4)
> +	ld	r20, VCPU_GPR(r20)(r4)
> +	ld	r21, VCPU_GPR(r21)(r4)
> +	ld	r22, VCPU_GPR(r22)(r4)
> +	ld	r23, VCPU_GPR(r23)(r4)
> +	ld	r24, VCPU_GPR(r24)(r4)
> +	ld	r25, VCPU_GPR(r25)(r4)
> +	ld	r26, VCPU_GPR(r26)(r4)
> +	ld	r27, VCPU_GPR(r27)(r4)
> +	ld	r28, VCPU_GPR(r28)(r4)
> +	ld	r29, VCPU_GPR(r29)(r4)
> +	ld	r30, VCPU_GPR(r30)(r4)
> +	ld	r31, VCPU_GPR(r31)(r4)
> +
> +	/* Load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
> +	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
> +	lwz	r6, VCPU_PMC + 8(r4)
> +	lwz	r7, VCPU_PMC + 12(r4)
> +	lwz	r8, VCPU_PMC + 16(r4)
> +	lwz	r9, VCPU_PMC + 20(r4)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r5
> +	mtspr	SPRN_PMC3, r6
> +	mtspr	SPRN_PMC4, r7
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, VCPU_MMCR(r4)
> +	ld	r5, VCPU_MMCR + 8(r4)
> +	ld	r6, VCPU_MMCR + 16(r4)
> +	mtspr	SPRN_MMCR1, r5
> +	mtspr	SPRN_MMCRA, r6
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +
> +	/* Load up FP, VMX and VSX registers */
> +	bl	kvmppc_load_fp
> +
> +	/* Switch DSCR to guest value */
> +	ld	r5, VCPU_DSCR(r4)
> +	mtspr	SPRN_DSCR, r5
> +
> +	/*
> +	 * Set the decrementer to the guest decrementer.
> +	 */
> +	ld	r8,VCPU_DEC_EXPIRES(r4)
> +	mftb	r7
> +	subf	r3,r7,r8
> +	mtspr	SPRN_DEC,r3
> +	stw	r3,VCPU_DEC(r4)
> +
> +	ld	r5, VCPU_SPRG0(r4)
> +	ld	r6, VCPU_SPRG1(r4)
> +	ld	r7, VCPU_SPRG2(r4)
> +	ld	r8, VCPU_SPRG3(r4)
> +	mtspr	SPRN_SPRG0, r5
> +	mtspr	SPRN_SPRG1, r6
> +	mtspr	SPRN_SPRG2, r7
> +	mtspr	SPRN_SPRG3, r8
> +
> +	/* Save R1 in the PACA */
> +	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
> +
> +	/* Load up DAR and DSISR */
> +	ld	r5, VCPU_DAR(r4)
> +	lwz	r6, VCPU_DSISR(r4)
> +	mtspr	SPRN_DAR, r5
> +	mtspr	SPRN_DSISR, r6
> +
> +	/* Set partition DABR */
> +	li	r5,3
> +	ld	r6,VCPU_DABR(r4)
> +	mtspr	SPRN_DABRX,r5
> +	mtspr	SPRN_DABR,r6
> +
> +	/* Restore AMR and UAMOR, set AMOR to all 1s */
> +	ld	r5,VCPU_AMR(r4)
> +	ld	r6,VCPU_UAMOR(r4)
> +	li	r7,-1
> +	mtspr	SPRN_AMR,r5
> +	mtspr	SPRN_UAMOR,r6
> +	mtspr	SPRN_AMOR,r7
> +
> +	/* Clear out SLB */
> +	li	r6,0
> +	slbmte	r6,r6
> +	slbia
> +	ptesync
> +
> +	/* Switch to guest partition. */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	ld	r6,KVM_SDR1(r9)
> +	lwz	r7,KVM_LPID(r9)
> +	li	r0,0x3ff		/* switch to reserved LPID */
> +	mtspr	SPRN_LPID,r0
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	ld	r8,VCPU_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* Check if HDEC expires soon */
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,10
> +	li	r12,0x980

define

> +	mr	r9,r4
> +	blt	hdec_soon

Is it faster to do the check here than to skip it and take the odds? Also, maybe we should rather do the check in preemptible code, which could then just pass the time slice on directly.

> +
> +	/*
> +	 * Invalidate the TLB if we could possibly have stale TLB
> +	 * entries for this partition on this core due to the use
> +	 * of tlbiel.
> +	 */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	lwz	r5,VCPU_VCPUID(r4)
> +	lhz	r6,PACAPACAINDEX(r13)
> +	lhz	r8,VCPU_LAST_CPU(r4)
> +	sldi	r7,r6,1			/* see if this is the same vcpu */
> +	add	r7,r7,r9		/* as last ran on this pcpu */
> +	lhz	r0,KVM_LAST_VCPU(r7)
> +	cmpw	r6,r8			/* on the same cpu core as last time? */
> +	bne	3f
> +	cmpw	r0,r5			/* same vcpu as this core last ran? */
> +	beq	1f
> +3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
> +	sth	r5,KVM_LAST_VCPU(r7)
> +	li	r6,128
> +	mtctr	r6
> +	li	r7,0x800		/* IS field = 0b10 */
> +	ptesync
> +2:	tlbiel	r7
> +	addi	r7,r7,0x1000
> +	bdnz	2b
> +	ptesync
> +1:
> +
> +	/* Save purr/spurr */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	std	r5,PACA_HOST_PURR(r13)
> +	std	r6,PACA_HOST_SPURR(r13)
> +	ld	r7,VCPU_PURR(r4)
> +	ld	r8,VCPU_SPURR(r4)
> +	mtspr	SPRN_PURR,r7
> +	mtspr	SPRN_SPURR,r8
> +
> +	/* Load up guest SLB entries */
> +	lwz	r5,VCPU_SLB_MAX(r4)
> +	cmpwi	r5,0
> +	beq	9f
> +	mtctr	r5
> +	addi	r6,r4,VCPU_SLB
> +1:	ld	r8,VCPU_SLB_E(r6)
> +	ld	r9,VCPU_SLB_V(r6)
> +	slbmte	r9,r8
> +	addi	r6,r6,VCPU_SLB_SIZE
> +	bdnz	1b
> +9:
> +
> +	/* Restore state of CTRL run bit; assume 1 on entry */
> +	lwz	r5,VCPU_CTRL(r4)
> +	andi.	r5,r5,1
> +	bne	4f
> +	mfspr	r6,SPRN_CTRLF
> +	clrrdi	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	ld	r6, VCPU_CTR(r4)
> +	lwz	r7, VCPU_XER(r4)
> +
> +	mtctr	r6
> +	mtxer	r7
> +
> +	/* Move SRR0 and SRR1 into the respective regs */
> +	ld	r6, VCPU_SRR0(r4)
> +	ld	r7, VCPU_SRR1(r4)
> +	mtspr	SPRN_SRR0, r6
> +	mtspr	SPRN_SRR1, r7
> +
> +	ld	r10, VCPU_PC(r4)
> +
> +	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
> +	rldicl	r11, r11, 63 - MSR_HV_LG, 1
> +	rotldi	r11, r11, 1 + MSR_HV_LG
> +	ori	r11, r11, MSR_ME
> +
> +fast_guest_return:
> +	mtspr	SPRN_HSRR0,r10
> +	mtspr	SPRN_HSRR1,r11
> +
> +	/* Activate guest mode, so faults get handled by KVM */
> +	li	r9, KVM_GUEST_MODE_GUEST
> +	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	/* Enter guest */
> +
> +	ld	r5, VCPU_LR(r4)
> +	lwz	r6, VCPU_CR(r4)
> +	mtlr	r5
> +	mtcr	r6
> +
> +	ld	r0, VCPU_GPR(r0)(r4)
> +	ld	r1, VCPU_GPR(r1)(r4)
> +	ld	r2, VCPU_GPR(r2)(r4)
> +	ld	r3, VCPU_GPR(r3)(r4)
> +	ld	r5, VCPU_GPR(r5)(r4)
> +	ld	r6, VCPU_GPR(r6)(r4)
> +	ld	r7, VCPU_GPR(r7)(r4)
> +	ld	r8, VCPU_GPR(r8)(r4)
> +	ld	r9, VCPU_GPR(r9)(r4)
> +	ld	r10, VCPU_GPR(r10)(r4)
> +	ld	r11, VCPU_GPR(r11)(r4)
> +	ld	r12, VCPU_GPR(r12)(r4)
> +	ld	r13, VCPU_GPR(r13)(r4)
> +
> +	ld	r4, VCPU_GPR(r4)(r4)
> +
> +	hrfid
> +	b	.
> +kvmppc_handler_trampoline_enter_end:
> +
> +
> +/******************************************************************************
> + *                                                                            *
> + *                               Exit code                                    *
> + *                                                                            *
> + *****************************************************************************/
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +	.globl	kvmppc_interrupt
> +kvmppc_interrupt:
> +	/*
> +	 * Register contents:
> +	 * R12		= interrupt vector
> +	 * R13		= PACA
> +	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
> +	 * guest R13 saved in SPRN_SCRATCH0
> +	 */
> +	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
> +	std	r9, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	ld	r9, PACA_KVM_VCPU(r13)
> +
> +	/* Save registers */
> +
> +	std	r0, VCPU_GPR(r0)(r9)
> +	std	r1, VCPU_GPR(r1)(r9)
> +	std	r2, VCPU_GPR(r2)(r9)
> +	std	r3, VCPU_GPR(r3)(r9)
> +	std	r4, VCPU_GPR(r4)(r9)
> +	std	r5, VCPU_GPR(r5)(r9)
> +	std	r6, VCPU_GPR(r6)(r9)
> +	std	r7, VCPU_GPR(r7)(r9)
> +	std	r8, VCPU_GPR(r8)(r9)
> +	ld	r0, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	std	r0, VCPU_GPR(r9)(r9)
> +	std	r10, VCPU_GPR(r10)(r9)
> +	std	r11, VCPU_GPR(r11)(r9)
> +	ld	r3, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
> +	lwz	r4, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
> +	std	r3, VCPU_GPR(r12)(r9)
> +	stw	r4, VCPU_CR(r9)
> +
> +	/* Restore R1/R2 so we can handle faults */
> +	ld	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
> +	ld	r2, PACATOC(r13)
> +
> +	mfspr	r10, SPRN_SRR0
> +	mfspr	r11, SPRN_SRR1
> +	std	r10, VCPU_SRR0(r9)
> +	std	r11, VCPU_SRR1(r9)
> +	andi.	r0, r12, 2		/* need to read HSRR0/1? */
> +	beq	1f
> +	mfspr	r10, SPRN_HSRR0
> +	mfspr	r11, SPRN_HSRR1
> +	clrrdi	r12, r12, 2
> +1:	std	r10, VCPU_PC(r9)
> +	std	r11, VCPU_MSR(r9)
> +
> +	GET_SCRATCH0(r3)
> +	mflr	r4
> +	std	r3, VCPU_GPR(r13)(r9)
> +	std	r4, VCPU_LR(r9)
> +
> +	/* Unset guest mode */
> +	li	r0, KVM_GUEST_MODE_NONE
> +	stb	r0, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	stw	r12,VCPU_TRAP(r9)
> +
> +	/* See if this is a leftover HDEC interrupt */
> +	cmpwi	r12,0x980

define

> +	bne	2f
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,0
> +	bge	ignore_hdec
> +2:
> +
> +	/* Check for mediated interrupts (could be done earlier really ...) */
> +	cmpwi	r12,0x500

define

> +	bne+	1f
> +	ld	r5,VCPU_LPCR(r9)
> +	andi.	r0,r11,MSR_EE
> +	beq	1f
> +	andi.	r0,r5,LPCR_MER
> +	bne	bounce_ext_interrupt

So there's no need for the external check that directly goes into the Linux handler code on full-fledged exits?

> +1:
> +
> +	/* Save DEC */
> +	mfspr	r5,SPRN_DEC
> +	mftb	r6
> +	extsw	r5,r5
> +	add	r5,r5,r6
> +	std	r5,VCPU_DEC_EXPIRES(r9)
> +
> +	/* Save HEIR (in last_inst) if this is a HEI (e40) */
> +	li	r3,-1
> +	cmpwi	r12,0xe40
> +	bne	11f
> +	mfspr	r3,SPRN_HEIR
> +11:	stw	r3,VCPU_LAST_INST(r9)
> +	
> +	/* Save more register state  */
> +	mfxer	r5
> +	mfdar	r6
> +	mfdsisr	r7
> +	mfctr	r8
> +
> +	stw	r5, VCPU_XER(r9)
> +	std	r6, VCPU_DAR(r9)
> +	stw	r7, VCPU_DSISR(r9)
> +	std	r8, VCPU_CTR(r9)
> +	cmpwi	r12,0xe00		/* grab HDAR & HDSISR if HDSI */
> +	beq	6f
> +7:	std	r6, VCPU_FAULT_DAR(r9)
> +	stw	r7, VCPU_FAULT_DSISR(r9)
> +
> +	/* Save guest CTRL register, set runlatch to 1 */
> +	mfspr	r6,SPRN_CTRLF
> +	stw	r6,VCPU_CTRL(r9)
> +	andi.	r0,r6,1
> +	bne	4f
> +	ori	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	/* Read the guest SLB and save it away */
> +	li	r6,0
> +	addi	r7,r9,VCPU_SLB
> +	li	r5,0
> +1:	slbmfee	r8,r6
> +	andis.	r0,r8,SLB_ESID_V@h
> +	beq	2f
> +	add	r8,r8,r6		/* put index in */
> +	slbmfev	r3,r6
> +	std	r8,VCPU_SLB_E(r7)
> +	std	r3,VCPU_SLB_V(r7)
> +	addi	r7,r7,VCPU_SLB_SIZE
> +	addi	r5,r5,1
> +2:	addi	r6,r6,1
> +	cmpwi	r6,32

I don't like how the 32 is hardcoded here. Better create a define for it and use the same in the init code.
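
Something like this (name is just a suggestion), which both vcpu->arch.slb_nr in kvmppc_mmu_book3s_hv_init() and the cmpwi above could then use:

/* e.g. in asm/kvm_book3s.h */
#define KVMPPC_MAX_SLB_ENTRIES	32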

> +	blt	1b
> +	stw	r5,VCPU_SLB_MAX(r9)
> +
> +	/*
> +	 * Save the guest PURR/SPURR
> +	 */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	ld	r7,VCPU_PURR(r9)
> +	ld	r8,VCPU_SPURR(r9)
> +	std	r5,VCPU_PURR(r9)
> +	std	r6,VCPU_SPURR(r9)
> +	subf	r5,r7,r5
> +	subf	r6,r8,r6
> +
> +	/*
> +	 * Restore host PURR/SPURR and add guest times
> +	 * so that the time in the guest gets accounted.
> +	 */
> +	ld	r3,PACA_HOST_PURR(r13)
> +	ld	r4,PACA_HOST_SPURR(r13)
> +	add	r3,r3,r5
> +	add	r4,r4,r6
> +	mtspr	SPRN_PURR,r3
> +	mtspr	SPRN_SPURR,r4
> +
> +	/* Clear out SLB */
> +	li	r5,0
> +	slbmte	r5,r5
> +	slbia
> +	ptesync
> +
> +hdec_soon:
> +	/* Switch back to host partition */
> +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> +	ld	r6,KVM_HOST_SDR1(r4)
> +	lwz	r7,KVM_HOST_LPID(r4)
> +	li	r8,0x3ff		/* switch to reserved LPID */

Is it reserved by the ISA? Either way, hard-coding the constant without a name is not nice :).
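
E.g. (name is only a suggestion):

#define LPID_RSVD	0x3ff	/* reserved LPID used while switching SDR1 */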

> +	mtspr	SPRN_LPID,r8
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	lis	r8,0x7fff

INT_MAX@h might be more readable.

> +	mtspr	SPRN_HDEC,r8
> +
> +	ld	r8,KVM_HOST_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* load host SLB entries */
> +	ld	r8,PACA_SLBSHADOWPTR(r13)
> +
> +	.rept	SLB_NUM_BOLTED
> +	ld	r5,SLBSHADOW_SAVEAREA(r8)
> +	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
> +	andis.	r7,r5,SLB_ESID_V@h
> +	beq	1f
> +	slbmte	r6,r5
> +1:	addi	r8,r8,16
> +	.endr
> +
> +	/* Save and reset AMR and UAMOR before turning on the MMU */
> +	mfspr	r5,SPRN_AMR
> +	mfspr	r6,SPRN_UAMOR
> +	std	r5,VCPU_AMR(r9)
> +	std	r6,VCPU_UAMOR(r9)
> +	li	r6,0
> +	mtspr	SPRN_AMR,r6
> +
> +	/* Restore host DABR and DABRX */
> +	ld	r5,PACA_DABR(r13)
> +	li	r6,7
> +	mtspr	SPRN_DABR,r5
> +	mtspr	SPRN_DABRX,r6
> +
> +	/* Switch DSCR back to host value */
> +	mfspr	r8, SPRN_DSCR
> +	ld	r7, PACA_HOST_DSCR(r13)
> +	std	r8, VCPU_DSCR(r7)
> +	mtspr	SPRN_DSCR, r7
> +
> +	/* Save non-volatile GPRs */
> +	std	r14, VCPU_GPR(r14)(r9)
> +	std	r15, VCPU_GPR(r15)(r9)
> +	std	r16, VCPU_GPR(r16)(r9)
> +	std	r17, VCPU_GPR(r17)(r9)
> +	std	r18, VCPU_GPR(r18)(r9)
> +	std	r19, VCPU_GPR(r19)(r9)
> +	std	r20, VCPU_GPR(r20)(r9)
> +	std	r21, VCPU_GPR(r21)(r9)
> +	std	r22, VCPU_GPR(r22)(r9)
> +	std	r23, VCPU_GPR(r23)(r9)
> +	std	r24, VCPU_GPR(r24)(r9)
> +	std	r25, VCPU_GPR(r25)(r9)
> +	std	r26, VCPU_GPR(r26)(r9)
> +	std	r27, VCPU_GPR(r27)(r9)
> +	std	r28, VCPU_GPR(r28)(r9)
> +	std	r29, VCPU_GPR(r29)(r9)
> +	std	r30, VCPU_GPR(r30)(r9)
> +	std	r31, VCPU_GPR(r31)(r9)
> +
> +	/* Save SPRGs */
> +	mfspr	r3, SPRN_SPRG0
> +	mfspr	r4, SPRN_SPRG1
> +	mfspr	r5, SPRN_SPRG2
> +	mfspr	r6, SPRN_SPRG3
> +	std	r3, VCPU_SPRG0(r9)
> +	std	r4, VCPU_SPRG1(r9)
> +	std	r5, VCPU_SPRG2(r9)
> +	std	r6, VCPU_SPRG3(r9)
> +
> +	/* Save PMU registers */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r4, VCPU_MMCR(r9)
> +	std	r5, VCPU_MMCR + 8(r9)
> +	std	r6, VCPU_MMCR + 16(r9)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r4, SPRN_PMC2
> +	mfspr	r5, SPRN_PMC3
> +	mfspr	r6, SPRN_PMC4
> +	mfspr	r7, SPRN_PMC5
> +	mfspr	r8, SPRN_PMC6
> +	stw	r3, VCPU_PMC(r9)
> +	stw	r4, VCPU_PMC + 4(r9)
> +	stw	r5, VCPU_PMC + 8(r9)
> +	stw	r6, VCPU_PMC + 12(r9)
> +	stw	r7, VCPU_PMC + 16(r9)
> +	stw	r8, VCPU_PMC + 20(r9)
> +22:
> +	/* save FP state */
> +	mr	r3, r9
> +	bl	.kvmppc_save_fp
> +
> +	/* RFI into the highmem handler */
> +	mfmsr	r7
> +	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
> +	mtsrr1	r7
> +	/* Load highmem handler address */
> +	ld	r8, VCPU_HIGHMEM_HANDLER(r3)
> +	mtsrr0	r8
> +
> +	RFI
> +kvmppc_handler_trampoline_exit_end:
> +
> +6:	mfspr	r6,SPRN_HDAR
> +	mfspr	r7,SPRN_HDSISR
> +	b	7b
> +
> +ignore_hdec:
> +	mr	r4,r9
> +	b	fast_guest_return
> +
> +bounce_ext_interrupt:
> +	mr	r4,r9
> +	mtspr	SPRN_SRR0,r10
> +	mtspr	SPRN_SRR1,r11
> +	li	r10,0x500
> +	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
> +	b	fast_guest_return
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 4b54148..e5e3f92 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,8 +38,12 @@
> 
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> +#ifndef CONFIG_KVM_BOOK3S_64_HV
> +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> 	       !!(v->arch.pending_exceptions);
> +#else
> +	return 1;

So what happens if the guest sets MSR_WE? It just stays in guest context idling? That'd be pretty bad for scheduling on the host.

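If a PAPR guest idles via H_CEDE rather than by setting MSR_WE, the host could
block the vcpu when it sees that hcall instead of spinning in guest context.
A minimal sketch of what I mean (the function name and its wiring are
hypothetical; H_CEDE/H_SUCCESS come from asm/hvcall.h and kvmppc_vcpu_block()
from this patch, so this would live in book3s_hv.c):

	/* sketch: let a ceded vcpu sleep so the host can schedule other work */
	static int kvmppc_try_do_hcall(struct kvm_vcpu *vcpu)
	{
		unsigned long req = kvmppc_get_gpr(vcpu, 3);

		switch (req) {
		case H_CEDE:				/* guest has nothing to do */
			kvmppc_vcpu_block(vcpu);	/* wait for dec/external intr */
			kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
			return RESUME_GUEST;
		default:
			return RESUME_HOST;		/* punt everything else to qemu */
		}
	}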
> +#endif
> }
> 
> int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> @@ -52,7 +56,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> 	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
> 	unsigned long r2 = 0;
> 
> -	if (!(vcpu->arch.shared->msr & MSR_SF)) {
> +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
> 		/* 32 bit mode */
> 		param1 &= 0xffffffff;
> 		param2 &= 0xffffffff;
> @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_IRQ_LEVEL:
> 	case KVM_CAP_ENABLE_CAP:
> 	case KVM_CAP_PPC_OSI:
> +#ifndef CONFIG_KVM_BOOK3S_64_HV

You also don't do OSI on HV :).

> 	case KVM_CAP_PPC_GET_PVINFO:
> 		r = 1;
> 		break;
> 	case KVM_CAP_COALESCED_MMIO:
> 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
> 		break;
> +#endif
> 	default:
> 		r = 0;
> 		break;
> @@ -286,6 +292,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
> 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
> 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
> +	vcpu->arch.dec_expires = ~(u64)0;
> 
> 	return 0;
> }
> @@ -474,6 +481,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
> 		for (i = 0; i < 32; i++)
> 			kvmppc_set_gpr(vcpu, i, gprs[i]);
> 		vcpu->arch.osi_needed = 0;
> +	} else if (vcpu->arch.hcall_needed) {
> +		int i;
> +
> +		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
> +		for (i = 0; i < 6; ++i)
> +			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
> +		vcpu->arch.hcall_needed = 0;
> 	}
> 
> 	kvmppc_core_deliver_interrupts(vcpu);
> @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> 	if (waitqueue_active(&vcpu->wq)) {
> 		wake_up_interruptible(&vcpu->wq);
> 		vcpu->stat.halt_wakeup++;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	} else if (vcpu->cpu != -1) {
> +		smp_send_reschedule(vcpu->cpu);

Shouldn't this be done for non-HV too? The only reason we don't do it yet is because we don't do SMP, no?

> +#endif
> 	}
> -

eh... :)

> 	return 0;
> }
> 
> diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
> index d62a14b..e5c99b8 100644
> --- a/arch/powerpc/kvm/trace.h
> +++ b/arch/powerpc/kvm/trace.h
> @@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
>  *                         Book3S trace points                           *
>  *************************************************************************/
> 
> -#ifdef CONFIG_PPC_BOOK3S
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 
> TRACE_EVENT(kvm_book3s_exit,
> 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..a4447ce 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_NMI              16
> #define KVM_EXIT_INTERNAL_ERROR   17
> #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_PAPR_HCALL	  19
> 
> /* For KVM_EXIT_INTERNAL_ERROR */
> #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,11 @@ struct kvm_run {
> 		struct {
> 			__u64 gprs[32];
> 		} osi;
> +		struct {
> +			__u64 nr;
> +			__u64 ret;
> +			__u64 args[9];
> +		} papr_hcall;

This needs some information in the documentation.

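Something along these lines in Documentation/kvm/api.txt would do (wording is
only a sketch, based on how the exit is filled in and completed above):

		/* KVM_EXIT_PAPR_HCALL */
		struct {
			__u64 nr;
			__u64 ret;
			__u64 args[9];
		} papr_hcall;

	This is used on 64-bit Book3S PowerPC when the guest makes a PAPR
	hypercall that the kernel does not handle itself.  'nr' is the
	hypercall number and 'args' holds its arguments; userspace performs
	the hypercall, fills in 'ret', and resumes the vcpu.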

Alex

PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list, so people interested in kvm on ppc get your patches as well :).

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
@ 2011-05-15 21:58     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-15 21:58 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, kvm-ppc, KVM list


On 11.05.2011, at 12:44, Paul Mackerras wrote:

> This adds support for KVM running on 64-bit Book 3S processors,
> specifically POWER7, in hypervisor mode.  Using hypervisor mode means
> that the guest can use the processor's supervisor mode.  That means
> that the guest can execute privileged instructions and access privileged
> registers itself without trapping to the host.  This gives excellent
> performance, but does mean that KVM cannot emulate a processor
> architecture other than the one that the hardware implements.
>
> This code assumes that the guest is running paravirtualized using the
> PAPR (Power Architecture Platform Requirements) interface, which is the
> interface that IBM's PowerVM hypervisor uses.  That means that existing
> Linux distributions that run on IBM pSeries machines will also run
> under KVM without modification.  In order to communicate the PAPR
> hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
> to include/linux/kvm.h.
>
> Currently the choice between book3s_hv support and book3s_pr support
> (i.e. the existing code, which runs the guest in user mode) has to be
> made at kernel configuration time, so a given kernel binary can only
> do one or the other.
>
> This new book3s_hv code doesn't support MMIO emulation at present.
> Since we are running paravirtualized guests, this isn't a serious
> restriction.
>
> With the guest running in supervisor mode, most exceptions go straight
> to the guest.  We will never get data or instruction storage or segment
> interrupts, alignment interrupts, decrementer interrupts, program
> interrupts, single-step interrupts, etc., coming to the hypervisor from
> the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
> exception entry path so that we don't have to do the KVM test on entry
> to those exception handlers.
>
> We do however get hypervisor decrementer, hypervisor data storage,
> hypervisor instruction storage, and hypervisor emulation assist
> interrupts, so we have to handle those.
>
> In hypervisor mode, real-mode accesses can access all of RAM, not just
> a limited amount.  Therefore we put all the guest state in the vcpu.arch
> and use the shadow_vcpu in the PACA only for temporary scratch space.
> We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
> anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
>
> The POWER7 processor has a restriction that all threads in a core have
> to be in the same partition.  MMU-on kernel code counts as a partition
> (partition 0), so we have to do a partition switch on every entry to and
> exit from the guest.  At present we require the host and guest to run
> in single-thread mode because of this hardware restriction.
>
> This code allocates a hashed page table for the guest and initializes
> it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
> require that the guest memory is allocated using 16MB huge pages, in
> order to simplify the low-level memory management.  This also means that
> we can get away without tracking paging activity in the host for now,
> since huge pages can't be paged or swapped.
>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/exception-64s.h  |   27 +-
> arch/powerpc/include/asm/kvm_asm.h        |    4 +
> arch/powerpc/include/asm/kvm_book3s.h     |  148 ++++++-
> arch/powerpc/include/asm/kvm_book3s_asm.h |    4 +-
> arch/powerpc/include/asm/kvm_booke.h      |    4 +
> arch/powerpc/include/asm/kvm_host.h       |   60 +++-
> arch/powerpc/include/asm/kvm_ppc.h        |    6 +
> arch/powerpc/include/asm/mmu-hash64.h     |   10 +-
> arch/powerpc/include/asm/paca.h           |   10 +
> arch/powerpc/include/asm/reg.h            |    4 +
> arch/powerpc/kernel/asm-offsets.c         |   95 ++++-
> arch/powerpc/kernel/exceptions-64s.S      |   60 ++--
> arch/powerpc/kvm/Kconfig                  |   40 ++-
> arch/powerpc/kvm/Makefile                 |   16 +-
> arch/powerpc/kvm/book3s_64_mmu_hv.c       |  258 +++++++++++
> arch/powerpc/kvm/book3s_hv.c              |  413 ++++++++++++++++++
> arch/powerpc/kvm/book3s_hv_interrupts.S   |  326 ++++++++++++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  663 +++++++++++++++++++++++++++++
> arch/powerpc/kvm/powerpc.c                |   23 +-
> arch/powerpc/kvm/trace.h                  |    2 +-
> include/linux/kvm.h                       |    6 +
> 21 files changed, 2094 insertions(+), 85 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
> create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S
>=20
> diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
> index 2a770d8..d32e1ef 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -65,14 +65,14 @@
> 	GET_PACA(r13);							\
> 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
> 	std	r10,area+EX_R10(r13);					\
> -	mfcr	r9;							\
> 	extra(vec);							\
> -	std	r11,area+EX_R11(r13);					\
> -	std	r12,area+EX_R12(r13);					\
> 	BEGIN_FTR_SECTION_NESTED(66);					\
> 	mfspr	r10,SPRN_CFAR;						\
> 	std	r10,area+EX_CFAR(r13);					\
> 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
> +	mfcr	r9;							\
> +	extra(vec);							\
> +	std	r11,area+EX_R11(r13);					\
> +	std	r12,area+EX_R12(r13);					\


I don't really understand that change :).

> 	GET_SCRATCH0(r10);						\
> 	std	r10,area+EX_R13(r13)
> #define EXCEPTION_PROLOG_1(area, extra, vec)				\
> @@ -134,6 +134,17 @@ do_kvm_##n:					\
> #define KVM_HANDLER_SKIP(area, h, n)
> #endif
>=20
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

I really liked how you called the .c file _pr - why call it NONHV now?

> +#define KVMTEST_NONHV(n)		__KVMTEST(n)
> +#define KVM_HANDLER_NONHV(area, h, n)	__KVM_HANDLER(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
> +
> +#else
> +#define KVMTEST_NONHV(n)
> +#define KVM_HANDLER_NONHV(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)
> +#endif
> +
> #define NOTEST(n)
>=20
> /*
> @@ -210,7 +221,7 @@ label##_pSeries:					=
\
> 	HMT_MEDIUM;					\
> 	SET_SCRATCH0(r13);		/* save r13 */		\
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
> -				 EXC_STD, KVMTEST, vec)
> +				 EXC_STD, KVMTEST_NONHV, vec)
>=20
> #define STD_EXCEPTION_HV(loc, vec, label)		\
> 	. =3D loc;					\
> @@ -227,8 +238,8 @@ label##_hv:						\
> 	beq	masked_##h##interrupt
> #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
>
> -#define SOFTEN_TEST(vec)						\
> -	KVMTEST(vec);							\
> +#define SOFTEN_TEST_NONHV(vec)						\
> +	KVMTEST_NONHV(vec);						\
> 	_SOFTEN_TEST(EXC_STD)
>
> #define SOFTEN_TEST_HV(vec)						\
> @@ -248,7 +259,7 @@ label##_hv:						\
> 	.globl label##_pSeries;						\
> label##_pSeries:							\
> 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
> -				    EXC_STD, SOFTEN_TEST)
> +				    EXC_STD, SOFTEN_TEST_NONHV)
>
> #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
> 	. = loc;							\
> diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
> index 0951b17..7b1f0e0 100644
> --- a/arch/powerpc/include/asm/kvm_asm.h
> +++ b/arch/powerpc/include/asm/kvm_asm.h
> @@ -64,8 +64,12 @@
> #define BOOK3S_INTERRUPT_PROGRAM	0x700
> #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
> #define BOOK3S_INTERRUPT_DECREMENTER	0x900
> +#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
> #define BOOK3S_INTERRUPT_SYSCALL	0xc00
> #define BOOK3S_INTERRUPT_TRACE		0xd00
> +#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
> +#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
> +#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
> #define BOOK3S_INTERRUPT_PERFMON	0xf00
> #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
> #define BOOK3S_INTERRUPT_VSX		0xf40
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 12829bb..5b76073 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -117,6 +117,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, =
u64 new_msr);
> extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
> extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
> extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
> +extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
> extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct =
kvmppc_pte *pte);
> extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
> extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
> @@ -128,10 +129,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu =
*vcpu);
> extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct =
hpte_cache *pte);
> extern int kvmppc_mmu_hpte_sysinit(void);
> extern void kvmppc_mmu_hpte_sysexit(void);
> +extern int kvmppc_mmu_hv_init(void);
>=20
> extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, =
void *ptr, bool data);
> extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, =
void *ptr, bool data);
> extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, =
unsigned int vec);
> +extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, =
u64 flags);
> extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat =
*bat,
> 			   bool upper, u32 val);
> extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
> @@ -152,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s =
*to_book3s(struct kvm_vcpu *vcpu)
> 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
> }
>=20
> +extern void kvm_return_point(void);
> +
> +/* Also add subarch specific defines */
> +
> +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> +#include <asm/kvm_book3s_32.h>
> +#endif
> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#include <asm/kvm_book3s_64.h>
> +#endif
> +
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +
> #define vcpu_guest_state(vcpu)	((vcpu)->arch.shared)
>=20
> static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu =
*vcpu)
> @@ -168,16 +184,6 @@ static inline void =
kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> 		vcpu_guest_state(vcpu)->int_pending =3D 0;
> }
>=20
> -static inline ulong dsisr(void)
> -{
> -	ulong r;
> -	asm ( "mfdsisr %0 " : "=3Dr" (r) );
> -	return r;
> -}
> -
> -extern void kvm_return_point(void);
> -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct =
kvm_vcpu *vcpu);
> -
> static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, =
ulong val)
> {
> 	if ( num < 14 ) {
> @@ -265,6 +271,11 @@ static inline ulong kvmppc_get_fault_dar(struct =
kvm_vcpu *vcpu)
> 	return to_svcpu(vcpu)->fault_dar;
> }
>=20
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> +
> static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> {
> 	ulong crit_raw =3D vcpu_guest_state(vcpu)->critical;
> @@ -284,6 +295,115 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
>
> 	return crit;
> }
> +#else /* CONFIG_KVM_BOOK3S_NONHV */
> +
> +#define vcpu_guest_state(vcpu)	(&(vcpu)->arch)
> +
> +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
> +{
> +	return 0;
> +}
> +
> +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> +			unsigned long pending_now, unsigned long old_pending)
> +{
> +	/* Recalculate LPCR:MER based on the presence of
> +	 * a pending external interrupt
> +	 */
> +	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions) ||
> +	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions))

Isn't pending_now just vcpu->arch.pending_exceptions here? If so, the argument
could be used directly instead of re-reading the field.

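Roughly (sketch only, assuming pending_now carries the same bitmask that
pending_exceptions holds):

	if (pending_now & (BIT(BOOK3S_IRQPRIO_EXTERNAL) |
			   BIT(BOOK3S_IRQPRIO_EXTERNAL_LEVEL)))
		vcpu->arch.lpcr |= LPCR_MER;
	else
		vcpu->arch.lpcr &= ~(u64)LPCR_MER;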
> +		vcpu->arch.lpcr |= LPCR_MER;
> +	else
> +		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
> +}
> +
> +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, =
ulong val)
> +{
> +	vcpu->arch.gpr[num] =3D val;
> +}
> +
> +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
> +{
> +	return vcpu->arch.gpr[num];
> +}
> +
> +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.cr =3D val;
> +}
> +
> +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.cr;
> +}
> +
> +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.xer =3D val;
> +}
> +
> +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.xer;
> +}
> +
> +static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.ctr =3D val;
> +}
> +
> +static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.ctr;
> +}
> +
> +static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.lr =3D val;
> +}
> +
> +static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.lr;
> +}
> +
> +static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.pc =3D val;
> +}
> +
> +static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.pc;
> +}
> +
> +static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
> +{
> +	ulong pc =3D kvmppc_get_pc(vcpu);
> +
> +	/* Load the instruction manually if it failed to do so in the
> +	 * exit path */
> +	if (vcpu->arch.last_inst =3D=3D KVM_INST_FETCH_FAILED)
> +		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, =
false);
> +
> +	return vcpu->arch.last_inst;
> +}
> +
> +static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.fault_dar;
> +}
> +
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.msr;
> +}
> +
> +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> +{
> +	return false;
> +}
> +#endif
>=20
> /* Magic register values loaded into r3 and r4 before the 'sc' =
assembly
>  * instruction for the OSI hypercalls */
> @@ -292,12 +412,4 @@ static inline bool kvmppc_critical_section(struct =
kvm_vcpu *vcpu)
>=20
> #define INS_DCBZ			0x7c0007ec
>=20
> -/* Also add subarch specific defines */
> -
> -#ifdef CONFIG_PPC_BOOK3S_32
> -#include <asm/kvm_book3s_32.h>
> -#else
> -#include <asm/kvm_book3s_64.h>
> -#endif
> -
> #endif /* __ASM_KVM_BOOK3S_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index d5a8a38..d7279f5 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -61,6 +61,7 @@ kvmppc_resume_\intno:
> #else  /*__ASSEMBLY__ */
>=20
> struct kvmppc_book3s_shadow_vcpu {
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

Do you really want any shadow_vcpu in the paca at all?

> 	ulong gpr[14];
> 	u32 cr;
> 	u32 xer;
> @@ -72,6 +73,7 @@ struct kvmppc_book3s_shadow_vcpu {
> 	ulong pc;
> 	ulong shadow_srr1;
> 	ulong fault_dar;
> +#endif
>=20
> 	ulong host_r1;
> 	ulong host_r2;
> @@ -84,7 +86,7 @@ struct kvmppc_book3s_shadow_vcpu {
> #ifdef CONFIG_PPC_BOOK3S_32
> 	u32     sr[16];			/* Guest SRs */
> #endif
> -#ifdef CONFIG_PPC_BOOK3S_64
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_NONHV)
> 	u8 slb_max;			/* highest used guest slb entry =
*/
> 	struct  {
> 		u64     esid;
> diff --git a/arch/powerpc/include/asm/kvm_booke.h =
b/arch/powerpc/include/asm/kvm_booke.h
> index 9c9ba3d..a90e091 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct =
kvm_vcpu *vcpu)
> 	return vcpu->arch.fault_dear;
> }
>=20
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> #endif /* __ASM_KVM_BOOKE_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 3ebe51b..ec62365 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -33,7 +33,9 @@
> /* memory slots that does not exposed to userspace */
> #define KVM_PRIVATE_MEM_SLOTS 4
>=20
> +#ifdef CONFIG_KVM_MMIO
> #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
> +#endif
>=20
> /* We don't currently support large pages. */
> #define KVM_HPAGE_GFN_SHIFT(x)	0
> @@ -133,7 +135,24 @@ struct kvmppc_exit_timing {
> 	};
> };
>=20
> +struct kvmppc_pginfo {
> +	unsigned long pfn;
> +	atomic_t refcnt;
> +};
> +
> struct kvm_arch {
> +	unsigned long hpt_virt;
> +	unsigned long ram_npages;
> +	unsigned long ram_psize;
> +	unsigned long ram_porder;
> +	struct kvmppc_pginfo *ram_pginfo;
> +	unsigned int lpid;
> +	unsigned int host_lpid;
> +	unsigned long host_lpcr;
> +	unsigned long sdr1;
> +	unsigned long host_sdr1;
> +	int tlbie_lock;
> +	unsigned short last_vcpu[NR_CPUS];

This should all be #ifdef CONFIG_KVM_BOOK3S_HV

> };
>=20
> struct kvmppc_pte {
> @@ -190,7 +209,7 @@ struct kvm_vcpu_arch {
> 	ulong rmcall;
> 	ulong host_paca_phys;
> 	struct kvmppc_slb slb[64];
> -	int slb_max;		/* # valid entries in slb[] */
> +	int slb_max;		/* 1 + index of last valid entry in =
slb[] */
> 	int slb_nr;		/* total number of entries in SLB */
> 	struct kvmppc_mmu mmu;
> #endif
> @@ -204,9 +223,10 @@ struct kvm_vcpu_arch {
> 	vector128 vr[32];
> 	vector128 vscr;
> #endif
> +	u32 vrsave;
>=20
> #ifdef CONFIG_VSX
> -	u64 vsr[32];
> +	u64 vsr[64];
> #endif
>=20
> #ifdef CONFIG_PPC_BOOK3S
> @@ -214,29 +234,45 @@ struct kvm_vcpu_arch {
> 	u32 qpr[32];
> #endif
>=20
> -#ifdef CONFIG_BOOKE
> -	ulong pc;
> 	ulong ctr;
> 	ulong lr;
>=20
> 	ulong xer;
> 	u32 cr;
> -#endif
> +
> +	ulong pc;
> +	ulong msr;
>=20
> #ifdef CONFIG_PPC_BOOK3S
> 	ulong shadow_msr;
> 	ulong hflags;
> 	ulong guest_owned_ext;
> +	ulong purr;
> +	ulong spurr;
> +	ulong lpcr;
> +	ulong dscr;
> +	ulong amr;
> +	ulong uamor;
> +	u32 ctrl;
> +	u32 dsisr;
> +	ulong dabr;
> #endif
> 	u32 mmucr;
> +	ulong sprg0;
> +	ulong sprg1;
> +	ulong sprg2;
> +	ulong sprg3;
> 	ulong sprg4;
> 	ulong sprg5;
> 	ulong sprg6;
> 	ulong sprg7;
> +	ulong srr0;
> +	ulong srr1;
> 	ulong csrr0;
> 	ulong csrr1;
> 	ulong dsrr0;
> 	ulong dsrr1;
> +	ulong dear;
> 	ulong esr;
> 	u32 dec;
> 	u32 decar;
> @@ -259,6 +295,9 @@ struct kvm_vcpu_arch {
> 	u32 dbcr1;
> 	u32 dbsr;
>=20
> +	u64 mmcr[3];
> +	u32 pmc[6];
> +
> #ifdef CONFIG_KVM_EXIT_TIMING
> 	struct kvmppc_exit_timing timing_exit;
> 	struct kvmppc_exit_timing timing_last_enter;
> @@ -272,8 +311,12 @@ struct kvm_vcpu_arch {
> 	struct dentry *debugfs_exit_timing;
> #endif
>=20
> +#ifdef CONFIG_PPC_BOOK3S
> +	ulong fault_dar;
> +	u32 fault_dsisr;
> +#endif
> +
> #ifdef CONFIG_BOOKE
> -	u32 last_inst;
> 	ulong fault_dear;
> 	ulong fault_esr;
> 	ulong queued_dear;
> @@ -288,13 +331,18 @@ struct kvm_vcpu_arch {
> 	u8 dcr_is_write;
> 	u8 osi_needed;
> 	u8 osi_enabled;
> +	u8 hcall_needed;
>=20
> 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
>=20
> 	struct hrtimer dec_timer;
> 	struct tasklet_struct tasklet;
> 	u64 dec_jiffies;
> +	u64 dec_expires;
> 	unsigned long pending_exceptions;
> +	u16 last_cpu;
> +	u32 last_inst;
> +	int trap;
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page =
to */
> 	unsigned long magic_page_ea; /* effect. addr to map the magic =
page to */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h =
b/arch/powerpc/include/asm/kvm_ppc.h
> index 3210911..cd9ad96 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -110,6 +110,12 @@ extern void kvmppc_booke_exit(void);
> extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
> extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
>=20
> +extern long kvmppc_alloc_hpt(struct kvm *kvm);
> +extern void kvmppc_free_hpt(struct kvm *kvm);
> +extern long kvmppc_prepare_vrma(struct kvm *kvm,
> +				struct kvm_userspace_memory_region =
*mem);
> +extern void kvmppc_map_vrma(struct kvm *kvm,
> +			    struct kvm_userspace_memory_region *mem);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h =
b/arch/powerpc/include/asm/mmu-hash64.h
> index ae7b3ef..0bb3fc1 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -90,13 +90,19 @@ extern char initial_stab[];
>=20
> #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
> #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
> +#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
> #define HPTE_R_RPN_SHIFT	12
> -#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
> -#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
> +#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
> #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
> #define HPTE_R_N		ASM_CONST(0x0000000000000004)
> +#define HPTE_R_G		ASM_CONST(0x0000000000000008)
> +#define HPTE_R_M		ASM_CONST(0x0000000000000010)
> +#define HPTE_R_I		ASM_CONST(0x0000000000000020)
> +#define HPTE_R_W		ASM_CONST(0x0000000000000040)
> +#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
> #define HPTE_R_C		ASM_CONST(0x0000000000000080)
> #define HPTE_R_R		ASM_CONST(0x0000000000000100)
> +#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
>=20
> #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
> #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 7412676..8dba5f6 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -149,6 +149,16 @@ struct paca_struct {
> #ifdef CONFIG_KVM_BOOK3S_HANDLER
> 	/* We use this to store guest state in */
> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	struct kvm_vcpu *kvm_vcpu;
> +	u64 dabr;
> +	u64 host_mmcr[3];
> +	u32 host_pmc[6];
> +	u64 host_purr;
> +	u64 host_spurr;
> +	u64 host_dscr;
> +	u64 dec_expires;

Hrm. I'd say either push those into shadow_vcpu for HV mode or get rid
of the shadow_vcpu reference. I'd probably prefer the former.

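One possible shape for that, as a sketch (the struct and member names here are
made up; the fields are just the HV additions from the paca hunk above):

	/* sketch: HV-only host state, instead of growing shadow_vcpu's neighbours */
	struct kvmppc_host_state {
		struct kvm_vcpu *kvm_vcpu;	/* vcpu running on this thread */
		u64 dabr;
		u64 host_mmcr[3];
		u32 host_pmc[6];
		u64 host_purr;
		u64 host_spurr;
		u64 host_dscr;
		u64 dec_expires;
	};

with a single "struct kvmppc_host_state kvm_hstate;" member in the paca,
compiled in only for CONFIG_KVM_BOOK3S_64_HV.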
> +#endif
> #endif
> };
>=20
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index c07b7be..0036977 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -189,6 +189,10 @@
> #define SPRN_CTR	0x009	/* Count Register */
> #define SPRN_DSCR	0x11
> #define SPRN_CFAR	0x1c	/* Come From Address Register */
> +#define SPRN_AMR	0x1d	/* Authority Mask Register */
> +#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
> +#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
> +#define SPRN_RWMR	885
> #define SPRN_CTRLF	0x088
> #define SPRN_CTRLT	0x098
> #define   CTRL_CT	0xc0000000	/* current thread */
> diff --git a/arch/powerpc/kernel/asm-offsets.c =
b/arch/powerpc/kernel/asm-offsets.c
> index 6887661..49e97fd 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -187,6 +187,7 @@ int main(void)
> 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
> 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, =
int_dword.any_int));
> 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, =
int_dword.fields.decr_int));
> +	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, =
pmcregs_in_use));
> 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
> 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
> #endif /* CONFIG_PPC_STD_MMU_64 */
> @@ -200,9 +201,17 @@ int main(void)
> 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, =
shadow_vcpu));
> -	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, =
slb));
> -	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, =
slb_max));
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
> +	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
> +	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
> +	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
> +	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, =
host_spurr));
> +	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
> +	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
> +	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, =
dec_expires));
> #endif
> +#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
> #endif /* CONFIG_PPC64 */
>=20
> 	/* RTAS */
> @@ -396,6 +405,28 @@ int main(void)
> 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, =
arch.host_stack));
> 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
> 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
> +	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
> +	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
> +#ifdef CONFIG_ALTIVEC
> +	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
> +	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
> +#endif
> +#ifdef CONFIG_VSX
> +	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
> +#endif
> +	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
> +	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
> +	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
> +	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
> +	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
> +	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
> +	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
> +	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.srr0));
> +	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.srr1));
> +	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.sprg0));
> +	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.sprg1));
> +	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.sprg2));
> +	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.sprg3));
> 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
> 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
> 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
> @@ -406,16 +437,65 @@ int main(void)
>=20
> 	/* book3s */
> #ifdef CONFIG_PPC_BOOK3S
> +	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
> +	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
> +	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
> +	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
> +	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
> +	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
> +	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, =
online_vcpus.counter));
> +	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
> +	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
> +	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
> 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, =
arch.host_retip));
> 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
> 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, =
arch.shadow_msr));
> +	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
> +	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
> +	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
> +	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
> +	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
> +	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
> +	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
> 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, =
arch.trampoline_lowmem));
> 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, =
arch.trampoline_enter));
> 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, =
arch.highmem_handler));
> 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
> 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
> +	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.dsisr));
> +	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.dear));
> +	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
> +	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, =
arch.dec_expires));
> +	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
> +	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
> +	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
> +	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> +	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
> +	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
> +	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, =
arch.fault_dsisr));
> +	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, =
arch.fault_dar));
> +	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, =
arch.last_inst));
> +	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
> 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, =
shadow_vcpu) -
> 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
> +	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, =
host_r1));
> +	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, =
host_r2));
> +	DEFINE(SVCPU_SCRATCH0, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> +					scratch0));
> +	DEFINE(SVCPU_SCRATCH1, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> +					scratch1));
> +	DEFINE(SVCPU_IN_GUEST, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> +					in_guest));
> +	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
> +	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
> +	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +#ifdef CONFIG_PPC64
> +	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, =
slb));
> +	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, =
slb_max));
> +#endif
> +	DEFINE(SVCPU_VMHANDLER, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> +					 vmhandler));
> 	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, =
cr));
> 	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, =
xer));
> 	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, =
ctr));
> @@ -435,16 +515,6 @@ int main(void)
> 	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, =
gpr[11]));
> 	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, =
gpr[12]));
> 	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, =
gpr[13]));
> -	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, =
host_r1));
> -	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, =
host_r2));
> -	DEFINE(SVCPU_VMHANDLER, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> -					 vmhandler));
> -	DEFINE(SVCPU_SCRATCH0, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> -					scratch0));
> -	DEFINE(SVCPU_SCRATCH1, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> -					scratch1));
> -	DEFINE(SVCPU_IN_GUEST, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> -					in_guest));
> 	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> 					   fault_dsisr));
> 	DEFINE(SVCPU_FAULT_DAR, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> @@ -453,6 +523,7 @@ int main(void)
> 					 last_inst));
> 	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct =
kvmppc_book3s_shadow_vcpu,
> 					   shadow_srr1));
> +#endif /* CONFIG_KVM_BOOK3S_NONHV */
> #ifdef CONFIG_PPC_BOOK3S_32
> 	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, =
sr));
> #endif
> diff --git a/arch/powerpc/kernel/exceptions-64s.S =
b/arch/powerpc/kernel/exceptions-64s.S
> index cbdf374..80c6456 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -87,14 +87,14 @@ data_access_not_stab:
> END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
> #endif
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, =
EXC_STD,
> -				 KVMTEST, 0x300)
> +				 KVMTEST_NONHV, 0x300)
>=20
> 	. =3D 0x380
> 	.globl data_access_slb_pSeries
> data_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x380)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_DAR
> #ifdef __DISABLED__
> @@ -125,7 +125,7 @@ data_access_slb_pSeries:
> instruction_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x480)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
> #ifdef __DISABLED__
> @@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
> hardware_interrupt_pSeries:
> hardware_interrupt_hv:
> 	BEGIN_FTR_SECTION
> -		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> -					    EXC_STD, SOFTEN_TEST)
> -		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> -	FTR_SECTION_ELSE
> 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
> 					    EXC_HV, SOFTEN_TEST_HV)
> 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
> -	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
> +	FTR_SECTION_ELSE
> +		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> +					    EXC_STD, SOFTEN_TEST_NONHV)
> +		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> +	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
>=20
> 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x600)
>=20
> 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x700)
>=20
> 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x800)
>=20
> 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
> 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
>=20
> 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xa00)
>=20
> 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xb00)
>=20
> 	. =3D 0xc00
> 	.globl	system_call_pSeries
> @@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
> 	b	.
>=20
> 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xd00)
>=20
> 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
> 	 * out of line to handle them
> @@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
>=20
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> #endif /* CONFIG_CBE_RAS */
>=20
> 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
>=20
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> #endif /* CONFIG_CBE_RAS */
>=20
> 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x1700)
>=20
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> #endif /* CONFIG_CBE_RAS */
>=20
> 	. =3D 0x3000
> @@ -297,7 +297,7 @@ data_access_check_stab:
> 	mfspr	r9,SPRN_DSISR
> 	srdi	r10,r10,60
> 	rlwimi	r10,r9,16,0x20
> -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
> 	rlwimi	r10,r9,8,0x300
> #endif
> @@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
> 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
> #endif /* CONFIG_POWER4_ONLY */
>=20
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> -	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
> -	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x400)
> +	KVM_HANDLER_NONHV(PACA_EXSLB, EXC_STD, 0x480)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x900)
> 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
>=20
> 	.align	7
> @@ -337,11 +337,11 @@ do_stab_bolted_pSeries:
>=20
> 	/* moved from 0xf00 */
> 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf00)
> 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf20)
> 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf40)
>=20
> /*
>  * An interrupt came in while soft-disabled; clear EE in SRR1,
> @@ -418,7 +418,11 @@ slb_miss_user_pseries:
> /* KVM's trampoline code needs to be close to the interrupt handlers =
*/
>=20
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> #include "../kvm/book3s_rmhandlers.S"
> +#else
> +#include "../kvm/book3s_hv_rmhandlers.S"
> +#endif
> #endif
>=20
> 	.align	7
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index b7baff7..6ff191b 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -20,7 +20,6 @@ config KVM
> 	bool
> 	select PREEMPT_NOTIFIERS
> 	select ANON_INODES
> -	select KVM_MMIO
>=20
> config KVM_BOOK3S_HANDLER
> 	bool
> @@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
> config KVM_BOOK3S_32_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> +	select KVM_MMIO
>=20
> config KVM_BOOK3S_64_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
>=20
> +config KVM_BOOK3S_NONHV
> +	bool
> +	select KVM_MMIO
> +
> config KVM_BOOK3S_32
> 	tristate "KVM support for PowerPC book3s_32 processors"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
> 	select KVM
> 	select KVM_BOOK3S_32_HANDLER
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_32 guest kernels
> 	  in virtual machines on book3s_32 host processors.
> @@ -48,10 +53,38 @@ config KVM_BOOK3S_32
> 	  If unsure, say N.
>=20
> config KVM_BOOK3S_64
> -	tristate "KVM support for PowerPC book3s_64 processors"
> +	bool
> +	select KVM_BOOK3S_64_HANDLER
> +
> +config KVM_BOOK3S_64_HV
> +	bool "KVM support for POWER7 using hypervisor mode in host"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_64
> 	select KVM
> -	select KVM_BOOK3S_64_HANDLER
> +	select KVM_BOOK3S_64
> +	---help---
> +	  Support running unmodified book3s_64 guest kernels in
> +	  virtual machines on POWER7 processors that have hypervisor
> +	  mode available to the host.
> +
> +	  If you say Y here, KVM will use the hardware virtualization
> +	  facilities of POWER7 (and later) processors, meaning that
> +	  guest operating systems will run at full hardware speed
> +	  using supervisor and user modes.  However, this also means
> +	  that KVM is not usable under PowerVM (pHyp), is only usable
> +	  on POWER7 (or later) processors, and can only emulate
> +	  POWER5+, POWER6 and POWER7 processors.
> +
> +	  This module provides access to the hardware capabilities =
through
> +	  a character device node named /dev/kvm.
> +
> +	  If unsure, say N.
> +
> +config KVM_BOOK3S_64_NONHV
> +	tristate "KVM support for PowerPC book3s_64 processors"
> +	depends on EXPERIMENTAL && PPC_BOOK3S_64 && !KVM_BOOK3S_64_HV
> +	select KVM
> +	select KVM_BOOK3S_64
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_64 and book3s_32 guest =
kernels
> 	  in virtual machines on book3s_64 host processors.
> @@ -65,6 +98,7 @@ config KVM_440
> 	bool "KVM support for PowerPC 440 processors"
> 	depends on EXPERIMENTAL && 44x
> 	select KVM
> +	select KVM_MMIO

e500 should also select MMIO, no?

> 	---help---
> 	  Support running unmodified 440 guest kernels in virtual =
machines on
> 	  440 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index bf9854f..37c1a60 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -14,7 +14,7 @@ CFLAGS_emulate.o  :=3D -I.
>=20
> common-objs-y +=3D powerpc.o emulate.o
> obj-$(CONFIG_KVM_EXIT_TIMING) +=3D timing.o
> -obj-$(CONFIG_KVM_BOOK3S_HANDLER) +=3D book3s_exports.o
> +obj-$(CONFIG_KVM_BOOK3S_NONHV) +=3D book3s_exports.o
>=20
> AFLAGS_booke_interrupts.o :=3D -I$(obj)
>=20
> @@ -38,7 +38,7 @@ kvm-e500-objs :=3D \
> 	e500_emulate.o
> kvm-objs-$(CONFIG_KVM_E500) :=3D $(kvm-e500-objs)
>=20
> -kvm-book3s_64-objs :=3D \
> +kvm-book3s_64_nonhv-objs :=3D \
> 	$(common-objs-y) \
> 	fpu.o \
> 	book3s_paired_singles.o \
> @@ -50,7 +50,17 @@ kvm-book3s_64-objs :=3D \
> 	book3s_64_mmu_host.o \
> 	book3s_64_mmu.o \
> 	book3s_32_mmu.o
> -kvm-objs-$(CONFIG_KVM_BOOK3S_64) :=3D $(kvm-book3s_64-objs)
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_NONHV) :=3D =
$(kvm-book3s_64_nonhv-objs)
> +
> +kvm-book3s_64_hv-objs :=3D \
> +	../../../virt/kvm/kvm_main.o \
> +	powerpc.o \
> +	emulate.o \
> +	book3s.o \
> +	book3s_hv.o \
> +	book3s_hv_interrupts.o \
> +	book3s_64_mmu_hv.o
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) :=3D $(kvm-book3s_64_hv-objs)
>=20
> kvm-book3s_32-objs :=3D \
> 	$(common-objs-y) \
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> new file mode 100644
> index 0000000..52d1be1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -0,0 +1,258 @@
> +/*
> + * This program is free software; you can redistribute it and/or =
modify
> + * it under the terms of the GNU General Public License, version 2, =
as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  =
02110-1301, USA.
> + *
> + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/highmem.h>
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +
> +#include <asm/tlbflush.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu-hash64.h>
> +#include <asm/hvcall.h>
> +#include <asm/synch.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/cputable.h>
> +
> +/* For now use fixed-size 16MB page table */
> +#define HPT_ORDER	24
> +#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg =
*/
> +#define HPT_HASH_MASK	(HPT_NPTEG - 1)
> +
> +/* Pages in the VRMA are 16MB pages */
> +#define VRMA_PAGE_ORDER	24
> +#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
> +
> +#define NR_LPIDS	1024
> +unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
> +
> +long kvmppc_alloc_hpt(struct kvm *kvm)
> +{
> +	unsigned long hpt;
> +	unsigned long lpid;
> +
> +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> +			       HPT_ORDER - PAGE_SHIFT);

This would end up failing quite often, no?

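One way to make that less fragile would be to grab the 16MB chunk once, early,
and reuse it, instead of asking the buddy allocator for a 16MB-contiguous
block every time a VM starts.  Very rough sketch (one VM's worth only, just to
illustrate the idea; the function and variable names are invented):

	static unsigned long preallocated_hpt;

	/* sketch: allocate the HPT before memory gets fragmented */
	static int __init kvmppc_hpt_prealloc(void)
	{
		preallocated_hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
						    HPT_ORDER - PAGE_SHIFT);
		return preallocated_hpt ? 0 : -ENOMEM;
	}
	early_initcall(kvmppc_hpt_prealloc);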
> +	if (!hpt) {
> +		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
> +		return -ENOMEM;
> +	}
> +	kvm->arch.hpt_virt =3D hpt;
> +
> +	do {
> +		lpid =3D find_first_zero_bit(lpid_inuse, NR_LPIDS);
> +		if (lpid >=3D NR_LPIDS) {
> +			pr_err("kvm_alloc_hpt: No LPIDs free\n");
> +			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
> +			return -ENOMEM;
> +		}
> +	} while (test_and_set_bit(lpid, lpid_inuse));
> +
> +	kvm->arch.sdr1 =3D __pa(hpt) | (HPT_ORDER - 18);
> +	kvm->arch.lpid =3D lpid;
> +	kvm->arch.host_sdr1 =3D mfspr(SPRN_SDR1);
> +	kvm->arch.host_lpid =3D mfspr(SPRN_LPID);
> +	kvm->arch.host_lpcr =3D mfspr(SPRN_LPCR);

How do these correlate with the guest's hv mmu? I'd like to keep the
code clean enough so we can potentially use it for PR mode as well :).

> +=09
> +	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
> +	return 0;
> +}
> +
> +void kvmppc_free_hpt(struct kvm *kvm)
> +{
> +	unsigned long i;
> +	struct kvmppc_pginfo *pginfo;
> +
> +	clear_bit(kvm->arch.lpid, lpid_inuse);
> +	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
> +
> +	if (kvm->arch.ram_pginfo) {
> +		pginfo =3D kvm->arch.ram_pginfo;
> +		kvm->arch.ram_pginfo =3D NULL;
> +		for (i =3D 0; i < kvm->arch.ram_npages; ++i)
> +			put_page(pfn_to_page(pginfo[i].pfn));
> +		kfree(pginfo);
> +	}
> +}
> +
> +static unsigned long user_page_size(unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long size =3D PAGE_SIZE;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vma =3D find_vma(current->mm, addr);
> +	if (vma)
> +		size =3D vma_kernel_pagesize(vma);
> +	up_read(&current->mm->mmap_sem);
> +	return size;
> +}
> +
> +static pfn_t hva_to_pfn(unsigned long addr)
> +{
> +	struct page *page[1];
> +	int npages;
> +
> +	might_sleep();
> +
> +	npages =3D get_user_pages_fast(addr, 1, 1, page);
> +
> +	if (unlikely(npages !=3D 1))
> +		return 0;
> +
> +	return page_to_pfn(page[0]);
> +}
> +
> +long kvmppc_prepare_vrma(struct kvm *kvm,
> +			 struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long psize, porder;
> +	unsigned long i, npages;
> +	struct kvmppc_pginfo *pginfo;
> +	pfn_t pfn;
> +	unsigned long hva;
> +
> +	/* First see what page size we have */
> +	psize =3D user_page_size(mem->userspace_addr);
> +	/* For now, only allow 16MB pages */

The reason to go for 16MB pages is because of the host mmu code, not the
guest hv mmu. So please at least #ifdef the code to HV so we document
that correlation.

> +	if (psize !=3D 1ul << VRMA_PAGE_ORDER || (mem->memory_size & =
(psize - 1))) {
> +		pr_err("bad psize=3D%lx memory_size=3D%llx @ %llx\n",
> +		       psize, mem->memory_size, mem->userspace_addr);
> +		return -EINVAL;
> +	}
> +	porder =3D __ilog2(psize);
> +
> +	npages =3D mem->memory_size >> porder;
> +	pginfo =3D kzalloc(npages * sizeof(struct kvmppc_pginfo), =
GFP_KERNEL);
> +	if (!pginfo) {
> +		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu =
bytes\n",
> +		       npages * sizeof(struct kvmppc_pginfo));
> +		return -ENOMEM;
> +	}
> +
> +	for (i =3D 0; i < npages; ++i) {
> +		hva =3D mem->userspace_addr + (i << porder);
> +		if (user_page_size(hva) !=3D psize)
> +			goto err;
> +		pfn =3D hva_to_pfn(hva);
> +		if (pfn =3D=3D 0) {
> +			pr_err("oops, no pfn for hva %lx\n", hva);
> +			goto err;
> +		}
> +		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
> +			pr_err("oops, unaligned pfn %llx\n", pfn);
> +			put_page(pfn_to_page(pfn));
> +			goto err;
> +		}
> +		pginfo[i].pfn =3D pfn;
> +	}
> +
> +	kvm->arch.ram_npages =3D npages;
> +	kvm->arch.ram_psize =3D psize;
> +	kvm->arch.ram_porder =3D porder;
> +	kvm->arch.ram_pginfo =3D pginfo;
> +
> +	return 0;
> +
> + err:
> +	kfree(pginfo);
> +	return -EINVAL;
> +}
> +
> +void kvmppc_map_vrma(struct kvm *kvm, struct =
kvm_userspace_memory_region *mem)
> +{
> +	unsigned long i;
> +	unsigned long npages =3D kvm->arch.ram_npages;
> +	unsigned long pfn;
> +	unsigned long *hpte;
> +	unsigned long hash;
> +	struct kvmppc_pginfo *pginfo =3D kvm->arch.ram_pginfo;
> +
> +	if (!pginfo)
> +		return;
> +
> +	/* VRMA can't be > 1TB */
> +	if (npages > 1ul << (40 - kvm->arch.ram_porder))
> +		npages =3D 1ul << (40 - kvm->arch.ram_porder);
> +	/* Can't use more than 1 HPTE per HPTEG */
> +	if (npages > HPT_NPTEG)
> +		npages =3D HPT_NPTEG;
> +
> +	for (i =3D 0; i < npages; ++i) {
> +		pfn =3D pginfo[i].pfn;
> +		/* can't use hpt_hash since va > 64 bits */
> +		hash =3D (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & =
HPT_HASH_MASK;
> +		/*
> +		 * We assume that the hash table is empty and no
> +		 * vcpus are using it at this stage.  Since we create
> +		 * at most one HPTE per HPTEG, we just assume entry 7
> +		 * is available and use it.
> +		 */
> +		hpte =3D (unsigned long *) (kvm->arch.hpt_virt + (hash =
<< 7));
> +		hpte +=3D 7 * 2;
> +		/* HPTE low word - RPN, protection, etc. */
> +		hpte[1] =3D (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
> +			HPTE_R_M | PP_RWXX;
> +		wmb();
> +		hpte[0] =3D HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
> +			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
> +			HPTE_V_LARGE | HPTE_V_VALID;
> +	}
> +}
> +
> +int kvmppc_mmu_hv_init(void)
> +{
> +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;

Return 0 for "it doesn't work" might not be the right exit code ;).

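I.e. something like (sketch):

	int kvmppc_mmu_hv_init(void)
	{
		/* refuse to initialise rather than pretending to succeed */
		if (!cpu_has_feature(CPU_FTR_HVMODE_206))
			return -ENODEV;

		memset(lpid_inuse, 0, sizeof(lpid_inuse));
		set_bit(mfspr(SPRN_LPID), lpid_inuse);	/* host's own LPID */
		set_bit(NR_LPIDS - 1, lpid_inuse);	/* reserved LPID */

		return 0;
	}

(and the caller then has to check the result, of course).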
> +	memset(lpid_inuse, 0, sizeof(lpid_inuse));
> +	set_bit(mfspr(SPRN_LPID), lpid_inuse);
> +	set_bit(NR_LPIDS - 1, lpid_inuse);
> +
> +	return 0;
> +}
> +
> +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
> +}
> +
> +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> +				struct kvmppc_pte *gpte, bool data)
> +{
> +	return -ENOENT;

Can't you just call the normal book3s_64 mmu code here? Without xlate,
doing ppc_ld doesn't work, which means that reading out the faulting
guest instruction breaks. We'd also need it for PR mode :).

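Failing that, at least a real-mode fallback would let kvmppc_ld() fetch the
faulting instruction.  Rough sketch (untested; the kvmppc_pte field usage is
copied from the existing book3s xlate code, and real mode is treated as a 1:1
effective-to-real mapping with no protection checks):

	static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
					struct kvmppc_pte *gpte, bool data)
	{
		/* real mode: guest effective address == guest real address */
		if (!(vcpu->arch.msr & (data ? MSR_DR : MSR_IR))) {
			gpte->eaddr = eaddr;
			gpte->raddr = eaddr & KVM_PAM;
			gpte->vpage = eaddr >> 12;
			gpte->may_read = true;
			gpte->may_write = true;
			gpte->may_execute = true;
			return 0;
		}
		return -ENOENT;	/* MMU-on walk of the guest HPT still missing */
	}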
> +}
> +
> +void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_mmu *mmu =3D &vcpu->arch.mmu;
> +
> +	vcpu->arch.slb_nr =3D 32;		/* Assume POWER7 for now =
*/
> +
> +	mmu->xlate =3D kvmppc_mmu_book3s_64_hv_xlate;
> +	mmu->reset_msr =3D kvmppc_mmu_book3s_64_hv_reset_msr;
> +
> +	vcpu->arch.hflags |=3D BOOK3S_HFLAG_SLB;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> new file mode 100644
> index 0000000..f6b7cd1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -0,0 +1,413 @@
> +/*
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
> + *
> + * Authors:
> + *    Paul Mackerras <paulus@au1.ibm.com>
> + *    Alexander Graf <agraf@suse.de>
> + *    Kevin Wolf <mail@kevin-wolf.de>
> + *
> + * Description: KVM functions specific to running on Book 3S
> + * processors in hypervisor mode (specifically POWER7 and later).
> + *
> + * This file is derived from arch/powerpc/kvm/book3s.c,
> + * by Alexander Graf <agraf@suse.de>.
> + *
> + * This program is free software; you can redistribute it and/or =
modify
> + * it under the terms of the GNU General Public License, version 2, =
as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/preempt.h>
> +#include <linux/sched.h>
> +#include <linux/delay.h>
> +#include <linux/fs.h>
> +#include <linux/anon_inodes.h>
> +
> +#include <asm/reg.h>
> +#include <asm/cputable.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +#include <asm/uaccess.h>
> +#include <asm/io.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu_context.h>
> +#include <asm/lppaca.h>
> +#include <asm/processor.h>
> +#include <linux/gfp.h>
> +#include <linux/sched.h>
> +#include <linux/vmalloc.h>
> +#include <linux/highmem.h>
> +
> +/* #define EXIT_DEBUG */
> +/* #define EXIT_DEBUG_SIMPLE */
> +/* #define EXIT_DEBUG_INT */
> +
> +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	local_paca->kvm_vcpu =3D vcpu;
> +	vcpu->cpu =3D cpu;
> +}
> +
> +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->cpu =3D -1;
> +}
> +
> +void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +	unsigned long dec_nsec;
> +
> +	now =3D get_tb();
> +	if (now >=3D vcpu->arch.dec_expires && =
!kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_queue_dec(vcpu);
> +	if (vcpu->arch.pending_exceptions)
> +		return;
> +	if (vcpu->arch.dec_expires !=3D ~(u64)0) {
> +		dec_nsec =3D (vcpu->arch.dec_expires - now) * =
NSEC_PER_SEC /
> +			tb_ticks_per_sec;
> +		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, =
dec_nsec),
> +			      HRTIMER_MODE_REL);
> +	}
> +
> +	kvm_vcpu_block(vcpu);
> +	vcpu->stat.halt_wakeup++;
> +
> +	if (vcpu->arch.dec_expires !=3D ~(u64)0)
> +		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
> +}
> +
> +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
> +{
> +	vcpu->arch.msr =3D msr;
> +}
> +
> +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> +{
> +	vcpu->arch.pvr =3D pvr;
> +	kvmppc_mmu_book3s_hv_init(vcpu);

No need for you to do it depending on pvr. You can just do the mmu
initialization on normal init :).

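I.e. just (sketch):

	void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
	{
		vcpu->arch.pvr = pvr;
	}

and call kvmppc_mmu_book3s_hv_init(vcpu) once from the vcpu creation path
instead of every time userspace writes the PVR.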
> +}
> +
> +void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
> +{
> +	int r;
> +
> +	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
> +	pr_err("pc  =3D %.16lx  msr =3D %.16lx  trap =3D %x\n",
> +	       vcpu->arch.pc, vcpu->arch.msr, vcpu->arch.trap);
> +	for (r =3D 0; r < 16; ++r)
> +		pr_err("r%2d =3D %.16lx  r%d =3D %.16lx\n",
> +		       r, kvmppc_get_gpr(vcpu, r),
> +		       r+16, kvmppc_get_gpr(vcpu, r+16));
> +	pr_err("ctr =3D %.16lx  lr  =3D %.16lx\n",
> +	       vcpu->arch.ctr, vcpu->arch.lr);
> +	pr_err("srr0 =3D %.16lx srr1 =3D %.16lx\n",
> +	       vcpu->arch.srr0, vcpu->arch.srr1);
> +	pr_err("sprg0 =3D %.16lx sprg1 =3D %.16lx\n",
> +	       vcpu->arch.sprg0, vcpu->arch.sprg1);
> +	pr_err("sprg2 =3D %.16lx sprg3 =3D %.16lx\n",
> +	       vcpu->arch.sprg2, vcpu->arch.sprg3);
> +	pr_err("cr =3D %.8x  xer =3D %.16lx  dsisr =3D %.8x\n",
> +	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.dsisr);
> +	pr_err("dar =3D %.16lx\n", vcpu->arch.dear);
> +	pr_err("fault dar =3D %.16lx dsisr =3D %.8x\n",
> +	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
> +	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
> +	for (r = 0; r < vcpu->arch.slb_max; ++r)
> +		pr_err("  ESID = %.16llx VSID = %.16llx\n",
> +		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
> +	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
> +	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
> +	       vcpu->arch.last_inst);
> +}
> +
> +static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			      struct task_struct *tsk)
> +{
> +	int r =3D RESUME_HOST;
> +
> +	vcpu->stat.sum_exits++;
> +
> +	run->exit_reason =3D KVM_EXIT_UNKNOWN;
> +	run->ready_for_interrupt_injection =3D 1;
> +	switch (vcpu->arch.trap) {
> +	/* We're good on these - the host merely wanted to get our attention */
> +	case BOOK3S_INTERRUPT_HV_DECREMENTER:
> +		vcpu->stat.dec_exits++;
> +		r =3D RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_EXTERNAL:
> +		vcpu->stat.ext_intr_exits++;
> +		r =3D RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PERFMON:
> +		r =3D RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PROGRAM:
> +	{
> +		ulong flags;
> +		/*
> +		 * Normally program interrupts are delivered directly
> +		 * to the guest by the hardware, but we can get here
> +		 * as a result of a hypervisor emulation interrupt
> +		 * (e40) getting turned into a 700 by BML RTAS.

Not sure I fully understand what's going on there. Mind to explain? :)

> +		 */
> +		flags =3D vcpu->arch.msr & 0x1f0000ull;
> +		kvmppc_core_queue_program(vcpu, flags);
> +		r =3D RESUME_GUEST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_SYSCALL:
> +	{
> +		/* hcall - punt to userspace */
> +		int i;
> +

Do we really want to accept sc from pr=1? I'd just reject them
straight away.
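Untested sketch of what I have in mind, assuming we just bounce such a
call back to the guest as a plain syscall:

	if (vcpu->arch.msr & MSR_PR) {
		/* sc 1 from userspace: don't treat it as a host hcall,
		 * reflect it to the guest instead */
		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL);
		r = RESUME_GUEST;
		break;
	}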

> +		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
> +		for (i = 0; i < 9; ++i)
> +			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
> +		run->exit_reason = KVM_EXIT_PAPR_HCALL;
> +		vcpu->arch.hcall_needed = 1;
> +		r = RESUME_HOST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> +		vcpu->arch.dear = vcpu->arch.fault_dar;
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> +					0x08000000);

What do these do? I thought the guest gets DSI and ISI directly?

> +		r =3D RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
> +		kvmppc_core_queue_program(vcpu, 0x80000);
> +		r =3D RESUME_GUEST;
> +		break;
> +	default:
> +		kvmppc_dump_regs(vcpu);
> +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%lx\n",
> +			vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.msr);
> +		r = RESUME_HOST;
> +		BUG();
> +		break;
> +	}
> +
> +
> +	if (!(r & RESUME_HOST)) {
> +		/* To avoid clobbering exit_reason, only check for signals if
> +		 * we aren't already exiting to userspace for some other
> +		 * reason. */
> +		if (signal_pending(tsk)) {
> +			vcpu->stat.signal_exits++;
> +			run->exit_reason = KVM_EXIT_INTR;
> +			r = -EINTR;
> +		} else {
> +			kvmppc_core_deliver_interrupts(vcpu);
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i;
> +
> +	sregs->pvr = vcpu->arch.pvr;
> +
> +	memset(sregs, 0, sizeof(struct kvm_sregs));
> +	for (i = 0; i < vcpu->arch.slb_max; i++) {
> +		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
> +		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
> +	}
> +
> +	return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i, j;
> +
> +	kvmppc_set_pvr(vcpu, sregs->pvr);
> +
> +	j = 0;
> +	for (i = 0; i < vcpu->arch.slb_nr; i++) {
> +		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
> +			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
> +			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
> +			++j;
> +		}
> +	}
> +	vcpu->arch.slb_max = j;
> +
> +	return 0;
> +}
> +
> +int kvmppc_core_check_processor_compat(void)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;
> +	return -EIO;
> +}
> +
> +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> +{
> +	struct kvm_vcpu *vcpu;
> +	int err =3D -ENOMEM;
> +	unsigned long lpcr;
> +
> +	vcpu =3D kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> +	if (!vcpu)
> +		goto out;
> +
> +	err =3D kvm_vcpu_init(vcpu, kvm, id);
> +	if (err)
> +		goto free_vcpu;
> +
> +	vcpu->arch.last_cpu = -1;
> +	vcpu->arch.host_msr = mfmsr();
> +	vcpu->arch.mmcr[0] = MMCR0_FC;
> +	vcpu->arch.ctrl = CTRL_RUNLATCH;
> +	/* default to book3s_64 (power7) */
> +	vcpu->arch.pvr = 0x3f0200;

Wouldn't it make sense to simply default to the host pvr? Not sure -
maybe it's not :).
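I.e. something like (sketch only):

	/* default the virtual PVR to whatever the host actually is */
	vcpu->arch.pvr = mfspr(SPRN_PVR);
	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);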

> +	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
> +
> +	/* remember where some real-mode handlers are */
> +	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
> +	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
> +	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
> +	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
> +
> +	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
> +	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
> +	vcpu->arch.lpcr = lpcr;
> +
> +	return vcpu;
> +
> +free_vcpu:
> +	kfree(vcpu);
> +out:
> +	return ERR_PTR(err);
> +}
> +
> +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> +{
> +	kvm_vcpu_uninit(vcpu);
> +	kfree(vcpu);
> +}
> +
> +extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> +
> +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +
> +	if (signal_pending(current)) {
> +		run->exit_reason =3D KVM_EXIT_INTR;
> +		return -EINTR;
> +	}
> +
> +	flush_fp_to_thread(current);

Do those work fine with preemption enabled?
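If they do need to run non-preemptible, the obvious (untested) fix would
just be to pull the preempt_disable() up:

	preempt_disable();
	flush_fp_to_thread(current);
	flush_altivec_to_thread(current);
	flush_vsx_to_thread(current);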

> +	flush_altivec_to_thread(current);
> +	flush_vsx_to_thread(current);
> +	preempt_disable();
> +
> +	kvm_guest_enter();
> +
> +	__kvmppc_vcore_entry(NULL, vcpu);
> +
> +	kvm_guest_exit();
> +
> +	preempt_enable();
> +	kvm_resched(vcpu);
> +
> +	now = get_tb();
> +	/* cancel pending dec exception if dec is positive */
> +	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_dequeue_dec(vcpu);
> +
> +	return kvmppc_handle_exit(run, vcpu, current);
> +}
> +
> +int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		return kvmppc_prepare_vrma(kvm, mem);
> +	return 0;
> +}
> +
> +void kvmppc_core_commit_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		kvmppc_map_vrma(kvm, mem);
> +}
> +
> +int kvmppc_core_init_vm(struct kvm *kvm)
> +{
> +	long r;
> +
> +	/* Allocate hashed page table */
> +	r =3D kvmppc_alloc_hpt(kvm);
> +
> +	return r;
> +}
> +
> +void kvmppc_core_destroy_vm(struct kvm *kvm)
> +{
> +	kvmppc_free_hpt(kvm);
> +}
> +
> +/* These are stubs for now */
> +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
> +{
> +}
> +
> +/* We don't need to emulate any privileged instructions or dcbz */
> +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +                           unsigned int inst, int *advance)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +static int kvmppc_book3s_hv_init(void)
> +{
> +	int r;
> +
> +	r =3D kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
> +
> +	if (r)
> +		return r;
> +
> +	r =3D kvmppc_mmu_hv_init();
> +
> +	return r;
> +}
> +
> +static void kvmppc_book3s_hv_exit(void)
> +{
> +	kvm_exit();
> +}
> +
> +module_init(kvmppc_book3s_hv_init);
> +module_exit(kvmppc_book3s_hv_exit);
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
> new file mode 100644
> index 0000000..19d152d
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
> @@ -0,0 +1,326 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_interrupts.S, which is:
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +#include <asm/ppc-opcode.h>
> +
> +#define DISABLE_INTERRUPTS	\
> +	mfmsr   r0;		\
> +	rldicl  r0,r0,48,1;	\
> +	rotldi  r0,r0,16;	\
> +	mtmsrd  r0,1;		\
> +
> +/*****************************************************************************
> + *                                                                           *
> + *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
> + *                                                                           *
> + ****************************************************************************/
> +
> +/* Registers:
> + *  r4: vcpu pointer
> + */
> +_GLOBAL(__kvmppc_vcore_entry)
> +
> +	/* Write correct stack frame */
> +	mflr	r0
> +	std	r0,PPC_LR_STKOFF(r1)
> +
> +	/* Save host state to the stack */
> +	stdu	r1, -SWITCH_FRAME_SIZE(r1)
> +
> +	/* Save non-volatile registers (r14 - r31) */
> +	SAVE_NVGPRS(r1)
> +
> +	/* Save host PMU registers and load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) but not r3 or r5 */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
> +	isync
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r5, LPPACA_PMCINUSE(r3)
> +	cmpwi	r5, 0
> +	beq	31f			/* skip if not */
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r7, PACA_HOST_MMCR(r13)
> +	std	r5, PACA_HOST_MMCR + 8(r13)
> +	std	r6, PACA_HOST_MMCR + 16(r13)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r5, SPRN_PMC2
> +	mfspr	r6, SPRN_PMC3
> +	mfspr	r7, SPRN_PMC4
> +	mfspr	r8, SPRN_PMC5
> +	mfspr	r9, SPRN_PMC6
> +	stw	r3, PACA_HOST_PMC(r13)
> +	stw	r5, PACA_HOST_PMC + 4(r13)
> +	stw	r6, PACA_HOST_PMC + 8(r13)
> +	stw	r7, PACA_HOST_PMC + 12(r13)
> +	stw	r8, PACA_HOST_PMC + 16(r13)
> +	stw	r9, PACA_HOST_PMC + 20(r13)
> +31:
> +
> +	/* Save host DSCR */
> +	mfspr	r3, SPRN_DSCR
> +	std	r3, PACA_HOST_DSCR(r13)
> +
> +	/* Save host DABR */
> +	mfspr	r3, SPRN_DABR
> +	std	r3, PACA_DABR(r13)
> +
> +	DISABLE_INTERRUPTS
> +
> +	/*
> +	 * Put whatever is in the decrementer into the
> +	 * hypervisor decrementer.
> +	 */
> +	mfspr	r8,SPRN_DEC
> +	mftb	r7
> +	mtspr	SPRN_HDEC,r8

Can't we just always use HDEC on the host? That'd save us from all the
swapping here.

> +	extsw	r8,r8
> +	add	r8,r8,r7
> +	std	r8,PACA_KVM_DECEXP(r13)

Why is dec+hdec on vcpu_run decexp? What exactly does this store?

> +
> +	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +
> +	/* Jump to segment patching handler and into our guest */
> +	b	kvmppc_rmcall
> +
> +/*
> + * This is the handler in module memory. It gets jumped at from the
> + * lowmem trampoline code, so it's basically the guest exit code.
> + *
> + */
> +
> +.global kvmppc_handler_highmem
> +kvmppc_handler_highmem:
> +
> +	/*
> +	 * Register usage at this point:
> +	 *
> +	 * R1       =3D host R1
> +	 * R2       =3D host R2
> +	 * R12      =3D exit handler id
> +	 * R13      =3D PACA
> +	 * SVCPU.*  =3D guest *
> +	 *
> +	 */
> +
> +	/* R7 =3D vcpu */
> +	ld	r7, PACA_KVM_VCPU(r13)
> +
> +	/*
> +	 * Reload DEC.  HDEC interrupts were disabled when
> +	 * we reloaded the host's LPCR value.
> +	 */
> +	ld	r3, PACA_KVM_DECEXP(r13)
> +	mftb	r4
> +	subf	r4, r4, r3
> +	mtspr	SPRN_DEC, r4
> +
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r4, LPPACA_PMCINUSE(r3)
> +	cmpwi	r4, 0
> +	beq	23f			/* skip if not */
> +	lwz	r3, PACA_HOST_PMC(r13)
> +	lwz	r4, PACA_HOST_PMC + 4(r13)
> +	lwz	r5, PACA_HOST_PMC + 4(r13)

copy&paste error?

> +	lwz	r6, PACA_HOST_PMC + 4(r13)
> +	lwz	r8, PACA_HOST_PMC + 4(r13)
> +	lwz	r9, PACA_HOST_PMC + 4(r13)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r4
> +	mtspr	SPRN_PMC3, r5
> +	mtspr	SPRN_PMC4, r6
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, PACA_HOST_MMCR(r13)
> +	ld	r4, PACA_HOST_MMCR + 8(r13)
> +	ld	r5, PACA_HOST_MMCR + 16(r13)
> +	mtspr	SPRN_MMCR1, r4
> +	mtspr	SPRN_MMCRA, r5
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +23:
> +
> +	/* Restore host msr -> SRR1 */
> +	ld	r4, VCPU_HOST_MSR(r7)
> +
> +	/*
> +	 * For some interrupts, we need to call the real Linux
> +	 * handler, so it can do work for us. This has to happen
> +	 * as if the interrupt arrived from the kernel though,
> +	 * so let's fake it here where most state is restored.
> +	 *
> +	 * Call Linux for hardware interrupts/decrementer
> +	 * r3 =3D address of interrupt handler (exit reason)
> +	 */
> +	/* Note: preemption is disabled at this point */
> +
> +	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
> +	beq	1f
> +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
> +	beq	1f
> +
> +	/* Back to EE=3D1 */
> +	mtmsr	r4
> +	sync
> +	b	kvm_return_point
> +
> +1:	bl	call_linux_handler
> +
> +.global kvm_return_point
> +kvm_return_point:
> +	/* Restore non-volatile host registers (r14 - r31) */
> +	REST_NVGPRS(r1)
> +
> +	addi    r1, r1, SWITCH_FRAME_SIZE
> +	ld	r0, PPC_LR_STKOFF(r1)
> +	mtlr	r0
> +	blr
> +
> +call_linux_handler:
> +	/* Restore host IP -> SRR0 */
> +	mflr	r3
> +	mtlr	r12
> +
> +	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r7)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +	b	kvmppc_rmcall
> +
> +/*
> + * Save away FP, VMX and VSX registers.
> + * r3 =3D vcpu pointer
> +*/
> +_GLOBAL(kvmppc_save_fp)
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg =3D 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VSRS
> +	stxvd2x	reg,r6,r3
> +	reg =3D reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg =3D 0
> +	.rept	32
> +	stfd	reg,reg*8+VCPU_FPRS(r3)
> +	reg =3D reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +	mffs	fr0
> +	stfd	fr0,VCPU_FPSCR(r3)
> +
> +#ifdef CONFIG_ALTIVEC
> +BEGIN_FTR_SECTION
> +	reg =3D 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VRS
> +	stvx	reg,r6,r3
> +	reg =3D reg + 1
> +	.endr
> +	mfvscr	vr0
> +	li	r6,VCPU_VSCR
> +	stvx	vr0,r6,r3
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	mfspr	r6,SPRN_VRSAVE
> +	stw	r6,VCPU_VRSAVE(r3)
> +	mtmsrd	r9
> +	isync
> +	blr
> +
> +/*
> + * Load up FP, VMX and VSX registers
> + * r4 =3D vcpu pointer
> + */
> +	.globl	kvmppc_load_fp
> +kvmppc_load_fp:
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +	lfd	fr0,VCPU_FPSCR(r4)
> +	MTFSF_L(fr0)
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg =3D 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VSRS
> +	lxvd2x	reg,r7,r4
> +	reg =3D reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg =3D 0
> +	.rept	32
> +	lfd	reg,reg*8+VCPU_FPRS(r4)
> +	reg =3D reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +
> +#ifdef CONFIG_ALTIVEC
> +	li	r7,VCPU_VSCR
> +	lvx	vr0,r7,r4
> +	mtvscr	vr0
> +BEGIN_FTR_SECTION
> +	reg =3D 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VRS
> +	lvx	reg,r7,r4
> +	reg =3D reg + 1
> +	.endr
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	lwz	r7,VCPU_VRSAVE(r4)
> +	mtspr	SPRN_VRSAVE,r7
> +	blr
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S =
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> new file mode 100644
> index 0000000..813b01c
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -0,0 +1,663 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_rmhandlers.S and other files, which are:
> + *
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +
> +/*****************************************************************************
> + *                                                                           *
> + *        Real Mode handlers that need to be in the linear mapping           *
> + *                                                                           *
> + ****************************************************************************/
> +
> +#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
> +
> +	.globl	kvmppc_skip_interrupt
> +kvmppc_skip_interrupt:
> +	mfspr	r13,SPRN_SRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_SRR0,r13
> +	GET_SCRATCH0(r13)
> +	rfid
> +	b	.
> +
> +	.globl	kvmppc_skip_Hinterrupt
> +kvmppc_skip_Hinterrupt:
> +	mfspr	r13,SPRN_HSRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_HSRR0,r13
> +	GET_SCRATCH0(r13)
> +	hrfid
> +	b	.
> +
> +/*
> + * This trampoline brings us back to a real mode handler
> + *
> + * Input Registers:
> + *
> + * R5 =3D SRR0
> + * R6 =3D SRR1
> + * R12 =3D real-mode IP
> + * LR =3D real-mode IP
> + *
> + */
> +.global kvmppc_handler_lowmem_trampoline
> +kvmppc_handler_lowmem_trampoline:
> +	cmpwi	r12,0x500
> +	beq	1f
> +	cmpwi	r12,0x980
> +	beq	1f

What?

1) use the macros please
2) why?

> +	mtsrr0	r3
> +	mtsrr1	r4
> +	blr
> +1:	mtspr	SPRN_HSRR0,r3
> +	mtspr	SPRN_HSRR1,r4
> +	blr
> +
> +/*
> + * Call a function in real mode.
> + * Must be called with interrupts hard-disabled.
> + *
> + * Input Registers:
> + *
> + * R5 =3D function
> + * R6 =3D MSR
> + * R7 =3D scratch register
> + *
> + */
> +_GLOBAL(kvmppc_rmcall)
> +	mfmsr	r7
> +	li	r0,MSR_RI		/* clear RI in MSR */
> +	andc	r7,r7,r0
> +	mtmsrd	r7,1
> +	mtsrr0	r5
> +	mtsrr1	r6
> +	RFI
> +
> +.global kvmppc_trampoline_lowmem
> +kvmppc_trampoline_lowmem:
> +	PPC_LONG kvmppc_handler_lowmem_trampoline - _stext
> +
> +.global kvmppc_trampoline_enter
> +kvmppc_trampoline_enter:
> +	PPC_LONG kvmppc_handler_trampoline_enter - _stext
> +
> +#define ULONG_SIZE 		8
> +#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
> +
> +/*****************************************************************************
> + *                                                                           *
> + *                               Entry code                                  *
> + *                                                                           *
> + *****************************************************************************/
> +
> +.global kvmppc_handler_trampoline_enter
> +kvmppc_handler_trampoline_enter:
> +
> +	/* Required state:
> +	 *
> +	 * R4 =3D vcpu pointer
> +	 * MSR =3D ~IR|DR
> +	 * R13 =3D PACA
> +	 * R1 =3D host R1
> +	 * all other volatile GPRS =3D free
> +	 */
> +	ld	r14, VCPU_GPR(r14)(r4)
> +	ld	r15, VCPU_GPR(r15)(r4)
> +	ld	r16, VCPU_GPR(r16)(r4)
> +	ld	r17, VCPU_GPR(r17)(r4)
> +	ld	r18, VCPU_GPR(r18)(r4)
> +	ld	r19, VCPU_GPR(r19)(r4)
> +	ld	r20, VCPU_GPR(r20)(r4)
> +	ld	r21, VCPU_GPR(r21)(r4)
> +	ld	r22, VCPU_GPR(r22)(r4)
> +	ld	r23, VCPU_GPR(r23)(r4)
> +	ld	r24, VCPU_GPR(r24)(r4)
> +	ld	r25, VCPU_GPR(r25)(r4)
> +	ld	r26, VCPU_GPR(r26)(r4)
> +	ld	r27, VCPU_GPR(r27)(r4)
> +	ld	r28, VCPU_GPR(r28)(r4)
> +	ld	r29, VCPU_GPR(r29)(r4)
> +	ld	r30, VCPU_GPR(r30)(r4)
> +	ld	r31, VCPU_GPR(r31)(r4)
> +
> +	/* Load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
> +	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
> +	lwz	r6, VCPU_PMC + 8(r4)
> +	lwz	r7, VCPU_PMC + 12(r4)
> +	lwz	r8, VCPU_PMC + 16(r4)
> +	lwz	r9, VCPU_PMC + 20(r4)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r5
> +	mtspr	SPRN_PMC3, r6
> +	mtspr	SPRN_PMC4, r7
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, VCPU_MMCR(r4)
> +	ld	r5, VCPU_MMCR + 8(r4)
> +	ld	r6, VCPU_MMCR + 16(r4)
> +	mtspr	SPRN_MMCR1, r5
> +	mtspr	SPRN_MMCRA, r6
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +
> +	/* Load up FP, VMX and VSX registers */
> +	bl	kvmppc_load_fp
> +
> +	/* Switch DSCR to guest value */
> +	ld	r5, VCPU_DSCR(r4)
> +	mtspr	SPRN_DSCR, r5
> +
> +	/*
> +	 * Set the decrementer to the guest decrementer.
> +	 */
> +	ld	r8,VCPU_DEC_EXPIRES(r4)
> +	mftb	r7
> +	subf	r3,r7,r8
> +	mtspr	SPRN_DEC,r3
> +	stw	r3,VCPU_DEC(r4)
> +
> +	ld	r5, VCPU_SPRG0(r4)
> +	ld	r6, VCPU_SPRG1(r4)
> +	ld	r7, VCPU_SPRG2(r4)
> +	ld	r8, VCPU_SPRG3(r4)
> +	mtspr	SPRN_SPRG0, r5
> +	mtspr	SPRN_SPRG1, r6
> +	mtspr	SPRN_SPRG2, r7
> +	mtspr	SPRN_SPRG3, r8
> +
> +	/* Save R1 in the PACA */
> +	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
> +
> +	/* Load up DAR and DSISR */
> +	ld	r5, VCPU_DAR(r4)
> +	lwz	r6, VCPU_DSISR(r4)
> +	mtspr	SPRN_DAR, r5
> +	mtspr	SPRN_DSISR, r6
> +
> +	/* Set partition DABR */
> +	li	r5,3
> +	ld	r6,VCPU_DABR(r4)
> +	mtspr	SPRN_DABRX,r5
> +	mtspr	SPRN_DABR,r6
> +
> +	/* Restore AMR and UAMOR, set AMOR to all 1s */
> +	ld	r5,VCPU_AMR(r4)
> +	ld	r6,VCPU_UAMOR(r4)
> +	li	r7,-1
> +	mtspr	SPRN_AMR,r5
> +	mtspr	SPRN_UAMOR,r6
> +	mtspr	SPRN_AMOR,r7
> +
> +	/* Clear out SLB */
> +	li	r6,0
> +	slbmte	r6,r6
> +	slbia
> +	ptesync
> +
> +	/* Switch to guest partition. */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	ld	r6,KVM_SDR1(r9)
> +	lwz	r7,KVM_LPID(r9)
> +	li	r0,0x3ff		/* switch to reserved LPID */
> +	mtspr	SPRN_LPID,r0
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	ld	r8,VCPU_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* Check if HDEC expires soon */
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,10
> +	li	r12,0x980

define

> +	mr	r9,r4
> +	blt	hdec_soon

Is it faster to do the check than to skip the check and take the odds?
Also, maybe we should rather do the check in preemptible code that could
just directly pass the time slice on.

> +
> +	/*
> +	 * Invalidate the TLB if we could possibly have stale TLB
> +	 * entries for this partition on this core due to the use
> +	 * of tlbiel.
> +	 */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	lwz	r5,VCPU_VCPUID(r4)
> +	lhz	r6,PACAPACAINDEX(r13)
> +	lhz	r8,VCPU_LAST_CPU(r4)
> +	sldi	r7,r6,1			/* see if this is the same vcpu */
> +	add	r7,r7,r9		/* as last ran on this pcpu */
> +	lhz	r0,KVM_LAST_VCPU(r7)
> +	cmpw	r6,r8			/* on the same cpu core as last time? */
> +	bne	3f
> +	cmpw	r0,r5			/* same vcpu as this core last ran? */
> +	beq	1f
> +3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
> +	sth	r5,KVM_LAST_VCPU(r7)
> +	li	r6,128
> +	mtctr	r6
> +	li	r7,0x800		/* IS field = 0b10 */
> +	ptesync
> +2:	tlbiel	r7
> +	addi	r7,r7,0x1000
> +	bdnz	2b
> +	ptesync
> +1:
> +
> +	/* Save purr/spurr */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	std	r5,PACA_HOST_PURR(r13)
> +	std	r6,PACA_HOST_SPURR(r13)
> +	ld	r7,VCPU_PURR(r4)
> +	ld	r8,VCPU_SPURR(r4)
> +	mtspr	SPRN_PURR,r7
> +	mtspr	SPRN_SPURR,r8
> +
> +	/* Load up guest SLB entries */
> +	lwz	r5,VCPU_SLB_MAX(r4)
> +	cmpwi	r5,0
> +	beq	9f
> +	mtctr	r5
> +	addi	r6,r4,VCPU_SLB
> +1:	ld	r8,VCPU_SLB_E(r6)
> +	ld	r9,VCPU_SLB_V(r6)
> +	slbmte	r9,r8
> +	addi	r6,r6,VCPU_SLB_SIZE
> +	bdnz	1b
> +9:
> +
> +	/* Restore state of CTRL run bit; assume 1 on entry */
> +	lwz	r5,VCPU_CTRL(r4)
> +	andi.	r5,r5,1
> +	bne	4f
> +	mfspr	r6,SPRN_CTRLF
> +	clrrdi	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	ld	r6, VCPU_CTR(r4)
> +	lwz	r7, VCPU_XER(r4)
> +
> +	mtctr	r6
> +	mtxer	r7
> +
> +	/* Move SRR0 and SRR1 into the respective regs */
> +	ld	r6, VCPU_SRR0(r4)
> +	ld	r7, VCPU_SRR1(r4)
> +	mtspr	SPRN_SRR0, r6
> +	mtspr	SPRN_SRR1, r7
> +
> +	ld	r10, VCPU_PC(r4)
> +
> +	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
> +	rldicl	r11, r11, 63 - MSR_HV_LG, 1
> +	rotldi	r11, r11, 1 + MSR_HV_LG
> +	ori	r11, r11, MSR_ME
> +
> +fast_guest_return:
> +	mtspr	SPRN_HSRR0,r10
> +	mtspr	SPRN_HSRR1,r11
> +
> +	/* Activate guest mode, so faults get handled by KVM */
> +	li	r9, KVM_GUEST_MODE_GUEST
> +	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	/* Enter guest */
> +
> +	ld	r5, VCPU_LR(r4)
> +	lwz	r6, VCPU_CR(r4)
> +	mtlr	r5
> +	mtcr	r6
> +
> +	ld	r0, VCPU_GPR(r0)(r4)
> +	ld	r1, VCPU_GPR(r1)(r4)
> +	ld	r2, VCPU_GPR(r2)(r4)
> +	ld	r3, VCPU_GPR(r3)(r4)
> +	ld	r5, VCPU_GPR(r5)(r4)
> +	ld	r6, VCPU_GPR(r6)(r4)
> +	ld	r7, VCPU_GPR(r7)(r4)
> +	ld	r8, VCPU_GPR(r8)(r4)
> +	ld	r9, VCPU_GPR(r9)(r4)
> +	ld	r10, VCPU_GPR(r10)(r4)
> +	ld	r11, VCPU_GPR(r11)(r4)
> +	ld	r12, VCPU_GPR(r12)(r4)
> +	ld	r13, VCPU_GPR(r13)(r4)
> +
> +	ld	r4, VCPU_GPR(r4)(r4)
> +
> +	hrfid
> +	b	.
> +kvmppc_handler_trampoline_enter_end:
> +
> +
> +/*****************************************************************************
> + *                                                                           *
> + *                               Exit code                                   *
> + *                                                                           *
> + *****************************************************************************/
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +	.globl	kvmppc_interrupt
> +kvmppc_interrupt:
> +	/*
> +	 * Register contents:
> +	 * R12		= interrupt vector
> +	 * R13		= PACA
> +	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
> +	 * guest R13 saved in SPRN_SCRATCH0
> +	 */
> +	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
> +	std	r9, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	ld	r9, PACA_KVM_VCPU(r13)
> +
> +	/* Save registers */
> +
> +	std	r0, VCPU_GPR(r0)(r9)
> +	std	r1, VCPU_GPR(r1)(r9)
> +	std	r2, VCPU_GPR(r2)(r9)
> +	std	r3, VCPU_GPR(r3)(r9)
> +	std	r4, VCPU_GPR(r4)(r9)
> +	std	r5, VCPU_GPR(r5)(r9)
> +	std	r6, VCPU_GPR(r6)(r9)
> +	std	r7, VCPU_GPR(r7)(r9)
> +	std	r8, VCPU_GPR(r8)(r9)
> +	ld	r0, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	std	r0, VCPU_GPR(r9)(r9)
> +	std	r10, VCPU_GPR(r10)(r9)
> +	std	r11, VCPU_GPR(r11)(r9)
> +	ld	r3, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
> +	lwz	r4, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
> +	std	r3, VCPU_GPR(r12)(r9)
> +	stw	r4, VCPU_CR(r9)
> +
> +	/* Restore R1/R2 so we can handle faults */
> +	ld	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
> +	ld	r2, PACATOC(r13)
> +
> +	mfspr	r10, SPRN_SRR0
> +	mfspr	r11, SPRN_SRR1
> +	std	r10, VCPU_SRR0(r9)
> +	std	r11, VCPU_SRR1(r9)
> +	andi.	r0, r12, 2		/* need to read HSRR0/1? */
> +	beq	1f
> +	mfspr	r10, SPRN_HSRR0
> +	mfspr	r11, SPRN_HSRR1
> +	clrrdi	r12, r12, 2
> +1:	std	r10, VCPU_PC(r9)
> +	std	r11, VCPU_MSR(r9)
> +
> +	GET_SCRATCH0(r3)
> +	mflr	r4
> +	std	r3, VCPU_GPR(r13)(r9)
> +	std	r4, VCPU_LR(r9)
> +
> +	/* Unset guest mode */
> +	li	r0, KVM_GUEST_MODE_NONE
> +	stb	r0, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	stw	r12,VCPU_TRAP(r9)
> +
> +	/* See if this is a leftover HDEC interrupt */
> +	cmpwi	r12,0x980

define

> +	bne	2f
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,0
> +	bge	ignore_hdec
> +2:
> +
> +	/* Check for mediated interrupts (could be done earlier really ...) */
> +	cmpwi	r12,0x500

define

> +	bne+	1f
> +	ld	r5,VCPU_LPCR(r9)
> +	andi.	r0,r11,MSR_EE
> +	beq	1f
> +	andi.	r0,r5,LPCR_MER
> +	bne	bounce_ext_interrupt

So there's no need for the external check that directly goes into the
Linux handler code on full-fledged exits?

> +1:
> +
> +	/* Save DEC */
> +	mfspr	r5,SPRN_DEC
> +	mftb	r6
> +	extsw	r5,r5
> +	add	r5,r5,r6
> +	std	r5,VCPU_DEC_EXPIRES(r9)
> +
> +	/* Save HEIR (in last_inst) if this is a HEI (e40) */
> +	li	r3,-1
> +	cmpwi	r12,0xe40
> +	bne	11f
> +	mfspr	r3,SPRN_HEIR
> +11:	stw	r3,VCPU_LAST_INST(r9)
> +=09
> +	/* Save more register state  */
> +	mfxer	r5
> +	mfdar	r6
> +	mfdsisr	r7
> +	mfctr	r8
> +
> +	stw	r5, VCPU_XER(r9)
> +	std	r6, VCPU_DAR(r9)
> +	stw	r7, VCPU_DSISR(r9)
> +	std	r8, VCPU_CTR(r9)
> +	cmpwi	r12,0xe00		/* grab HDAR & HDSISR if HDSI */
> +	beq	6f
> +7:	std	r6, VCPU_FAULT_DAR(r9)
> +	stw	r7, VCPU_FAULT_DSISR(r9)
> +
> +	/* Save guest CTRL register, set runlatch to 1 */
> +	mfspr	r6,SPRN_CTRLF
> +	stw	r6,VCPU_CTRL(r9)
> +	andi.	r0,r6,1
> +	bne	4f
> +	ori	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	/* Read the guest SLB and save it away */
> +	li	r6,0
> +	addi	r7,r9,VCPU_SLB
> +	li	r5,0
> +1:	slbmfee	r8,r6
> +	andis.	r0,r8,SLB_ESID_V@h
> +	beq	2f
> +	add	r8,r8,r6		/* put index in */
> +	slbmfev	r3,r6
> +	std	r8,VCPU_SLB_E(r7)
> +	std	r3,VCPU_SLB_V(r7)
> +	addi	r7,r7,VCPU_SLB_SIZE
> +	addi	r5,r5,1
> +2:	addi	r6,r6,1
> +	cmpwi	r6,32

I don't like how the 32 is hardcoded here. Better create a define for it
and use the same in the init code.
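E.g. something like (name is only a suggestion):

#define KVMPPC_NR_SLBS	32	/* POWER7 SLB size */

and then use that both where kvmppc_mmu_book3s_hv_init() sets slb_nr and
in this SLB save loop.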

> +	blt	1b
> +	stw	r5,VCPU_SLB_MAX(r9)
> +
> +	/*
> +	 * Save the guest PURR/SPURR
> +	 */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	ld	r7,VCPU_PURR(r9)
> +	ld	r8,VCPU_SPURR(r9)
> +	std	r5,VCPU_PURR(r9)
> +	std	r6,VCPU_SPURR(r9)
> +	subf	r5,r7,r5
> +	subf	r6,r8,r6
> +
> +	/*
> +	 * Restore host PURR/SPURR and add guest times
> +	 * so that the time in the guest gets accounted.
> +	 */
> +	ld	r3,PACA_HOST_PURR(r13)
> +	ld	r4,PACA_HOST_SPURR(r13)
> +	add	r3,r3,r5
> +	add	r4,r4,r6
> +	mtspr	SPRN_PURR,r3
> +	mtspr	SPRN_SPURR,r4
> +
> +	/* Clear out SLB */
> +	li	r5,0
> +	slbmte	r5,r5
> +	slbia
> +	ptesync
> +
> +hdec_soon:
> +	/* Switch back to host partition */
> +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> +	ld	r6,KVM_HOST_SDR1(r4)
> +	lwz	r7,KVM_HOST_LPID(r4)
> +	li	r8,0x3ff		/* switch to reserved LPID */

Is it reserved by ISA? Either way, hard-coding the constant without a
name is not nice :).
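Something like (name made up):

#define LPID_RSVD	0x3ff	/* reserved LPID used while switching SDR1 */

would at least document what the magic value means.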

> +	mtspr	SPRN_LPID,r8
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	lis	r8,0x7fff

INT_MAX@h might be more readable.

> +	mtspr	SPRN_HDEC,r8
> +
> +	ld	r8,KVM_HOST_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* load host SLB entries */
> +	ld	r8,PACA_SLBSHADOWPTR(r13)
> +
> +	.rept	SLB_NUM_BOLTED
> +	ld	r5,SLBSHADOW_SAVEAREA(r8)
> +	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
> +	andis.	r7,r5,SLB_ESID_V@h
> +	beq	1f
> +	slbmte	r6,r5
> +1:	addi	r8,r8,16
> +	.endr
> +
> +	/* Save and reset AMR and UAMOR before turning on the MMU */
> +	mfspr	r5,SPRN_AMR
> +	mfspr	r6,SPRN_UAMOR
> +	std	r5,VCPU_AMR(r9)
> +	std	r6,VCPU_UAMOR(r9)
> +	li	r6,0
> +	mtspr	SPRN_AMR,r6
> +
> +	/* Restore host DABR and DABRX */
> +	ld	r5,PACA_DABR(r13)
> +	li	r6,7
> +	mtspr	SPRN_DABR,r5
> +	mtspr	SPRN_DABRX,r6
> +
> +	/* Switch DSCR back to host value */
> +	mfspr	r8, SPRN_DSCR
> +	ld	r7, PACA_HOST_DSCR(r13)
> +	std	r8, VCPU_DSCR(r7)
> +	mtspr	SPRN_DSCR, r7
> +
> +	/* Save non-volatile GPRs */
> +	std	r14, VCPU_GPR(r14)(r9)
> +	std	r15, VCPU_GPR(r15)(r9)
> +	std	r16, VCPU_GPR(r16)(r9)
> +	std	r17, VCPU_GPR(r17)(r9)
> +	std	r18, VCPU_GPR(r18)(r9)
> +	std	r19, VCPU_GPR(r19)(r9)
> +	std	r20, VCPU_GPR(r20)(r9)
> +	std	r21, VCPU_GPR(r21)(r9)
> +	std	r22, VCPU_GPR(r22)(r9)
> +	std	r23, VCPU_GPR(r23)(r9)
> +	std	r24, VCPU_GPR(r24)(r9)
> +	std	r25, VCPU_GPR(r25)(r9)
> +	std	r26, VCPU_GPR(r26)(r9)
> +	std	r27, VCPU_GPR(r27)(r9)
> +	std	r28, VCPU_GPR(r28)(r9)
> +	std	r29, VCPU_GPR(r29)(r9)
> +	std	r30, VCPU_GPR(r30)(r9)
> +	std	r31, VCPU_GPR(r31)(r9)
> +
> +	/* Save SPRGs */
> +	mfspr	r3, SPRN_SPRG0
> +	mfspr	r4, SPRN_SPRG1
> +	mfspr	r5, SPRN_SPRG2
> +	mfspr	r6, SPRN_SPRG3
> +	std	r3, VCPU_SPRG0(r9)
> +	std	r4, VCPU_SPRG1(r9)
> +	std	r5, VCPU_SPRG2(r9)
> +	std	r6, VCPU_SPRG3(r9)
> +
> +	/* Save PMU registers */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r4, VCPU_MMCR(r9)
> +	std	r5, VCPU_MMCR + 8(r9)
> +	std	r6, VCPU_MMCR + 16(r9)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r4, SPRN_PMC2
> +	mfspr	r5, SPRN_PMC3
> +	mfspr	r6, SPRN_PMC4
> +	mfspr	r7, SPRN_PMC5
> +	mfspr	r8, SPRN_PMC6
> +	stw	r3, VCPU_PMC(r9)
> +	stw	r4, VCPU_PMC + 4(r9)
> +	stw	r5, VCPU_PMC + 8(r9)
> +	stw	r6, VCPU_PMC + 12(r9)
> +	stw	r7, VCPU_PMC + 16(r9)
> +	stw	r8, VCPU_PMC + 20(r9)
> +22:
> +	/* save FP state */
> +	mr	r3, r9
> +	bl	.kvmppc_save_fp
> +
> +	/* RFI into the highmem handler */
> +	mfmsr	r7
> +	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
> +	mtsrr1	r7
> +	/* Load highmem handler address */
> +	ld	r8, VCPU_HIGHMEM_HANDLER(r3)
> +	mtsrr0	r8
> +
> +	RFI
> +kvmppc_handler_trampoline_exit_end:
> +
> +6:	mfspr	r6,SPRN_HDAR
> +	mfspr	r7,SPRN_HDSISR
> +	b	7b
> +
> +ignore_hdec:
> +	mr	r4,r9
> +	b	fast_guest_return
> +
> +bounce_ext_interrupt:
> +	mr	r4,r9
> +	mtspr	SPRN_SRR0,r10
> +	mtspr	SPRN_SRR1,r11
> +	li	r10,0x500
> +	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
> +	b	fast_guest_return
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 4b54148..e5e3f92 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,8 +38,12 @@
>=20
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> +#ifndef CONFIG_KVM_BOOK3S_64_HV
> +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> 	       !!(v->arch.pending_exceptions);
> +#else
> +	return 1;

So what happens if the guest sets MSR_WE? It just stays in guest context
idling? That'd be pretty bad for scheduling on the host.
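If PAPR guests are expected to idle via H_CEDE rather than MSR_WE, the
blocking could presumably be done at hcall time instead, e.g. (untested
sketch, inside the BOOK3S_INTERRUPT_SYSCALL case before punting to
userspace):

	if (kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
		kvmppc_vcpu_block(vcpu);
		r = RESUME_GUEST;
		break;
	}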

> +#endif
> }
>=20
> int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> @@ -52,7 +56,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> 	unsigned long __maybe_unused param4 =3D kvmppc_get_gpr(vcpu, 6);
> 	unsigned long r2 =3D 0;
>=20
> -	if (!(vcpu->arch.shared->msr & MSR_SF)) {
> +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
> 		/* 32 bit mode */
> 		param1 &=3D 0xffffffff;
> 		param2 &=3D 0xffffffff;
> @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_IRQ_LEVEL:
> 	case KVM_CAP_ENABLE_CAP:
> 	case KVM_CAP_PPC_OSI:
> +#ifndef CONFIG_KVM_BOOK3S_64_HV

You also don't do OSI on HV :).

> 	case KVM_CAP_PPC_GET_PVINFO:
> 		r =3D 1;
> 		break;
> 	case KVM_CAP_COALESCED_MMIO:
> 		r =3D KVM_COALESCED_MMIO_PAGE_OFFSET;
> 		break;
> +#endif
> 	default:
> 		r =3D 0;
> 		break;
> @@ -286,6 +292,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
> 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
> 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
> +	vcpu->arch.dec_expires = ~(u64)0;
>
> 	return 0;
> }
> @@ -474,6 +481,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
> 		for (i =3D 0; i < 32; i++)
> 			kvmppc_set_gpr(vcpu, i, gprs[i]);
> 		vcpu->arch.osi_needed = 0;
> +	} else if (vcpu->arch.hcall_needed) {
> +		int i;
> +
> +		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
> +		for (i = 0; i < 6; ++i)
> +			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
> +		vcpu->arch.hcall_needed = 0;
> 	}
>
> 	kvmppc_core_deliver_interrupts(vcpu);
> @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> 	if (waitqueue_active(&vcpu->wq)) {
> 		wake_up_interruptible(&vcpu->wq);
> 		vcpu->stat.halt_wakeup++;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	} else if (vcpu->cpu != -1) {
> +		smp_send_reschedule(vcpu->cpu);

Shouldn't this be done for non-HV too? The only reason we don't do it
yet is because we don't do SMP, no?
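I.e. just drop the #ifdef:

	if (waitqueue_active(&vcpu->wq)) {
		wake_up_interruptible(&vcpu->wq);
		vcpu->stat.halt_wakeup++;
	} else if (vcpu->cpu != -1) {
		smp_send_reschedule(vcpu->cpu);
	}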

> +#endif
> 	}
> -

eh... :)

> 	return 0;
> }
>=20
> diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
> index d62a14b..e5c99b8 100644
> --- a/arch/powerpc/kvm/trace.h
> +++ b/arch/powerpc/kvm/trace.h
> @@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
>  *                         Book3S trace points                           *
>  *************************************************************************/
>
> -#ifdef CONFIG_PPC_BOOK3S
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
>=20
> TRACE_EVENT(kvm_book3s_exit,
> 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..a4447ce 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_NMI              16
> #define KVM_EXIT_INTERNAL_ERROR   17
> #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_PAPR_HCALL	  19
>=20
> /* For KVM_EXIT_INTERNAL_ERROR */
> #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,11 @@ struct kvm_run {
> 		struct {
> 			__u64 gprs[32];
> 		} osi;
> +		struct {
> +			__u64 nr;
> +			__u64 ret;
> +			__u64 args[9];
> +		} papr_hcall;

This needs some information in the documentation.
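Roughly along these lines for the api.txt documentation (wording is just
a suggestion):

		/* KVM_EXIT_PAPR_HCALL */
		struct {
			__u64 nr;
			__u64 ret;
			__u64 args[9];
		} papr_hcall;

This is used on 64-bit Book3S for PAPR guests: 'nr' is the hypercall
number from r3 and 'args' holds the arguments from r4-r12; userspace
handles the hypercall and puts the return code into 'ret' (and any
return arguments into 'args') before re-entering the vcpu.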


Alex

PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list, so
people interested in kvm on ppc get your patches as well :).

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
@ 2011-05-15 21:58     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-15 21:58 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc


On 11.05.2011, at 12:44, Paul Mackerras wrote:

> This adds support for KVM running on 64-bit Book 3S processors,
> specifically POWER7, in hypervisor mode.  Using hypervisor mode means
> that the guest can use the processor's supervisor mode.  That means
> that the guest can execute privileged instructions and access privileged
> registers itself without trapping to the host.  This gives excellent
> performance, but does mean that KVM cannot emulate a processor
> architecture other than the one that the hardware implements.
> 
> This code assumes that the guest is running paravirtualized using the
> PAPR (Power Architecture Platform Requirements) interface, which is the
> interface that IBM's PowerVM hypervisor uses.  That means that existing
> Linux distributions that run on IBM pSeries machines will also run
> under KVM without modification.  In order to communicate the PAPR
> hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
> to include/linux/kvm.h.
> 
> Currently the choice between book3s_hv support and book3s_pr support
> (i.e. the existing code, which runs the guest in user mode) has to be
> made at kernel configuration time, so a given kernel binary can only
> do one or the other.
> 
> This new book3s_hv code doesn't support MMIO emulation at present.
> Since we are running paravirtualized guests, this isn't a serious
> restriction.
> 
> With the guest running in supervisor mode, most exceptions go straight
> to the guest.  We will never get data or instruction storage or segment
> interrupts, alignment interrupts, decrementer interrupts, program
> interrupts, single-step interrupts, etc., coming to the hypervisor from
> the guest.  Therefore this introduces a new KVMTEST_NONHV macro for the
> exception entry path so that we don't have to do the KVM test on entry
> to those exception handlers.
> 
> We do however get hypervisor decrementer, hypervisor data storage,
> hypervisor instruction storage, and hypervisor emulation assist
> interrupts, so we have to handle those.
> 
> In hypervisor mode, real-mode accesses can access all of RAM, not just
> a limited amount.  Therefore we put all the guest state in the vcpu.arch
> and use the shadow_vcpu in the PACA only for temporary scratch space.
> We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
> anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
> 
> The POWER7 processor has a restriction that all threads in a core have
> to be in the same partition.  MMU-on kernel code counts as a partition
> (partition 0), so we have to do a partition switch on every entry to and
> exit from the guest.  At present we require the host and guest to run
> in single-thread mode because of this hardware restriction.
> 
> This code allocates a hashed page table for the guest and initializes
> it with HPTEs for the guest's Virtual Real Memory Area (VRMA).  We
> require that the guest memory is allocated using 16MB huge pages, in
> order to simplify the low-level memory management.  This also means that
> we can get away without tracking paging activity in the host for now,
> since huge pages can't be paged or swapped.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/exception-64s.h  |   27 +-
> arch/powerpc/include/asm/kvm_asm.h        |    4 +
> arch/powerpc/include/asm/kvm_book3s.h     |  148 ++++++-
> arch/powerpc/include/asm/kvm_book3s_asm.h |    4 +-
> arch/powerpc/include/asm/kvm_booke.h      |    4 +
> arch/powerpc/include/asm/kvm_host.h       |   60 +++-
> arch/powerpc/include/asm/kvm_ppc.h        |    6 +
> arch/powerpc/include/asm/mmu-hash64.h     |   10 +-
> arch/powerpc/include/asm/paca.h           |   10 +
> arch/powerpc/include/asm/reg.h            |    4 +
> arch/powerpc/kernel/asm-offsets.c         |   95 ++++-
> arch/powerpc/kernel/exceptions-64s.S      |   60 ++--
> arch/powerpc/kvm/Kconfig                  |   40 ++-
> arch/powerpc/kvm/Makefile                 |   16 +-
> arch/powerpc/kvm/book3s_64_mmu_hv.c       |  258 +++++++++++
> arch/powerpc/kvm/book3s_hv.c              |  413 ++++++++++++++++++
> arch/powerpc/kvm/book3s_hv_interrupts.S   |  326 ++++++++++++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  663 +++++++++++++++++++++++++++++
> arch/powerpc/kvm/powerpc.c                |   23 +-
> arch/powerpc/kvm/trace.h                  |    2 +-
> include/linux/kvm.h                       |    6 +
> 21 files changed, 2094 insertions(+), 85 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_64_mmu_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv.c
> create mode 100644 arch/powerpc/kvm/book3s_hv_interrupts.S
> create mode 100644 arch/powerpc/kvm/book3s_hv_rmhandlers.S
> 
> diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
> index 2a770d8..d32e1ef 100644
> --- a/arch/powerpc/include/asm/exception-64s.h
> +++ b/arch/powerpc/include/asm/exception-64s.h
> @@ -65,14 +65,14 @@
> 	GET_PACA(r13);							\
> 	std	r9,area+EX_R9(r13);	/* save r9 - r12 */		\
> 	std	r10,area+EX_R10(r13);					\
> -	mfcr	r9;							\
> -	extra(vec);							\
> -	std	r11,area+EX_R11(r13);					\
> -	std	r12,area+EX_R12(r13);					\
> 	BEGIN_FTR_SECTION_NESTED(66);					\
> 	mfspr	r10,SPRN_CFAR;						\
> 	std	r10,area+EX_CFAR(r13);					\
> 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		\
> +	mfcr	r9;							\
> +	extra(vec);							\
> +	std	r11,area+EX_R11(r13);					\
> +	std	r12,area+EX_R12(r13);					\


I don't really understand that change :).

> 	GET_SCRATCH0(r10);						\
> 	std	r10,area+EX_R13(r13)
> #define EXCEPTION_PROLOG_1(area, extra, vec)				\
> @@ -134,6 +134,17 @@ do_kvm_##n:								\
> #define KVM_HANDLER_SKIP(area, h, n)
> #endif
> 
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

I really liked how you called the .c file _pr - why call it NONHV now?

> +#define KVMTEST_NONHV(n)		__KVMTEST(n)
> +#define KVM_HANDLER_NONHV(area, h, n)	__KVM_HANDLER(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
> +
> +#else
> +#define KVMTEST_NONHV(n)
> +#define KVM_HANDLER_NONHV(area, h, n)
> +#define KVM_HANDLER_NONHV_SKIP(area, h, n)
> +#endif
> +
> #define NOTEST(n)
> 
> /*
> @@ -210,7 +221,7 @@ label##_pSeries:					\
> 	HMT_MEDIUM;					\
> 	SET_SCRATCH0(r13);		/* save r13 */		\
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
> -				 EXC_STD, KVMTEST, vec)
> +				 EXC_STD, KVMTEST_NONHV, vec)
> 
> #define STD_EXCEPTION_HV(loc, vec, label)		\
> 	. = loc;					\
> @@ -227,8 +238,8 @@ label##_hv:						\
> 	beq	masked_##h##interrupt
> #define _SOFTEN_TEST(h)	__SOFTEN_TEST(h)
> 
> -#define SOFTEN_TEST(vec)						\
> -	KVMTEST(vec);							\
> +#define SOFTEN_TEST_NONHV(vec)						\
> +	KVMTEST_NONHV(vec);						\
> 	_SOFTEN_TEST(EXC_STD)
> 
> #define SOFTEN_TEST_HV(vec)						\
> @@ -248,7 +259,7 @@ label##_hv:						\
> 	.globl label##_pSeries;						\
> label##_pSeries:							\
> 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
> -				    EXC_STD, SOFTEN_TEST)
> +				    EXC_STD, SOFTEN_TEST_NONHV)
> 
> #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
> 	. = loc;							\
> diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
> index 0951b17..7b1f0e0 100644
> --- a/arch/powerpc/include/asm/kvm_asm.h
> +++ b/arch/powerpc/include/asm/kvm_asm.h
> @@ -64,8 +64,12 @@
> #define BOOK3S_INTERRUPT_PROGRAM	0x700
> #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
> #define BOOK3S_INTERRUPT_DECREMENTER	0x900
> +#define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
> #define BOOK3S_INTERRUPT_SYSCALL	0xc00
> #define BOOK3S_INTERRUPT_TRACE		0xd00
> +#define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
> +#define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
> +#define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
> #define BOOK3S_INTERRUPT_PERFMON	0xf00
> #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
> #define BOOK3S_INTERRUPT_VSX		0xf40
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 12829bb..5b76073 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -117,6 +117,7 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
> extern void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr);
> extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
> extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
> +extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
> extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
> extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
> extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
> @@ -128,10 +129,12 @@ extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
> extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
> extern int kvmppc_mmu_hpte_sysinit(void);
> extern void kvmppc_mmu_hpte_sysexit(void);
> +extern int kvmppc_mmu_hv_init(void);
> 
> extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
> extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
> extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
> +extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
> extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
> 			   bool upper, u32 val);
> extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
> @@ -152,6 +155,19 @@ static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
> 	return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
> }
> 
> +extern void kvm_return_point(void);
> +
> +/* Also add subarch specific defines */
> +
> +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> +#include <asm/kvm_book3s_32.h>
> +#endif
> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#include <asm/kvm_book3s_64.h>
> +#endif
> +
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +
> #define vcpu_guest_state(vcpu)	((vcpu)->arch.shared)
> 
> static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
> @@ -168,16 +184,6 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> 		vcpu_guest_state(vcpu)->int_pending = 0;
> }
> 
> -static inline ulong dsisr(void)
> -{
> -	ulong r;
> -	asm ( "mfdsisr %0 " : "=r" (r) );
> -	return r;
> -}
> -
> -extern void kvm_return_point(void);
> -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu);
> -
> static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
> {
> 	if ( num < 14 ) {
> @@ -265,6 +271,11 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> 	return to_svcpu(vcpu)->fault_dar;
> }
> 
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> +
> static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> {
> 	ulong crit_raw = vcpu_guest_state(vcpu)->critical;
> @@ -284,6 +295,115 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> 
> 	return crit;
> }
> +#else /* CONFIG_KVM_BOOK3S_NONHV */
> +
> +#define vcpu_guest_state(vcpu)	(&(vcpu)->arch)
> +
> +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
> +{
> +	return 0;
> +}
> +
> +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
> +			unsigned long pending_now, unsigned long old_pending)
> +{
> +	/* Recalculate LPCR:MER based on the presence of
> +	 * a pending external interrupt
> +	 */
> +	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions) ||
> +	    test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions))

Wasn't pending_now pending_exceptions?

> +		vcpu->arch.lpcr |= LPCR_MER;
> +	else
> +		vcpu->arch.lpcr &= ~((u64)LPCR_MER);
> +}
> +
> +static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
> +{
> +	vcpu->arch.gpr[num] = val;
> +}
> +
> +static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
> +{
> +	return vcpu->arch.gpr[num];
> +}
> +
> +static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.cr = val;
> +}
> +
> +static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.cr;
> +}
> +
> +static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
> +{
> +	vcpu->arch.xer = val;
> +}
> +
> +static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.xer;
> +}
> +
> +static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.ctr = val;
> +}
> +
> +static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.ctr;
> +}
> +
> +static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.lr = val;
> +}
> +
> +static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.lr;
> +}
> +
> +static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val)
> +{
> +	vcpu->arch.pc = val;
> +}
> +
> +static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.pc;
> +}
> +
> +static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
> +{
> +	ulong pc = kvmppc_get_pc(vcpu);
> +
> +	/* Load the instruction manually if it failed to do so in the
> +	 * exit path */
> +	if (vcpu->arch.last_inst = KVM_INST_FETCH_FAILED)
> +		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
> +
> +	return vcpu->arch.last_inst;
> +}
> +
> +static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.fault_dar;
> +}
> +
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.msr;
> +}
> +
> +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> +{
> +	return false;
> +}
> +#endif
> 
> /* Magic register values loaded into r3 and r4 before the 'sc' assembly
>  * instruction for the OSI hypercalls */
> @@ -292,12 +412,4 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
> 
> #define INS_DCBZ			0x7c0007ec
> 
> -/* Also add subarch specific defines */
> -
> -#ifdef CONFIG_PPC_BOOK3S_32
> -#include <asm/kvm_book3s_32.h>
> -#else
> -#include <asm/kvm_book3s_64.h>
> -#endif
> -
> #endif /* __ASM_KVM_BOOK3S_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index d5a8a38..d7279f5 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -61,6 +61,7 @@ kvmppc_resume_\intno:
> #else  /*__ASSEMBLY__ */
> 
> struct kvmppc_book3s_shadow_vcpu {
> +#ifdef CONFIG_KVM_BOOK3S_NONHV

Do you really want any shadow_vcpu in the paca at all?

> 	ulong gpr[14];
> 	u32 cr;
> 	u32 xer;
> @@ -72,6 +73,7 @@ struct kvmppc_book3s_shadow_vcpu {
> 	ulong pc;
> 	ulong shadow_srr1;
> 	ulong fault_dar;
> +#endif
> 
> 	ulong host_r1;
> 	ulong host_r2;
> @@ -84,7 +86,7 @@ struct kvmppc_book3s_shadow_vcpu {
> #ifdef CONFIG_PPC_BOOK3S_32
> 	u32     sr[16];			/* Guest SRs */
> #endif
> -#ifdef CONFIG_PPC_BOOK3S_64
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_NONHV)
> 	u8 slb_max;			/* highest used guest slb entry */
> 	struct  {
> 		u64     esid;
> diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
> index 9c9ba3d..a90e091 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -93,4 +93,8 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
> 	return vcpu->arch.fault_dear;
> }
> 
> +static inline ulong kvmppc_get_msr(struct kvm_vcpu *vcpu)
> +{
> +	return vcpu->arch.shared->msr;
> +}
> #endif /* __ASM_KVM_BOOKE_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 3ebe51b..ec62365 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -33,7 +33,9 @@
> /* memory slots that does not exposed to userspace */
> #define KVM_PRIVATE_MEM_SLOTS 4
> 
> +#ifdef CONFIG_KVM_MMIO
> #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
> +#endif
> 
> /* We don't currently support large pages. */
> #define KVM_HPAGE_GFN_SHIFT(x)	0
> @@ -133,7 +135,24 @@ struct kvmppc_exit_timing {
> 	};
> };
> 
> +struct kvmppc_pginfo {
> +	unsigned long pfn;
> +	atomic_t refcnt;
> +};
> +
> struct kvm_arch {
> +	unsigned long hpt_virt;
> +	unsigned long ram_npages;
> +	unsigned long ram_psize;
> +	unsigned long ram_porder;
> +	struct kvmppc_pginfo *ram_pginfo;
> +	unsigned int lpid;
> +	unsigned int host_lpid;
> +	unsigned long host_lpcr;
> +	unsigned long sdr1;
> +	unsigned long host_sdr1;
> +	int tlbie_lock;
> +	unsigned short last_vcpu[NR_CPUS];

This should all be #ifdef CONFIG_KVM_BOOK3S_HV
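I.e. something like (using the config symbol the rest of the patch
already uses):

struct kvm_arch {
#ifdef CONFIG_KVM_BOOK3S_64_HV
	unsigned long hpt_virt;
	unsigned long ram_npages;
	unsigned long ram_psize;
	unsigned long ram_porder;
	struct kvmppc_pginfo *ram_pginfo;
	unsigned int lpid;
	unsigned int host_lpid;
	unsigned long host_lpcr;
	unsigned long sdr1;
	unsigned long host_sdr1;
	int tlbie_lock;
	unsigned short last_vcpu[NR_CPUS];
#endif /* CONFIG_KVM_BOOK3S_64_HV */
};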

> };
> 
> struct kvmppc_pte {
> @@ -190,7 +209,7 @@ struct kvm_vcpu_arch {
> 	ulong rmcall;
> 	ulong host_paca_phys;
> 	struct kvmppc_slb slb[64];
> -	int slb_max;		/* # valid entries in slb[] */
> +	int slb_max;		/* 1 + index of last valid entry in slb[] */
> 	int slb_nr;		/* total number of entries in SLB */
> 	struct kvmppc_mmu mmu;
> #endif
> @@ -204,9 +223,10 @@ struct kvm_vcpu_arch {
> 	vector128 vr[32];
> 	vector128 vscr;
> #endif
> +	u32 vrsave;
> 
> #ifdef CONFIG_VSX
> -	u64 vsr[32];
> +	u64 vsr[64];
> #endif
> 
> #ifdef CONFIG_PPC_BOOK3S
> @@ -214,29 +234,45 @@ struct kvm_vcpu_arch {
> 	u32 qpr[32];
> #endif
> 
> -#ifdef CONFIG_BOOKE
> -	ulong pc;
> 	ulong ctr;
> 	ulong lr;
> 
> 	ulong xer;
> 	u32 cr;
> -#endif
> +
> +	ulong pc;
> +	ulong msr;
> 
> #ifdef CONFIG_PPC_BOOK3S
> 	ulong shadow_msr;
> 	ulong hflags;
> 	ulong guest_owned_ext;
> +	ulong purr;
> +	ulong spurr;
> +	ulong lpcr;
> +	ulong dscr;
> +	ulong amr;
> +	ulong uamor;
> +	u32 ctrl;
> +	u32 dsisr;
> +	ulong dabr;
> #endif
> 	u32 mmucr;
> +	ulong sprg0;
> +	ulong sprg1;
> +	ulong sprg2;
> +	ulong sprg3;
> 	ulong sprg4;
> 	ulong sprg5;
> 	ulong sprg6;
> 	ulong sprg7;
> +	ulong srr0;
> +	ulong srr1;
> 	ulong csrr0;
> 	ulong csrr1;
> 	ulong dsrr0;
> 	ulong dsrr1;
> +	ulong dear;
> 	ulong esr;
> 	u32 dec;
> 	u32 decar;
> @@ -259,6 +295,9 @@ struct kvm_vcpu_arch {
> 	u32 dbcr1;
> 	u32 dbsr;
> 
> +	u64 mmcr[3];
> +	u32 pmc[6];
> +
> #ifdef CONFIG_KVM_EXIT_TIMING
> 	struct kvmppc_exit_timing timing_exit;
> 	struct kvmppc_exit_timing timing_last_enter;
> @@ -272,8 +311,12 @@ struct kvm_vcpu_arch {
> 	struct dentry *debugfs_exit_timing;
> #endif
> 
> +#ifdef CONFIG_PPC_BOOK3S
> +	ulong fault_dar;
> +	u32 fault_dsisr;
> +#endif
> +
> #ifdef CONFIG_BOOKE
> -	u32 last_inst;
> 	ulong fault_dear;
> 	ulong fault_esr;
> 	ulong queued_dear;
> @@ -288,13 +331,18 @@ struct kvm_vcpu_arch {
> 	u8 dcr_is_write;
> 	u8 osi_needed;
> 	u8 osi_enabled;
> +	u8 hcall_needed;
> 
> 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
> 
> 	struct hrtimer dec_timer;
> 	struct tasklet_struct tasklet;
> 	u64 dec_jiffies;
> +	u64 dec_expires;
> 	unsigned long pending_exceptions;
> +	u16 last_cpu;
> +	u32 last_inst;
> +	int trap;
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
> 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 3210911..cd9ad96 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -110,6 +110,12 @@ extern void kvmppc_booke_exit(void);
> extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
> extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
> 
> +extern long kvmppc_alloc_hpt(struct kvm *kvm);
> +extern void kvmppc_free_hpt(struct kvm *kvm);
> +extern long kvmppc_prepare_vrma(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem);
> +extern void kvmppc_map_vrma(struct kvm *kvm,
> +			    struct kvm_userspace_memory_region *mem);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index ae7b3ef..0bb3fc1 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -90,13 +90,19 @@ extern char initial_stab[];
> 
> #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
> #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
> +#define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
> #define HPTE_R_RPN_SHIFT	12
> -#define HPTE_R_RPN		ASM_CONST(0x3ffffffffffff000)
> -#define HPTE_R_FLAGS		ASM_CONST(0x00000000000003ff)
> +#define HPTE_R_RPN		ASM_CONST(0x0ffffffffffff000)
> #define HPTE_R_PP		ASM_CONST(0x0000000000000003)
> #define HPTE_R_N		ASM_CONST(0x0000000000000004)
> +#define HPTE_R_G		ASM_CONST(0x0000000000000008)
> +#define HPTE_R_M		ASM_CONST(0x0000000000000010)
> +#define HPTE_R_I		ASM_CONST(0x0000000000000020)
> +#define HPTE_R_W		ASM_CONST(0x0000000000000040)
> +#define HPTE_R_WIMG		ASM_CONST(0x0000000000000078)
> #define HPTE_R_C		ASM_CONST(0x0000000000000080)
> #define HPTE_R_R		ASM_CONST(0x0000000000000100)
> +#define HPTE_R_KEY_LO		ASM_CONST(0x0000000000000e00)
> 
> #define HPTE_V_1TB_SEG		ASM_CONST(0x4000000000000000)
> #define HPTE_V_VRMA_MASK	ASM_CONST(0x4001ffffff000000)
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 7412676..8dba5f6 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -149,6 +149,16 @@ struct paca_struct {
> #ifdef CONFIG_KVM_BOOK3S_HANDLER
> 	/* We use this to store guest state in */
> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	struct kvm_vcpu *kvm_vcpu;
> +	u64 dabr;
> +	u64 host_mmcr[3];
> +	u32 host_pmc[6];
> +	u64 host_purr;
> +	u64 host_spurr;
> +	u64 host_dscr;
> +	u64 dec_expires;

Hrm. I'd say either push those into shadow_vcpu for HV mode or get rid of the shadow_vcpu reference. I'd probably prefer the former.
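
I.e. roughly this (untested; the PACA_HOST_* asm-offsets entries would then
be derived from the shadow_vcpu rather than the paca):

struct kvmppc_book3s_shadow_vcpu {
	/* ... existing fields ... */
#ifdef CONFIG_KVM_BOOK3S_64_HV
	struct kvm_vcpu *kvm_vcpu;
	u64 dabr;
	u64 host_mmcr[3];
	u32 host_pmc[6];
	u64 host_purr;
	u64 host_spurr;
	u64 host_dscr;
	u64 dec_expires;
#endif
};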

> +#endif
> #endif
> };
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index c07b7be..0036977 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -189,6 +189,10 @@
> #define SPRN_CTR	0x009	/* Count Register */
> #define SPRN_DSCR	0x11
> #define SPRN_CFAR	0x1c	/* Come From Address Register */
> +#define SPRN_AMR	0x1d	/* Authority Mask Register */
> +#define SPRN_UAMOR	0x9d	/* User Authority Mask Override Register */
> +#define SPRN_AMOR	0x15d	/* Authority Mask Override Register */
> +#define SPRN_RWMR	885
> #define SPRN_CTRLF	0x088
> #define SPRN_CTRLT	0x098
> #define   CTRL_CT	0xc0000000	/* current thread */
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 6887661..49e97fd 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -187,6 +187,7 @@ int main(void)
> 	DEFINE(LPPACASRR1, offsetof(struct lppaca, saved_srr1));
> 	DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int));
> 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
> +	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
> 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
> 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
> #endif /* CONFIG_PPC_STD_MMU_64 */
> @@ -200,9 +201,17 @@ int main(void)
> 	DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
> -	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
> -	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
> +	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
> +	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
> +	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
> +	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, host_spurr));
> +	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
> +	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
> +	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, dec_expires));
> #endif
> +#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
> #endif /* CONFIG_PPC64 */
> 
> 	/* RTAS */
> @@ -396,6 +405,28 @@ int main(void)
> 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
> 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
> 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
> +	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
> +	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
> +#ifdef CONFIG_ALTIVEC
> +	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
> +	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
> +#endif
> +#ifdef CONFIG_VSX
> +	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
> +#endif
> +	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
> +	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
> +	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
> +	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
> +	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
> +	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
> +	DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
> +	DEFINE(VCPU_SRR0, offsetof(struct kvm_vcpu, arch.srr0));
> +	DEFINE(VCPU_SRR1, offsetof(struct kvm_vcpu, arch.srr1));
> +	DEFINE(VCPU_SPRG0, offsetof(struct kvm_vcpu, arch.sprg0));
> +	DEFINE(VCPU_SPRG1, offsetof(struct kvm_vcpu, arch.sprg1));
> +	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.sprg2));
> +	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.sprg3));
> 	DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
> 	DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
> 	DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
> @@ -406,16 +437,65 @@ int main(void)
> 
> 	/* book3s */
> #ifdef CONFIG_PPC_BOOK3S
> +	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
> +	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
> +	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
> +	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
> +	DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
> +	DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
> +	DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
> +	DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
> +	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
> +	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
> 	DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
> 	DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
> 	DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
> +	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
> +	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
> +	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
> +	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
> +	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
> +	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
> +	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
> 	DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
> 	DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
> 	DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
> 	DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall));
> 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
> +	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.dsisr));
> +	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.dear));
> +	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
> +	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
> +	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
> +	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
> +	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
> +	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> +	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
> +	DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
> +	DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
> +	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
> +	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
> +	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
> 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
> 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
> +	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
> +	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
> +	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					scratch0));
> +	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					scratch1));
> +	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					in_guest));
> +	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
> +	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
> +	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> +#ifdef CONFIG_PPC64
> +	DEFINE(SVCPU_SLB, offsetof(struct kvmppc_book3s_shadow_vcpu, slb));
> +	DEFINE(SVCPU_SLB_MAX, offsetof(struct kvmppc_book3s_shadow_vcpu, slb_max));
> +#endif
> +	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
> +					 vmhandler));
> 	DEFINE(SVCPU_CR, offsetof(struct kvmppc_book3s_shadow_vcpu, cr));
> 	DEFINE(SVCPU_XER, offsetof(struct kvmppc_book3s_shadow_vcpu, xer));
> 	DEFINE(SVCPU_CTR, offsetof(struct kvmppc_book3s_shadow_vcpu, ctr));
> @@ -435,16 +515,6 @@ int main(void)
> 	DEFINE(SVCPU_R11, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[11]));
> 	DEFINE(SVCPU_R12, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[12]));
> 	DEFINE(SVCPU_R13, offsetof(struct kvmppc_book3s_shadow_vcpu, gpr[13]));
> -	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
> -	DEFINE(SVCPU_HOST_R2, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r2));
> -	DEFINE(SVCPU_VMHANDLER, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					 vmhandler));
> -	DEFINE(SVCPU_SCRATCH0, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					scratch0));
> -	DEFINE(SVCPU_SCRATCH1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					scratch1));
> -	DEFINE(SVCPU_IN_GUEST, offsetof(struct kvmppc_book3s_shadow_vcpu,
> -					in_guest));
> 	DEFINE(SVCPU_FAULT_DSISR, offsetof(struct kvmppc_book3s_shadow_vcpu,
> 					   fault_dsisr));
> 	DEFINE(SVCPU_FAULT_DAR, offsetof(struct kvmppc_book3s_shadow_vcpu,
> @@ -453,6 +523,7 @@ int main(void)
> 					 last_inst));
> 	DEFINE(SVCPU_SHADOW_SRR1, offsetof(struct kvmppc_book3s_shadow_vcpu,
> 					   shadow_srr1));
> +#endif /* CONFIG_KVM_BOOK3S_NONHV */
> #ifdef CONFIG_PPC_BOOK3S_32
> 	DEFINE(SVCPU_SR, offsetof(struct kvmppc_book3s_shadow_vcpu, sr));
> #endif
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index cbdf374..80c6456 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -87,14 +87,14 @@ data_access_not_stab:
> END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB)
> #endif
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
> -				 KVMTEST, 0x300)
> +				 KVMTEST_NONHV, 0x300)
> 
> 	. = 0x380
> 	.globl data_access_slb_pSeries
> data_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x380)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_DAR
> #ifdef __DISABLED__
> @@ -125,7 +125,7 @@ data_access_slb_pSeries:
> instruction_access_slb_pSeries:
> 	HMT_MEDIUM
> 	SET_SCRATCH0(r13)
> -	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
> +	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_NONHV, 0x480)
> 	std	r3,PACA_EXSLB+EX_R3(r13)
> 	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
> #ifdef __DISABLED__
> @@ -153,32 +153,32 @@ instruction_access_slb_pSeries:
> hardware_interrupt_pSeries:
> hardware_interrupt_hv:
> 	BEGIN_FTR_SECTION
> -		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> -					    EXC_STD, SOFTEN_TEST)
> -		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> -	FTR_SECTION_ELSE
> 		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
> 					    EXC_HV, SOFTEN_TEST_HV)
> 		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
> -	ALT_FTR_SECTION_END_IFCLR(CPU_FTR_HVMODE_206)
> +	FTR_SECTION_ELSE
> +		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
> +					    EXC_STD, SOFTEN_TEST_NONHV)
> +		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
> +	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206)
> 
> 	STD_EXCEPTION_PSERIES(0x600, 0x600, alignment)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x600)
> 
> 	STD_EXCEPTION_PSERIES(0x700, 0x700, program_check)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x700)
> 
> 	STD_EXCEPTION_PSERIES(0x800, 0x800, fp_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x800)
> 
> 	MASKABLE_EXCEPTION_PSERIES(0x900, 0x900, decrementer)
> 	MASKABLE_EXCEPTION_HV(0x980, 0x982, decrementer)
> 
> 	STD_EXCEPTION_PSERIES(0xa00, 0xa00, trap_0a)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xa00)
> 
> 	STD_EXCEPTION_PSERIES(0xb00, 0xb00, trap_0b)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xb00)
> 
> 	. = 0xc00
> 	.globl	system_call_pSeries
> @@ -219,7 +219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
> 	b	.
> 
> 	STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xd00)
> 
> 	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
> 	 * out of line to handle them
> @@ -254,23 +254,23 @@ vsx_unavailable_pSeries_1:
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
> #endif /* CONFIG_CBE_RAS */
> 
> 	STD_EXCEPTION_PSERIES(0x1300, 0x1300, instruction_breakpoint)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
> #endif /* CONFIG_CBE_RAS */
> 
> 	STD_EXCEPTION_PSERIES(0x1700, 0x1700, altivec_assist)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x1700)
> 
> #ifdef CONFIG_CBE_RAS
> 	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
> #endif /* CONFIG_CBE_RAS */
> 
> 	. = 0x3000
> @@ -297,7 +297,7 @@ data_access_check_stab:
> 	mfspr	r9,SPRN_DSISR
> 	srdi	r10,r10,60
> 	rlwimi	r10,r9,16,0x20
> -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 	lbz	r9,PACA_KVM_SVCPU+SVCPU_IN_GUEST(r13)
> 	rlwimi	r10,r9,8,0x300
> #endif
> @@ -316,11 +316,11 @@ do_stab_bolted_pSeries:
> 	EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD)
> #endif /* CONFIG_POWER4_ONLY */
> 
> -	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> -	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
> -	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXGEN, EXC_STD, 0x300)
> +	KVM_HANDLER_NONHV_SKIP(PACA_EXSLB, EXC_STD, 0x380)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x400)
> +	KVM_HANDLER_NONHV(PACA_EXSLB, EXC_STD, 0x480)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0x900)
> 	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
> 
> 	.align	7
> @@ -337,11 +337,11 @@ do_stab_bolted_pSeries:
> 
> 	/* moved from 0xf00 */
> 	STD_EXCEPTION_PSERIES(., 0xf00, performance_monitor)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf00)
> 	STD_EXCEPTION_PSERIES(., 0xf20, altivec_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf20)
> 	STD_EXCEPTION_PSERIES(., 0xf40, vsx_unavailable)
> -	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
> +	KVM_HANDLER_NONHV(PACA_EXGEN, EXC_STD, 0xf40)
> 
> /*
>  * An interrupt came in while soft-disabled; clear EE in SRR1,
> @@ -418,7 +418,11 @@ slb_miss_user_pseries:
> /* KVM's trampoline code needs to be close to the interrupt handlers */
> 
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> #include "../kvm/book3s_rmhandlers.S"
> +#else
> +#include "../kvm/book3s_hv_rmhandlers.S"
> +#endif
> #endif
> 
> 	.align	7
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index b7baff7..6ff191b 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -20,7 +20,6 @@ config KVM
> 	bool
> 	select PREEMPT_NOTIFIERS
> 	select ANON_INODES
> -	select KVM_MMIO
> 
> config KVM_BOOK3S_HANDLER
> 	bool
> @@ -28,16 +27,22 @@ config KVM_BOOK3S_HANDLER
> config KVM_BOOK3S_32_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> +	select KVM_MMIO
> 
> config KVM_BOOK3S_64_HANDLER
> 	bool
> 	select KVM_BOOK3S_HANDLER
> 
> +config KVM_BOOK3S_NONHV
> +	bool
> +	select KVM_MMIO
> +
> config KVM_BOOK3S_32
> 	tristate "KVM support for PowerPC book3s_32 processors"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_32 && !SMP && !PTE_64BIT
> 	select KVM
> 	select KVM_BOOK3S_32_HANDLER
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_32 guest kernels
> 	  in virtual machines on book3s_32 host processors.
> @@ -48,10 +53,38 @@ config KVM_BOOK3S_32
> 	  If unsure, say N.
> 
> config KVM_BOOK3S_64
> -	tristate "KVM support for PowerPC book3s_64 processors"
> +	bool
> +	select KVM_BOOK3S_64_HANDLER
> +
> +config KVM_BOOK3S_64_HV
> +	bool "KVM support for POWER7 using hypervisor mode in host"
> 	depends on EXPERIMENTAL && PPC_BOOK3S_64
> 	select KVM
> -	select KVM_BOOK3S_64_HANDLER
> +	select KVM_BOOK3S_64
> +	---help---
> +	  Support running unmodified book3s_64 guest kernels in
> +	  virtual machines on POWER7 processors that have hypervisor
> +	  mode available to the host.
> +
> +	  If you say Y here, KVM will use the hardware virtualization
> +	  facilities of POWER7 (and later) processors, meaning that
> +	  guest operating systems will run at full hardware speed
> +	  using supervisor and user modes.  However, this also means
> +	  that KVM is not usable under PowerVM (pHyp), is only usable
> +	  on POWER7 (or later) processors, and can only emulate
> +	  POWER5+, POWER6 and POWER7 processors.
> +
> +	  This module provides access to the hardware capabilities through
> +	  a character device node named /dev/kvm.
> +
> +	  If unsure, say N.
> +
> +config KVM_BOOK3S_64_NONHV
> +	tristate "KVM support for PowerPC book3s_64 processors"
> +	depends on EXPERIMENTAL && PPC_BOOK3S_64 && !KVM_BOOK3S_64_HV
> +	select KVM
> +	select KVM_BOOK3S_64
> +	select KVM_BOOK3S_NONHV
> 	---help---
> 	  Support running unmodified book3s_64 and book3s_32 guest kernels
> 	  in virtual machines on book3s_64 host processors.
> @@ -65,6 +98,7 @@ config KVM_440
> 	bool "KVM support for PowerPC 440 processors"
> 	depends on EXPERIMENTAL && 44x
> 	select KVM
> +	select KVM_MMIO

e500 should also select MMIO, no?

> 	---help---
> 	  Support running unmodified 440 guest kernels in virtual machines on
> 	  440 host processors.
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index bf9854f..37c1a60 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -14,7 +14,7 @@ CFLAGS_emulate.o  := -I.
> 
> common-objs-y += powerpc.o emulate.o
> obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
> -obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
> +obj-$(CONFIG_KVM_BOOK3S_NONHV) += book3s_exports.o
> 
> AFLAGS_booke_interrupts.o := -I$(obj)
> 
> @@ -38,7 +38,7 @@ kvm-e500-objs := \
> 	e500_emulate.o
> kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
> 
> -kvm-book3s_64-objs := \
> +kvm-book3s_64_nonhv-objs := \
> 	$(common-objs-y) \
> 	fpu.o \
> 	book3s_paired_singles.o \
> @@ -50,7 +50,17 @@ kvm-book3s_64-objs := \
> 	book3s_64_mmu_host.o \
> 	book3s_64_mmu.o \
> 	book3s_32_mmu.o
> -kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_NONHV) := $(kvm-book3s_64_nonhv-objs)
> +
> +kvm-book3s_64_hv-objs := \
> +	../../../virt/kvm/kvm_main.o \
> +	powerpc.o \
> +	emulate.o \
> +	book3s.o \
> +	book3s_hv.o \
> +	book3s_hv_interrupts.o \
> +	book3s_64_mmu_hv.o
> +kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
> 
> kvm-book3s_32-objs := \
> 	$(common-objs-y) \
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> new file mode 100644
> index 0000000..52d1be1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -0,0 +1,258 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/highmem.h>
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +
> +#include <asm/tlbflush.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu-hash64.h>
> +#include <asm/hvcall.h>
> +#include <asm/synch.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/cputable.h>
> +
> +/* For now use fixed-size 16MB page table */
> +#define HPT_ORDER	24
> +#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
> +#define HPT_HASH_MASK	(HPT_NPTEG - 1)
> +
> +/* Pages in the VRMA are 16MB pages */
> +#define VRMA_PAGE_ORDER	24
> +#define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
> +
> +#define NR_LPIDS	1024
> +unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
> +
> +long kvmppc_alloc_hpt(struct kvm *kvm)
> +{
> +	unsigned long hpt;
> +	unsigned long lpid;
> +
> +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> +			       HPT_ORDER - PAGE_SHIFT);

This would end up failing quite often, no?

> +	if (!hpt) {
> +		pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
> +		return -ENOMEM;
> +	}
> +	kvm->arch.hpt_virt = hpt;
> +
> +	do {
> +		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
> +		if (lpid >= NR_LPIDS) {
> +			pr_err("kvm_alloc_hpt: No LPIDs free\n");
> +			free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
> +			return -ENOMEM;
> +		}
> +	} while (test_and_set_bit(lpid, lpid_inuse));
> +
> +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
> +	kvm->arch.lpid = lpid;
> +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
> +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
> +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);

How do these correlate with the guest's hv mmu? I'd like to keep the code clean enough so we can potentially use it for PR mode as well :).

> +	
> +	pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
> +	return 0;
> +}
> +
> +void kvmppc_free_hpt(struct kvm *kvm)
> +{
> +	unsigned long i;
> +	struct kvmppc_pginfo *pginfo;
> +
> +	clear_bit(kvm->arch.lpid, lpid_inuse);
> +	free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
> +
> +	if (kvm->arch.ram_pginfo) {
> +		pginfo = kvm->arch.ram_pginfo;
> +		kvm->arch.ram_pginfo = NULL;
> +		for (i = 0; i < kvm->arch.ram_npages; ++i)
> +			put_page(pfn_to_page(pginfo[i].pfn));
> +		kfree(pginfo);
> +	}
> +}
> +
> +static unsigned long user_page_size(unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned long size = PAGE_SIZE;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vma = find_vma(current->mm, addr);
> +	if (vma)
> +		size = vma_kernel_pagesize(vma);
> +	up_read(&current->mm->mmap_sem);
> +	return size;
> +}
> +
> +static pfn_t hva_to_pfn(unsigned long addr)
> +{
> +	struct page *page[1];
> +	int npages;
> +
> +	might_sleep();
> +
> +	npages = get_user_pages_fast(addr, 1, 1, page);
> +
> +	if (unlikely(npages != 1))
> +		return 0;
> +
> +	return page_to_pfn(page[0]);
> +}
> +
> +long kvmppc_prepare_vrma(struct kvm *kvm,
> +			 struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long psize, porder;
> +	unsigned long i, npages;
> +	struct kvmppc_pginfo *pginfo;
> +	pfn_t pfn;
> +	unsigned long hva;
> +
> +	/* First see what page size we have */
> +	psize = user_page_size(mem->userspace_addr);
> +	/* For now, only allow 16MB pages */

The reason to go for 16MB pages is because of the host mmu code, not the guest hv mmu. So please at least #ifdef the code to HV so we document that correlation.
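
E.g. something like this (sketch only; the pr_err is dropped for brevity):

#ifdef CONFIG_KVM_BOOK3S_64_HV
	/* 16MB pages are what the host-side VRMA mapping code
	 * (kvmppc_map_vrma) relies on, not a guest HV MMU requirement */
	if (psize != 1ul << VRMA_PAGE_ORDER ||
	    (mem->memory_size & (psize - 1)))
		return -EINVAL;
#endif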

> +	if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) {
> +		pr_err("bad psize=%lx memory_size=%llx @ %llx\n",
> +		       psize, mem->memory_size, mem->userspace_addr);
> +		return -EINVAL;
> +	}
> +	porder = __ilog2(psize);
> +
> +	npages = mem->memory_size >> porder;
> +	pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL);
> +	if (!pginfo) {
> +		pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n",
> +		       npages * sizeof(struct kvmppc_pginfo));
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < npages; ++i) {
> +		hva = mem->userspace_addr + (i << porder);
> +		if (user_page_size(hva) != psize)
> +			goto err;
> +		pfn = hva_to_pfn(hva);
> +		if (pfn == 0) {
> +			pr_err("oops, no pfn for hva %lx\n", hva);
> +			goto err;
> +		}
> +		if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) {
> +			pr_err("oops, unaligned pfn %llx\n", pfn);
> +			put_page(pfn_to_page(pfn));
> +			goto err;
> +		}
> +		pginfo[i].pfn = pfn;
> +	}
> +
> +	kvm->arch.ram_npages = npages;
> +	kvm->arch.ram_psize = psize;
> +	kvm->arch.ram_porder = porder;
> +	kvm->arch.ram_pginfo = pginfo;
> +
> +	return 0;
> +
> + err:
> +	kfree(pginfo);
> +	return -EINVAL;
> +}
> +
> +void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long i;
> +	unsigned long npages = kvm->arch.ram_npages;
> +	unsigned long pfn;
> +	unsigned long *hpte;
> +	unsigned long hash;
> +	struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo;
> +
> +	if (!pginfo)
> +		return;
> +
> +	/* VRMA can't be > 1TB */
> +	if (npages > 1ul << (40 - kvm->arch.ram_porder))
> +		npages = 1ul << (40 - kvm->arch.ram_porder);
> +	/* Can't use more than 1 HPTE per HPTEG */
> +	if (npages > HPT_NPTEG)
> +		npages = HPT_NPTEG;
> +
> +	for (i = 0; i < npages; ++i) {
> +		pfn = pginfo[i].pfn;
> +		/* can't use hpt_hash since va > 64 bits */
> +		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
> +		/*
> +		 * We assume that the hash table is empty and no
> +		 * vcpus are using it at this stage.  Since we create
> +		 * at most one HPTE per HPTEG, we just assume entry 7
> +		 * is available and use it.
> +		 */
> +		hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7));
> +		hpte += 7 * 2;
> +		/* HPTE low word - RPN, protection, etc. */
> +		hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C |
> +			HPTE_R_M | PP_RWXX;
> +		wmb();
> +		hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
> +			(i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED |
> +			HPTE_V_LARGE | HPTE_V_VALID;
> +	}
> +}
> +
> +int kvmppc_mmu_hv_init(void)
> +{
> +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;

Return 0 for "it doesn't work" might not be the right exit code ;).
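
I.e. something like (-ENODEV is only a suggestion; anything non-zero that
kvmppc_book3s_hv_init() can propagate would do):

int kvmppc_mmu_hv_init(void)
{
	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
		return -ENODEV;

	memset(lpid_inuse, 0, sizeof(lpid_inuse));
	set_bit(mfspr(SPRN_LPID), lpid_inuse);
	set_bit(NR_LPIDS - 1, lpid_inuse);

	return 0;
}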

> +	memset(lpid_inuse, 0, sizeof(lpid_inuse));
> +	set_bit(mfspr(SPRN_LPID), lpid_inuse);
> +	set_bit(NR_LPIDS - 1, lpid_inuse);
> +
> +	return 0;
> +}
> +
> +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
> +{
> +	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
> +}
> +
> +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> +				struct kvmppc_pte *gpte, bool data)
> +{
> +	return -ENOENT;

Can't you just call the normal book3s_64 mmu code here? Without xlate, doing ppc_ld doesn't work, which means that reading out the faulting guest instruction breaks. We'd also need it for PR mode :).

> +}
> +
> +void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
> +
> +	vcpu->arch.slb_nr = 32;		/* Assume POWER7 for now */
> +
> +	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
> +	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
> +
> +	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> new file mode 100644
> index 0000000..f6b7cd1
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -0,0 +1,413 @@
> +/*
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
> + *
> + * Authors:
> + *    Paul Mackerras <paulus@au1.ibm.com>
> + *    Alexander Graf <agraf@suse.de>
> + *    Kevin Wolf <mail@kevin-wolf.de>
> + *
> + * Description: KVM functions specific to running on Book 3S
> + * processors in hypervisor mode (specifically POWER7 and later).
> + *
> + * This file is derived from arch/powerpc/kvm/book3s.c,
> + * by Alexander Graf <agraf@suse.de>.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/slab.h>
> +#include <linux/preempt.h>
> +#include <linux/sched.h>
> +#include <linux/delay.h>
> +#include <linux/fs.h>
> +#include <linux/anon_inodes.h>
> +
> +#include <asm/reg.h>
> +#include <asm/cputable.h>
> +#include <asm/cacheflush.h>
> +#include <asm/tlbflush.h>
> +#include <asm/uaccess.h>
> +#include <asm/io.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu_context.h>
> +#include <asm/lppaca.h>
> +#include <asm/processor.h>
> +#include <linux/gfp.h>
> +#include <linux/sched.h>
> +#include <linux/vmalloc.h>
> +#include <linux/highmem.h>
> +
> +/* #define EXIT_DEBUG */
> +/* #define EXIT_DEBUG_SIMPLE */
> +/* #define EXIT_DEBUG_INT */
> +
> +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> +	local_paca->kvm_vcpu = vcpu;
> +	vcpu->cpu = cpu;
> +}
> +
> +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
> +{
> +	vcpu->cpu = -1;
> +}
> +
> +void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +	unsigned long dec_nsec;
> +
> +	now = get_tb();
> +	if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_queue_dec(vcpu);
> +	if (vcpu->arch.pending_exceptions)
> +		return;
> +	if (vcpu->arch.dec_expires != ~(u64)0) {
> +		dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC /
> +			tb_ticks_per_sec;
> +		hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
> +			      HRTIMER_MODE_REL);
> +	}
> +
> +	kvm_vcpu_block(vcpu);
> +	vcpu->stat.halt_wakeup++;
> +
> +	if (vcpu->arch.dec_expires != ~(u64)0)
> +		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
> +}
> +
> +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
> +{
> +	vcpu->arch.msr = msr;
> +}
> +
> +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> +{
> +	vcpu->arch.pvr = pvr;
> +	kvmppc_mmu_book3s_hv_init(vcpu);

No need for you to do it depending on pvr. You can just do the mmu initialization on normal init :).
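
I.e. (untested) something like this in kvmppc_core_vcpu_create():

	err = kvm_vcpu_init(vcpu, kvm, id);
	if (err)
		goto free_vcpu;

	kvmppc_mmu_book3s_hv_init(vcpu);

and then kvmppc_set_pvr() only needs to set vcpu->arch.pvr.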

> +}
> +
> +void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
> +{
> +	int r;
> +
> +	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
> +	pr_err("pc  = %.16lx  msr = %.16lx  trap = %x\n",
> +	       vcpu->arch.pc, vcpu->arch.msr, vcpu->arch.trap);
> +	for (r = 0; r < 16; ++r)
> +		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
> +		       r, kvmppc_get_gpr(vcpu, r),
> +		       r+16, kvmppc_get_gpr(vcpu, r+16));
> +	pr_err("ctr = %.16lx  lr  = %.16lx\n",
> +	       vcpu->arch.ctr, vcpu->arch.lr);
> +	pr_err("srr0 = %.16lx srr1 = %.16lx\n",
> +	       vcpu->arch.srr0, vcpu->arch.srr1);
> +	pr_err("sprg0 = %.16lx sprg1 = %.16lx\n",
> +	       vcpu->arch.sprg0, vcpu->arch.sprg1);
> +	pr_err("sprg2 = %.16lx sprg3 = %.16lx\n",
> +	       vcpu->arch.sprg2, vcpu->arch.sprg3);
> +	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
> +	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.dsisr);
> +	pr_err("dar = %.16lx\n", vcpu->arch.dear);
> +	pr_err("fault dar = %.16lx dsisr = %.8x\n",
> +	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
> +	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
> +	for (r = 0; r < vcpu->arch.slb_max; ++r)
> +		pr_err("  ESID = %.16llx VSID = %.16llx\n",
> +		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
> +	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
> +	       vcpu->arch.lpcr, vcpu->kvm->arch.sdr1,
> +	       vcpu->arch.last_inst);
> +}
> +
> +static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +			      struct task_struct *tsk)
> +{
> +	int r = RESUME_HOST;
> +
> +	vcpu->stat.sum_exits++;
> +
> +	run->exit_reason = KVM_EXIT_UNKNOWN;
> +	run->ready_for_interrupt_injection = 1;
> +	switch (vcpu->arch.trap) {
> +	/* We're good on these - the host merely wanted to get our attention */
> +	case BOOK3S_INTERRUPT_HV_DECREMENTER:
> +		vcpu->stat.dec_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_EXTERNAL:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PERFMON:
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_PROGRAM:
> +	{
> +		ulong flags;
> +		/*
> +		 * Normally program interrupts are delivered directly
> +		 * to the guest by the hardware, but we can get here
> +		 * as a result of a hypervisor emulation interrupt
> +		 * (e40) getting turned into a 700 by BML RTAS.

Not sure I fully understand what's going on there. Mind explaining? :)

> +		 */
> +		flags = vcpu->arch.msr & 0x1f0000ull;
> +		kvmppc_core_queue_program(vcpu, flags);
> +		r = RESUME_GUEST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_SYSCALL:
> +	{
> +		/* hcall - punt to userspace */
> +		int i;
> +

Do we really want to accept sc from pr=1? I'd just reject them straight away.
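
E.g. at the top of the BOOK3S_INTERRUPT_SYSCALL case (untested; whether
reflecting the sc back into the guest or raising a program check is the
right reaction is your call):

		/* sc from guest userspace (PR=1) is not a valid hcall */
		if (vcpu->arch.msr & MSR_PR) {
			kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
			r = RESUME_GUEST;
			break;
		}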

> +		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
> +		for (i = 0; i < 9; ++i)
> +			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
> +		run->exit_reason = KVM_EXIT_PAPR_HCALL;
> +		vcpu->arch.hcall_needed = 1;
> +		r = RESUME_HOST;
> +		break;
> +	}
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> +		vcpu->arch.dear = vcpu->arch.fault_dar;
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> +					0x08000000);

What do these do? I thought the guest gets DSI and ISI directly?

> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
> +		kvmppc_core_queue_program(vcpu, 0x80000);
> +		r = RESUME_GUEST;
> +		break;
> +	default:
> +		kvmppc_dump_regs(vcpu);
> +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%lx\n",
> +			vcpu->arch.trap, kvmppc_get_pc(vcpu), vcpu->arch.msr);
> +		r = RESUME_HOST;
> +		BUG();
> +		break;
> +	}
> +
> +
> +	if (!(r & RESUME_HOST)) {
> +		/* To avoid clobbering exit_reason, only check for signals if
> +		 * we aren't already exiting to userspace for some other
> +		 * reason. */
> +		if (signal_pending(tsk)) {
> +			vcpu->stat.signal_exits++;
> +			run->exit_reason = KVM_EXIT_INTR;
> +			r = -EINTR;
> +		} else {
> +			kvmppc_core_deliver_interrupts(vcpu);
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i;
> +
> +	memset(sregs, 0, sizeof(struct kvm_sregs));
> +
> +	sregs->pvr = vcpu->arch.pvr;
> +	for (i = 0; i < vcpu->arch.slb_max; i++) {
> +		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
> +		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
> +	}
> +
> +	return 0;
> +}
> +
> +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
> +                                  struct kvm_sregs *sregs)
> +{
> +	int i, j;
> +
> +	kvmppc_set_pvr(vcpu, sregs->pvr);
> +
> +	j = 0;
> +	for (i = 0; i < vcpu->arch.slb_nr; i++) {
> +		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
> +			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
> +			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
> +			++j;
> +		}
> +	}
> +	vcpu->arch.slb_max = j;
> +
> +	return 0;
> +}
> +
> +int kvmppc_core_check_processor_compat(void)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE_206))
> +		return 0;
> +	return -EIO;
> +}
> +
> +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> +{
> +	struct kvm_vcpu *vcpu;
> +	int err = -ENOMEM;
> +	unsigned long lpcr;
> +
> +	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> +	if (!vcpu)
> +		goto out;
> +
> +	err = kvm_vcpu_init(vcpu, kvm, id);
> +	if (err)
> +		goto free_vcpu;
> +
> +	vcpu->arch.last_cpu = -1;
> +	vcpu->arch.host_msr = mfmsr();
> +	vcpu->arch.mmcr[0] = MMCR0_FC;
> +	vcpu->arch.ctrl = CTRL_RUNLATCH;
> +	/* default to book3s_64 (power7) */
> +	vcpu->arch.pvr = 0x3f0200;

Wouldn't it make sense to simply default to the host pvr? Not sure - maybe it's not :).
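
I.e. (if it does make sense) simply

	/* default to the host's PVR rather than hardcoding POWER7 */
	vcpu->arch.pvr = mfspr(SPRN_PVR);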

> +	kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
> +
> +	/* remember where some real-mode handlers are */
> +	vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
> +	vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
> +	vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
> +	vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall;
> +
> +	lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES);
> +	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
> +	vcpu->arch.lpcr = lpcr;
> +
> +	return vcpu;
> +
> +free_vcpu:
> +	kfree(vcpu);
> +out:
> +	return ERR_PTR(err);
> +}
> +
> +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> +{
> +	kvm_vcpu_uninit(vcpu);
> +	kfree(vcpu);
> +}
> +
> +extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> +
> +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +{
> +	u64 now;
> +
> +	if (signal_pending(current)) {
> +		run->exit_reason = KVM_EXIT_INTR;
> +		return -EINTR;
> +	}
> +
> +	flush_fp_to_thread(current);

Do those work fine with preemption enabled?

> +	flush_altivec_to_thread(current);
> +	flush_vsx_to_thread(current);
> +	preempt_disable();
> +
> +	kvm_guest_enter();
> +
> +	__kvmppc_vcore_entry(NULL, vcpu);
> +
> +	kvm_guest_exit();
> +
> +	preempt_enable();
> +	kvm_resched(vcpu);
> +
> +	now = get_tb();
> +	/* cancel pending dec exception if dec is positive */
> +	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_dequeue_dec(vcpu);
> +
> +	return kvmppc_handle_exit(run, vcpu, current);
> +}
> +
> +int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		return kvmppc_prepare_vrma(kvm, mem);
> +	return 0;
> +}
> +
> +void kvmppc_core_commit_memory_region(struct kvm *kvm,
> +				struct kvm_userspace_memory_region *mem)
> +{
> +	if (mem->guest_phys_addr == 0 && mem->memory_size != 0)
> +		kvmppc_map_vrma(kvm, mem);
> +}
> +
> +int kvmppc_core_init_vm(struct kvm *kvm)
> +{
> +	long r;
> +
> +	/* Allocate hashed page table */
> +	r = kvmppc_alloc_hpt(kvm);
> +
> +	return r;
> +}
> +
> +void kvmppc_core_destroy_vm(struct kvm *kvm)
> +{
> +	kvmppc_free_hpt(kvm);
> +}
> +
> +/* These are stubs for now */
> +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
> +{
> +}
> +
> +/* We don't need to emulate any privileged instructions or dcbz */
> +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +                           unsigned int inst, int *advance)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
> +{
> +	return EMULATE_FAIL;
> +}
> +
> +static int kvmppc_book3s_hv_init(void)
> +{
> +	int r;
> +
> +	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
> +
> +	if (r)
> +		return r;
> +
> +	r = kvmppc_mmu_hv_init();
> +
> +	return r;
> +}
> +
> +static void kvmppc_book3s_hv_exit(void)
> +{
> +	kvm_exit();
> +}
> +
> +module_init(kvmppc_book3s_hv_init);
> +module_exit(kvmppc_book3s_hv_exit);
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
> new file mode 100644
> index 0000000..19d152d
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
> @@ -0,0 +1,326 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_interrupts.S, which is:
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +#include <asm/ppc-opcode.h>
> +
> +#define DISABLE_INTERRUPTS	\
> +	mfmsr   r0;		\
> +	rldicl  r0,r0,48,1;	\
> +	rotldi  r0,r0,16;	\
> +	mtmsrd  r0,1;		\
> +
> +/*****************************************************************************
> + *                                                                           *
> + *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
> + *                                                                           *
> + ****************************************************************************/
> +
> +/* Registers:
> + *  r4: vcpu pointer
> + */
> +_GLOBAL(__kvmppc_vcore_entry)
> +
> +	/* Write correct stack frame */
> +	mflr	r0
> +	std	r0,PPC_LR_STKOFF(r1)
> +
> +	/* Save host state to the stack */
> +	stdu	r1, -SWITCH_FRAME_SIZE(r1)
> +
> +	/* Save non-volatile registers (r14 - r31) */
> +	SAVE_NVGPRS(r1)
> +
> +	/* Save host PMU registers and load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) but not r3 or r5 */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
> +	isync
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r5, LPPACA_PMCINUSE(r3)
> +	cmpwi	r5, 0
> +	beq	31f			/* skip if not */
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r7, PACA_HOST_MMCR(r13)
> +	std	r5, PACA_HOST_MMCR + 8(r13)
> +	std	r6, PACA_HOST_MMCR + 16(r13)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r5, SPRN_PMC2
> +	mfspr	r6, SPRN_PMC3
> +	mfspr	r7, SPRN_PMC4
> +	mfspr	r8, SPRN_PMC5
> +	mfspr	r9, SPRN_PMC6
> +	stw	r3, PACA_HOST_PMC(r13)
> +	stw	r5, PACA_HOST_PMC + 4(r13)
> +	stw	r6, PACA_HOST_PMC + 8(r13)
> +	stw	r7, PACA_HOST_PMC + 12(r13)
> +	stw	r8, PACA_HOST_PMC + 16(r13)
> +	stw	r9, PACA_HOST_PMC + 20(r13)
> +31:
> +
> +	/* Save host DSCR */
> +	mfspr	r3, SPRN_DSCR
> +	std	r3, PACA_HOST_DSCR(r13)
> +
> +	/* Save host DABR */
> +	mfspr	r3, SPRN_DABR
> +	std	r3, PACA_DABR(r13)
> +
> +	DISABLE_INTERRUPTS
> +
> +	/*
> +	 * Put whatever is in the decrementer into the
> +	 * hypervisor decrementer.
> +	 */
> +	mfspr	r8,SPRN_DEC
> +	mftb	r7
> +	mtspr	SPRN_HDEC,r8

Can't we just always use HDEC on the host? That'd save us from all the swapping here.

> +	extsw	r8,r8
> +	add	r8,r8,r7
> +	std	r8,PACA_KVM_DECEXP(r13)

Why is dec+hdec on vcpu_run decexp? What exactly does this store?

> +
> +	ld	r5, VCPU_TRAMPOLINE_ENTER(r4)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +
> +	/* Jump to segment patching handler and into our guest */
> +	b	kvmppc_rmcall
> +
> +/*
> + * This is the handler in module memory. It gets jumped at from the
> + * lowmem trampoline code, so it's basically the guest exit code.
> + *
> + */
> +
> +.global kvmppc_handler_highmem
> +kvmppc_handler_highmem:
> +
> +	/*
> +	 * Register usage at this point:
> +	 *
> +	 * R1       = host R1
> +	 * R2       = host R2
> +	 * R12      = exit handler id
> +	 * R13      = PACA
> +	 * SVCPU.*  = guest *
> +	 *
> +	 */
> +
> +	/* R7 = vcpu */
> +	ld	r7, PACA_KVM_VCPU(r13)
> +
> +	/*
> +	 * Reload DEC.  HDEC interrupts were disabled when
> +	 * we reloaded the host's LPCR value.
> +	 */
> +	ld	r3, PACA_KVM_DECEXP(r13)
> +	mftb	r4
> +	subf	r4, r4, r3
> +	mtspr	SPRN_DEC, r4
> +
> +	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
> +	lbz	r4, LPPACA_PMCINUSE(r3)
> +	cmpwi	r4, 0
> +	beq	23f			/* skip if not */
> +	lwz	r3, PACA_HOST_PMC(r13)
> +	lwz	r4, PACA_HOST_PMC + 4(r13)
> +	lwz	r5, PACA_HOST_PMC + 4(r13)

copy&paste error?

> +	lwz	r6, PACA_HOST_PMC + 4(r13)
> +	lwz	r8, PACA_HOST_PMC + 4(r13)
> +	lwz	r9, PACA_HOST_PMC + 4(r13)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r4
> +	mtspr	SPRN_PMC3, r5
> +	mtspr	SPRN_PMC4, r6
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, PACA_HOST_MMCR(r13)
> +	ld	r4, PACA_HOST_MMCR + 8(r13)
> +	ld	r5, PACA_HOST_MMCR + 16(r13)
> +	mtspr	SPRN_MMCR1, r4
> +	mtspr	SPRN_MMCRA, r5
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +23:
> +
> +	/* Restore host msr -> SRR1 */
> +	ld	r4, VCPU_HOST_MSR(r7)
> +
> +	/*
> +	 * For some interrupts, we need to call the real Linux
> +	 * handler, so it can do work for us. This has to happen
> +	 * as if the interrupt arrived from the kernel though,
> +	 * so let's fake it here where most state is restored.
> +	 *
> +	 * Call Linux for hardware interrupts/decrementer
> +	 * r3 = address of interrupt handler (exit reason)
> +	 */
> +	/* Note: preemption is disabled at this point */
> +
> +	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
> +	beq	1f
> +	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
> +	beq	1f
> +
> +	/* Back to EE=1 */
> +	mtmsr	r4
> +	sync
> +	b	kvm_return_point
> +
> +1:	bl	call_linux_handler
> +
> +.global kvm_return_point
> +kvm_return_point:
> +	/* Restore non-volatile host registers (r14 - r31) */
> +	REST_NVGPRS(r1)
> +
> +	addi    r1, r1, SWITCH_FRAME_SIZE
> +	ld	r0, PPC_LR_STKOFF(r1)
> +	mtlr	r0
> +	blr
> +
> +call_linux_handler:
> +	/* Restore host IP -> SRR0 */
> +	mflr	r3
> +	mtlr	r12
> +
> +	ld	r5, VCPU_TRAMPOLINE_LOWMEM(r7)
> +	LOAD_REG_IMMEDIATE(r6, MSR_KERNEL & ~(MSR_IR | MSR_DR))
> +	b	kvmppc_rmcall
> +
> +/*
> + * Save away FP, VMX and VSX registers.
> + * r3 = vcpu pointer
> +*/
> +_GLOBAL(kvmppc_save_fp)
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VSRS
> +	stxvd2x	reg,r6,r3
> +	reg = reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg = 0
> +	.rept	32
> +	stfd	reg,reg*8+VCPU_FPRS(r3)
> +	reg = reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +	mffs	fr0
> +	stfd	fr0,VCPU_FPSCR(r3)
> +
> +#ifdef CONFIG_ALTIVEC
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r6,reg*16+VCPU_VRS
> +	stvx	reg,r6,r3
> +	reg = reg + 1
> +	.endr
> +	mfvscr	vr0
> +	li	r6,VCPU_VSCR
> +	stvx	vr0,r6,r3
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	mfspr	r6,SPRN_VRSAVE
> +	stw	r6,VCPU_VRSAVE(r3)
> +	mtmsrd	r9
> +	isync
> +	blr
> +
> +/*
> + * Load up FP, VMX and VSX registers
> + * r4 = vcpu pointer
> + */
> +	.globl	kvmppc_load_fp
> +kvmppc_load_fp:
> +	mfmsr	r9
> +	ori	r8,r9,MSR_FP
> +#ifdef CONFIG_ALTIVEC
> +#ifdef CONFIG_VSX
> +	oris	r8,r8,(MSR_VEC|MSR_VSX)@h
> +#else
> +	oris	r8,r8,MSR_VEC@h
> +#endif
> +#endif
> +	mtmsrd	r8
> +	isync
> +	lfd	fr0,VCPU_FPSCR(r4)
> +	MTFSF_L(fr0)
> +#ifdef CONFIG_VSX
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VSRS
> +	lxvd2x	reg,r7,r4
> +	reg = reg + 1
> +	.endr
> +FTR_SECTION_ELSE
> +#endif
> +	reg = 0
> +	.rept	32
> +	lfd	reg,reg*8+VCPU_FPRS(r4)
> +	reg = reg + 1
> +	.endr
> +#ifdef CONFIG_VSX
> +ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
> +#endif
> +
> +#ifdef CONFIG_ALTIVEC
> +	li	r7,VCPU_VSCR
> +	lvx	vr0,r7,r4
> +	mtvscr	vr0
> +BEGIN_FTR_SECTION
> +	reg = 0
> +	.rept	32
> +	li	r7,reg*16+VCPU_VRS
> +	lvx	reg,r7,r4
> +	reg = reg + 1
> +	.endr
> +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
> +#endif
> +	lwz	r7,VCPU_VRSAVE(r4)
> +	mtspr	SPRN_VRSAVE,r7
> +	blr
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> new file mode 100644
> index 0000000..813b01c
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -0,0 +1,663 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + *
> + * Derived from book3s_rmhandlers.S and other files, which are:
> + *
> + * Copyright SUSE Linux Products GmbH 2009
> + *
> + * Authors: Alexander Graf <agraf@suse.de>
> + */
> +
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/page.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/exception-64s.h>
> +
> +/*****************************************************************************
> + *                                                                           *
> + *        Real Mode handlers that need to be in the linear mapping           *
> + *                                                                           *
> + ****************************************************************************/
> +
> +#define SHADOW_VCPU_OFF		PACA_KVM_SVCPU
> +
> +	.globl	kvmppc_skip_interrupt
> +kvmppc_skip_interrupt:
> +	mfspr	r13,SPRN_SRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_SRR0,r13
> +	GET_SCRATCH0(r13)
> +	rfid
> +	b	.
> +
> +	.globl	kvmppc_skip_Hinterrupt
> +kvmppc_skip_Hinterrupt:
> +	mfspr	r13,SPRN_HSRR0
> +	addi	r13,r13,4
> +	mtspr	SPRN_HSRR0,r13
> +	GET_SCRATCH0(r13)
> +	hrfid
> +	b	.
> +
> +/*
> + * This trampoline brings us back to a real mode handler
> + *
> + * Input Registers:
> + *
> + * R5 = SRR0
> + * R6 = SRR1
> + * R12 = real-mode IP
> + * LR = real-mode IP
> + *
> + */
> +.global kvmppc_handler_lowmem_trampoline
> +kvmppc_handler_lowmem_trampoline:
> +	cmpwi	r12,0x500
> +	beq	1f
> +	cmpwi	r12,0x980
> +	beq	1f

What?

1) use the macros please
2) why?

> +	mtsrr0	r3
> +	mtsrr1	r4
> +	blr
> +1:	mtspr	SPRN_HSRR0,r3
> +	mtspr	SPRN_HSRR1,r4
> +	blr
> +
> +/*
> + * Call a function in real mode.
> + * Must be called with interrupts hard-disabled.
> + *
> + * Input Registers:
> + *
> + * R5 = function
> + * R6 = MSR
> + * R7 = scratch register
> + *
> + */
> +_GLOBAL(kvmppc_rmcall)
> +	mfmsr	r7
> +	li	r0,MSR_RI		/* clear RI in MSR */
> +	andc	r7,r7,r0
> +	mtmsrd	r7,1
> +	mtsrr0	r5
> +	mtsrr1	r6
> +	RFI
> +
> +.global kvmppc_trampoline_lowmem
> +kvmppc_trampoline_lowmem:
> +	PPC_LONG kvmppc_handler_lowmem_trampoline - _stext
> +
> +.global kvmppc_trampoline_enter
> +kvmppc_trampoline_enter:
> +	PPC_LONG kvmppc_handler_trampoline_enter - _stext
> +
> +#define ULONG_SIZE 		8
> +#define VCPU_GPR(n)		(VCPU_GPRS + (n * ULONG_SIZE))
> +
> +/******************************************************************************
> + *                                                                            *
> + *                               Entry code                                   *
> + *                                                                            *
> + *****************************************************************************/
> +
> +.global kvmppc_handler_trampoline_enter
> +kvmppc_handler_trampoline_enter:
> +
> +	/* Required state:
> +	 *
> +	 * R4 = vcpu pointer
> +	 * MSR = ~IR|DR
> +	 * R13 = PACA
> +	 * R1 = host R1
> +	 * all other volatile GPRS = free
> +	 */
> +	ld	r14, VCPU_GPR(r14)(r4)
> +	ld	r15, VCPU_GPR(r15)(r4)
> +	ld	r16, VCPU_GPR(r16)(r4)
> +	ld	r17, VCPU_GPR(r17)(r4)
> +	ld	r18, VCPU_GPR(r18)(r4)
> +	ld	r19, VCPU_GPR(r19)(r4)
> +	ld	r20, VCPU_GPR(r20)(r4)
> +	ld	r21, VCPU_GPR(r21)(r4)
> +	ld	r22, VCPU_GPR(r22)(r4)
> +	ld	r23, VCPU_GPR(r23)(r4)
> +	ld	r24, VCPU_GPR(r24)(r4)
> +	ld	r25, VCPU_GPR(r25)(r4)
> +	ld	r26, VCPU_GPR(r26)(r4)
> +	ld	r27, VCPU_GPR(r27)(r4)
> +	ld	r28, VCPU_GPR(r28)(r4)
> +	ld	r29, VCPU_GPR(r29)(r4)
> +	ld	r30, VCPU_GPR(r30)(r4)
> +	ld	r31, VCPU_GPR(r31)(r4)
> +
> +	/* Load guest PMU registers */
> +	/* R4 is live here (vcpu pointer) */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
> +	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
> +	lwz	r6, VCPU_PMC + 8(r4)
> +	lwz	r7, VCPU_PMC + 12(r4)
> +	lwz	r8, VCPU_PMC + 16(r4)
> +	lwz	r9, VCPU_PMC + 20(r4)
> +	mtspr	SPRN_PMC1, r3
> +	mtspr	SPRN_PMC2, r5
> +	mtspr	SPRN_PMC3, r6
> +	mtspr	SPRN_PMC4, r7
> +	mtspr	SPRN_PMC5, r8
> +	mtspr	SPRN_PMC6, r9
> +	ld	r3, VCPU_MMCR(r4)
> +	ld	r5, VCPU_MMCR + 8(r4)
> +	ld	r6, VCPU_MMCR + 16(r4)
> +	mtspr	SPRN_MMCR1, r5
> +	mtspr	SPRN_MMCRA, r6
> +	mtspr	SPRN_MMCR0, r3
> +	isync
> +
> +	/* Load up FP, VMX and VSX registers */
> +	bl	kvmppc_load_fp
> +
> +	/* Switch DSCR to guest value */
> +	ld	r5, VCPU_DSCR(r4)
> +	mtspr	SPRN_DSCR, r5
> +
> +	/*
> +	 * Set the decrementer to the guest decrementer.
> +	 */
> +	ld	r8,VCPU_DEC_EXPIRES(r4)
> +	mftb	r7
> +	subf	r3,r7,r8
> +	mtspr	SPRN_DEC,r3
> +	stw	r3,VCPU_DEC(r4)
> +
> +	ld	r5, VCPU_SPRG0(r4)
> +	ld	r6, VCPU_SPRG1(r4)
> +	ld	r7, VCPU_SPRG2(r4)
> +	ld	r8, VCPU_SPRG3(r4)
> +	mtspr	SPRN_SPRG0, r5
> +	mtspr	SPRN_SPRG1, r6
> +	mtspr	SPRN_SPRG2, r7
> +	mtspr	SPRN_SPRG3, r8
> +
> +	/* Save R1 in the PACA */
> +	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
> +
> +	/* Load up DAR and DSISR */
> +	ld	r5, VCPU_DAR(r4)
> +	lwz	r6, VCPU_DSISR(r4)
> +	mtspr	SPRN_DAR, r5
> +	mtspr	SPRN_DSISR, r6
> +
> +	/* Set partition DABR */
> +	li	r5,3
> +	ld	r6,VCPU_DABR(r4)
> +	mtspr	SPRN_DABRX,r5
> +	mtspr	SPRN_DABR,r6
> +
> +	/* Restore AMR and UAMOR, set AMOR to all 1s */
> +	ld	r5,VCPU_AMR(r4)
> +	ld	r6,VCPU_UAMOR(r4)
> +	li	r7,-1
> +	mtspr	SPRN_AMR,r5
> +	mtspr	SPRN_UAMOR,r6
> +	mtspr	SPRN_AMOR,r7
> +
> +	/* Clear out SLB */
> +	li	r6,0
> +	slbmte	r6,r6
> +	slbia
> +	ptesync
> +
> +	/* Switch to guest partition. */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	ld	r6,KVM_SDR1(r9)
> +	lwz	r7,KVM_LPID(r9)
> +	li	r0,0x3ff		/* switch to reserved LPID */
> +	mtspr	SPRN_LPID,r0
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	ld	r8,VCPU_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* Check if HDEC expires soon */
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,10
> +	li	r12,0x980

define

> +	mr	r9,r4
> +	blt	hdec_soon

Is it faster to do the check than to save the check and take the odds? Also, maybe we should rather do the check in preemptible code that could just directly pass the time slice on.

> +
> +	/*
> +	 * Invalidate the TLB if we could possibly have stale TLB
> +	 * entries for this partition on this core due to the use
> +	 * of tlbiel.
> +	 */
> +	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> +	lwz	r5,VCPU_VCPUID(r4)
> +	lhz	r6,PACAPACAINDEX(r13)
> +	lhz	r8,VCPU_LAST_CPU(r4)
> +	sldi	r7,r6,1			/* see if this is the same vcpu */
> +	add	r7,r7,r9		/* as last ran on this pcpu */
> +	lhz	r0,KVM_LAST_VCPU(r7)
> +	cmpw	r6,r8			/* on the same cpu core as last time? */
> +	bne	3f
> +	cmpw	r0,r5			/* same vcpu as this core last ran? */
> +	beq	1f
> +3:	sth	r6,VCPU_LAST_CPU(r4)	/* if not, invalidate partition TLB */
> +	sth	r5,KVM_LAST_VCPU(r7)
> +	li	r6,128
> +	mtctr	r6
> +	li	r7,0x800		/* IS field = 0b10 */
> +	ptesync
> +2:	tlbiel	r7
> +	addi	r7,r7,0x1000
> +	bdnz	2b
> +	ptesync
> +1:
> +
> +	/* Save purr/spurr */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	std	r5,PACA_HOST_PURR(r13)
> +	std	r6,PACA_HOST_SPURR(r13)
> +	ld	r7,VCPU_PURR(r4)
> +	ld	r8,VCPU_SPURR(r4)
> +	mtspr	SPRN_PURR,r7
> +	mtspr	SPRN_SPURR,r8
> +
> +	/* Load up guest SLB entries */
> +	lwz	r5,VCPU_SLB_MAX(r4)
> +	cmpwi	r5,0
> +	beq	9f
> +	mtctr	r5
> +	addi	r6,r4,VCPU_SLB
> +1:	ld	r8,VCPU_SLB_E(r6)
> +	ld	r9,VCPU_SLB_V(r6)
> +	slbmte	r9,r8
> +	addi	r6,r6,VCPU_SLB_SIZE
> +	bdnz	1b
> +9:
> +
> +	/* Restore state of CTRL run bit; assume 1 on entry */
> +	lwz	r5,VCPU_CTRL(r4)
> +	andi.	r5,r5,1
> +	bne	4f
> +	mfspr	r6,SPRN_CTRLF
> +	clrrdi	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	ld	r6, VCPU_CTR(r4)
> +	lwz	r7, VCPU_XER(r4)
> +
> +	mtctr	r6
> +	mtxer	r7
> +
> +	/* Move SRR0 and SRR1 into the respective regs */
> +	ld	r6, VCPU_SRR0(r4)
> +	ld	r7, VCPU_SRR1(r4)
> +	mtspr	SPRN_SRR0, r6
> +	mtspr	SPRN_SRR1, r7
> +
> +	ld	r10, VCPU_PC(r4)
> +
> +	ld	r11, VCPU_MSR(r4)	/* r10 = vcpu->arch.msr & ~MSR_HV */
> +	rldicl	r11, r11, 63 - MSR_HV_LG, 1
> +	rotldi	r11, r11, 1 + MSR_HV_LG
> +	ori	r11, r11, MSR_ME
> +
> +fast_guest_return:
> +	mtspr	SPRN_HSRR0,r10
> +	mtspr	SPRN_HSRR1,r11
> +
> +	/* Activate guest mode, so faults get handled by KVM */
> +	li	r9, KVM_GUEST_MODE_GUEST
> +	stb	r9, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	/* Enter guest */
> +
> +	ld	r5, VCPU_LR(r4)
> +	lwz	r6, VCPU_CR(r4)
> +	mtlr	r5
> +	mtcr	r6
> +
> +	ld	r0, VCPU_GPR(r0)(r4)
> +	ld	r1, VCPU_GPR(r1)(r4)
> +	ld	r2, VCPU_GPR(r2)(r4)
> +	ld	r3, VCPU_GPR(r3)(r4)
> +	ld	r5, VCPU_GPR(r5)(r4)
> +	ld	r6, VCPU_GPR(r6)(r4)
> +	ld	r7, VCPU_GPR(r7)(r4)
> +	ld	r8, VCPU_GPR(r8)(r4)
> +	ld	r9, VCPU_GPR(r9)(r4)
> +	ld	r10, VCPU_GPR(r10)(r4)
> +	ld	r11, VCPU_GPR(r11)(r4)
> +	ld	r12, VCPU_GPR(r12)(r4)
> +	ld	r13, VCPU_GPR(r13)(r4)
> +
> +	ld	r4, VCPU_GPR(r4)(r4)
> +
> +	hrfid
> +	b	.
> +kvmppc_handler_trampoline_enter_end:
> +
> +
> +/******************************************************************************
> + *                                                                            *
> + *                               Exit code                                    *
> + *                                                                            *
> + *****************************************************************************/
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +	.globl	kvmppc_interrupt
> +kvmppc_interrupt:
> +	/*
> +	 * Register contents:
> +	 * R12		= interrupt vector
> +	 * R13		= PACA
> +	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
> +	 * guest R13 saved in SPRN_SCRATCH0
> +	 */
> +	/* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
> +	std	r9, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	ld	r9, PACA_KVM_VCPU(r13)
> +
> +	/* Save registers */
> +
> +	std	r0, VCPU_GPR(r0)(r9)
> +	std	r1, VCPU_GPR(r1)(r9)
> +	std	r2, VCPU_GPR(r2)(r9)
> +	std	r3, VCPU_GPR(r3)(r9)
> +	std	r4, VCPU_GPR(r4)(r9)
> +	std	r5, VCPU_GPR(r5)(r9)
> +	std	r6, VCPU_GPR(r6)(r9)
> +	std	r7, VCPU_GPR(r7)(r9)
> +	std	r8, VCPU_GPR(r8)(r9)
> +	ld	r0, (SHADOW_VCPU_OFF + SVCPU_HOST_R2)(r13)
> +	std	r0, VCPU_GPR(r9)(r9)
> +	std	r10, VCPU_GPR(r10)(r9)
> +	std	r11, VCPU_GPR(r11)(r9)
> +	ld	r3, (SHADOW_VCPU_OFF + SVCPU_SCRATCH0)(r13)
> +	lwz	r4, (SHADOW_VCPU_OFF + SVCPU_SCRATCH1)(r13)
> +	std	r3, VCPU_GPR(r12)(r9)
> +	stw	r4, VCPU_CR(r9)
> +
> +	/* Restore R1/R2 so we can handle faults */
> +	ld	r1, (SHADOW_VCPU_OFF + SVCPU_HOST_R1)(r13)
> +	ld	r2, PACATOC(r13)
> +
> +	mfspr	r10, SPRN_SRR0
> +	mfspr	r11, SPRN_SRR1
> +	std	r10, VCPU_SRR0(r9)
> +	std	r11, VCPU_SRR1(r9)
> +	andi.	r0, r12, 2		/* need to read HSRR0/1? */
> +	beq	1f
> +	mfspr	r10, SPRN_HSRR0
> +	mfspr	r11, SPRN_HSRR1
> +	clrrdi	r12, r12, 2
> +1:	std	r10, VCPU_PC(r9)
> +	std	r11, VCPU_MSR(r9)
> +
> +	GET_SCRATCH0(r3)
> +	mflr	r4
> +	std	r3, VCPU_GPR(r13)(r9)
> +	std	r4, VCPU_LR(r9)
> +
> +	/* Unset guest mode */
> +	li	r0, KVM_GUEST_MODE_NONE
> +	stb	r0, (SHADOW_VCPU_OFF + SVCPU_IN_GUEST)(r13)
> +
> +	stw	r12,VCPU_TRAP(r9)
> +
> +	/* See if this is a leftover HDEC interrupt */
> +	cmpwi	r12,0x980

define

> +	bne	2f
> +	mfspr	r3,SPRN_HDEC
> +	cmpwi	r3,0
> +	bge	ignore_hdec
> +2:
> +
> +	/* Check for mediated interrupts (could be done earlier really ...) */
> +	cmpwi	r12,0x500

define

> +	bne+	1f
> +	ld	r5,VCPU_LPCR(r9)
> +	andi.	r0,r11,MSR_EE
> +	beq	1f
> +	andi.	r0,r5,LPCR_MER
> +	bne	bounce_ext_interrupt

So there's no need for the external check that directly goes into the Linux handler code on full-fledged exits?

> +1:
> +
> +	/* Save DEC */
> +	mfspr	r5,SPRN_DEC
> +	mftb	r6
> +	extsw	r5,r5
> +	add	r5,r5,r6
> +	std	r5,VCPU_DEC_EXPIRES(r9)
> +
> +	/* Save HEIR (in last_inst) if this is a HEI (e40) */
> +	li	r3,-1
> +	cmpwi	r12,0xe40
> +	bne	11f
> +	mfspr	r3,SPRN_HEIR
> +11:	stw	r3,VCPU_LAST_INST(r9)
> +	
> +	/* Save more register state  */
> +	mfxer	r5
> +	mfdar	r6
> +	mfdsisr	r7
> +	mfctr	r8
> +
> +	stw	r5, VCPU_XER(r9)
> +	std	r6, VCPU_DAR(r9)
> +	stw	r7, VCPU_DSISR(r9)
> +	std	r8, VCPU_CTR(r9)
> +	cmpwi	r12,0xe00		/* grab HDAR & HDSISR if HDSI */
> +	beq	6f
> +7:	std	r6, VCPU_FAULT_DAR(r9)
> +	stw	r7, VCPU_FAULT_DSISR(r9)
> +
> +	/* Save guest CTRL register, set runlatch to 1 */
> +	mfspr	r6,SPRN_CTRLF
> +	stw	r6,VCPU_CTRL(r9)
> +	andi.	r0,r6,1
> +	bne	4f
> +	ori	r6,r6,1
> +	mtspr	SPRN_CTRLT,r6
> +4:
> +	/* Read the guest SLB and save it away */
> +	li	r6,0
> +	addi	r7,r9,VCPU_SLB
> +	li	r5,0
> +1:	slbmfee	r8,r6
> +	andis.	r0,r8,SLB_ESID_V@h
> +	beq	2f
> +	add	r8,r8,r6		/* put index in */
> +	slbmfev	r3,r6
> +	std	r8,VCPU_SLB_E(r7)
> +	std	r3,VCPU_SLB_V(r7)
> +	addi	r7,r7,VCPU_SLB_SIZE
> +	addi	r5,r5,1
> +2:	addi	r6,r6,1
> +	cmpwi	r6,32

I don't like how the 32 is hardcoded here. Better create a define for it and use the same in the init code.

> +	blt	1b
> +	stw	r5,VCPU_SLB_MAX(r9)
> +
> +	/*
> +	 * Save the guest PURR/SPURR
> +	 */
> +	mfspr	r5,SPRN_PURR
> +	mfspr	r6,SPRN_SPURR
> +	ld	r7,VCPU_PURR(r9)
> +	ld	r8,VCPU_SPURR(r9)
> +	std	r5,VCPU_PURR(r9)
> +	std	r6,VCPU_SPURR(r9)
> +	subf	r5,r7,r5
> +	subf	r6,r8,r6
> +
> +	/*
> +	 * Restore host PURR/SPURR and add guest times
> +	 * so that the time in the guest gets accounted.
> +	 */
> +	ld	r3,PACA_HOST_PURR(r13)
> +	ld	r4,PACA_HOST_SPURR(r13)
> +	add	r3,r3,r5
> +	add	r4,r4,r6
> +	mtspr	SPRN_PURR,r3
> +	mtspr	SPRN_SPURR,r4
> +
> +	/* Clear out SLB */
> +	li	r5,0
> +	slbmte	r5,r5
> +	slbia
> +	ptesync
> +
> +hdec_soon:
> +	/* Switch back to host partition */
> +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> +	ld	r6,KVM_HOST_SDR1(r4)
> +	lwz	r7,KVM_HOST_LPID(r4)
> +	li	r8,0x3ff		/* switch to reserved LPID */

is it reserved by ISA? Either way, hard-coding the constant without a name is not nice :).

> +	mtspr	SPRN_LPID,r8
> +	ptesync
> +	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> +	mtspr	SPRN_LPID,r7
> +	isync
> +	lis	r8,0x7fff

INT_MAX@h might be more readable.

> +	mtspr	SPRN_HDEC,r8
> +
> +	ld	r8,KVM_HOST_LPCR(r4)
> +	mtspr	SPRN_LPCR,r8
> +	isync
> +
> +	/* load host SLB entries */
> +	ld	r8,PACA_SLBSHADOWPTR(r13)
> +
> +	.rept	SLB_NUM_BOLTED
> +	ld	r5,SLBSHADOW_SAVEAREA(r8)
> +	ld	r6,SLBSHADOW_SAVEAREA+8(r8)
> +	andis.	r7,r5,SLB_ESID_V@h
> +	beq	1f
> +	slbmte	r6,r5
> +1:	addi	r8,r8,16
> +	.endr
> +
> +	/* Save and reset AMR and UAMOR before turning on the MMU */
> +	mfspr	r5,SPRN_AMR
> +	mfspr	r6,SPRN_UAMOR
> +	std	r5,VCPU_AMR(r9)
> +	std	r6,VCPU_UAMOR(r9)
> +	li	r6,0
> +	mtspr	SPRN_AMR,r6
> +
> +	/* Restore host DABR and DABRX */
> +	ld	r5,PACA_DABR(r13)
> +	li	r6,7
> +	mtspr	SPRN_DABR,r5
> +	mtspr	SPRN_DABRX,r6
> +
> +	/* Switch DSCR back to host value */
> +	mfspr	r8, SPRN_DSCR
> +	ld	r7, PACA_HOST_DSCR(r13)
> +	std	r8, VCPU_DSCR(r7)
> +	mtspr	SPRN_DSCR, r7
> +
> +	/* Save non-volatile GPRs */
> +	std	r14, VCPU_GPR(r14)(r9)
> +	std	r15, VCPU_GPR(r15)(r9)
> +	std	r16, VCPU_GPR(r16)(r9)
> +	std	r17, VCPU_GPR(r17)(r9)
> +	std	r18, VCPU_GPR(r18)(r9)
> +	std	r19, VCPU_GPR(r19)(r9)
> +	std	r20, VCPU_GPR(r20)(r9)
> +	std	r21, VCPU_GPR(r21)(r9)
> +	std	r22, VCPU_GPR(r22)(r9)
> +	std	r23, VCPU_GPR(r23)(r9)
> +	std	r24, VCPU_GPR(r24)(r9)
> +	std	r25, VCPU_GPR(r25)(r9)
> +	std	r26, VCPU_GPR(r26)(r9)
> +	std	r27, VCPU_GPR(r27)(r9)
> +	std	r28, VCPU_GPR(r28)(r9)
> +	std	r29, VCPU_GPR(r29)(r9)
> +	std	r30, VCPU_GPR(r30)(r9)
> +	std	r31, VCPU_GPR(r31)(r9)
> +
> +	/* Save SPRGs */
> +	mfspr	r3, SPRN_SPRG0
> +	mfspr	r4, SPRN_SPRG1
> +	mfspr	r5, SPRN_SPRG2
> +	mfspr	r6, SPRN_SPRG3
> +	std	r3, VCPU_SPRG0(r9)
> +	std	r4, VCPU_SPRG1(r9)
> +	std	r5, VCPU_SPRG2(r9)
> +	std	r6, VCPU_SPRG3(r9)
> +
> +	/* Save PMU registers */
> +	li	r3, 1
> +	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> +	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
> +	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> +	isync
> +	mfspr	r5, SPRN_MMCR1
> +	mfspr	r6, SPRN_MMCRA
> +	std	r4, VCPU_MMCR(r9)
> +	std	r5, VCPU_MMCR + 8(r9)
> +	std	r6, VCPU_MMCR + 16(r9)
> +	mfspr	r3, SPRN_PMC1
> +	mfspr	r4, SPRN_PMC2
> +	mfspr	r5, SPRN_PMC3
> +	mfspr	r6, SPRN_PMC4
> +	mfspr	r7, SPRN_PMC5
> +	mfspr	r8, SPRN_PMC6
> +	stw	r3, VCPU_PMC(r9)
> +	stw	r4, VCPU_PMC + 4(r9)
> +	stw	r5, VCPU_PMC + 8(r9)
> +	stw	r6, VCPU_PMC + 12(r9)
> +	stw	r7, VCPU_PMC + 16(r9)
> +	stw	r8, VCPU_PMC + 20(r9)
> +22:
> +	/* save FP state */
> +	mr	r3, r9
> +	bl	.kvmppc_save_fp
> +
> +	/* RFI into the highmem handler */
> +	mfmsr	r7
> +	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
> +	mtsrr1	r7
> +	/* Load highmem handler address */
> +	ld	r8, VCPU_HIGHMEM_HANDLER(r3)
> +	mtsrr0	r8
> +
> +	RFI
> +kvmppc_handler_trampoline_exit_end:
> +
> +6:	mfspr	r6,SPRN_HDAR
> +	mfspr	r7,SPRN_HDSISR
> +	b	7b
> +
> +ignore_hdec:
> +	mr	r4,r9
> +	b	fast_guest_return
> +
> +bounce_ext_interrupt:
> +	mr	r4,r9
> +	mtspr	SPRN_SRR0,r10
> +	mtspr	SPRN_SRR1,r11
> +	li	r10,0x500
> +	LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME);
> +	b	fast_guest_return
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 4b54148..e5e3f92 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,8 +38,12 @@
> 
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> -	return !(v->arch.shared->msr & MSR_WE) ||
> +#ifndef CONFIG_KVM_BOOK3S_64_HV
> +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> 	       !!(v->arch.pending_exceptions);
> +#else
> +	return 1;

So what happens if the guest sets MSR_WE? It just stays in guest context idling? That'd be pretty bad for scheduling on the host.

> +#endif
> }
> 
> int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> @@ -52,7 +56,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
> 	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
> 	unsigned long r2 = 0;
> 
> -	if (!(vcpu->arch.shared->msr & MSR_SF)) {
> +	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
> 		/* 32 bit mode */
> 		param1 &= 0xffffffff;
> 		param2 &= 0xffffffff;
> @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_PPC_IRQ_LEVEL:
> 	case KVM_CAP_ENABLE_CAP:
> 	case KVM_CAP_PPC_OSI:
> +#ifndef CONFIG_KVM_BOOK3S_64_HV

You also don't do OSI on HV :).

> 	case KVM_CAP_PPC_GET_PVINFO:
> 		r = 1;
> 		break;
> 	case KVM_CAP_COALESCED_MMIO:
> 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
> 		break;
> +#endif
> 	default:
> 		r = 0;
> 		break;
> @@ -286,6 +292,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
> 	tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
> 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
> +	vcpu->arch.dec_expires = ~(u64)0;
> 
> 	return 0;
> }
> @@ -474,6 +481,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
> 		for (i = 0; i < 32; i++)
> 			kvmppc_set_gpr(vcpu, i, gprs[i]);
> 		vcpu->arch.osi_needed = 0;
> +	} else if (vcpu->arch.hcall_needed) {
> +		int i;
> +
> +		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
> +		for (i = 0; i < 6; ++i)
> +			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
> +		vcpu->arch.hcall_needed = 0;
> 	}
> 
> 	kvmppc_core_deliver_interrupts(vcpu);
> @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> 	if (waitqueue_active(&vcpu->wq)) {
> 		wake_up_interruptible(&vcpu->wq);
> 		vcpu->stat.halt_wakeup++;
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	} else if (vcpu->cpu != -1) {
> +		smp_send_reschedule(vcpu->cpu);

Shouldn't this be done for non-HV too? The only reason we don't do it yet is because we don't do SMP, no?

> +#endif
> 	}
> -

eh... :)

> 	return 0;
> }
> 
> diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
> index d62a14b..e5c99b8 100644
> --- a/arch/powerpc/kvm/trace.h
> +++ b/arch/powerpc/kvm/trace.h
> @@ -103,7 +103,7 @@ TRACE_EVENT(kvm_gtlb_write,
>  *                         Book3S trace points                           *
>  *************************************************************************/
> 
> -#ifdef CONFIG_PPC_BOOK3S
> +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 
> TRACE_EVENT(kvm_book3s_exit,
> 	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..a4447ce 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_NMI              16
> #define KVM_EXIT_INTERNAL_ERROR   17
> #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_PAPR_HCALL	  19
> 
> /* For KVM_EXIT_INTERNAL_ERROR */
> #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,11 @@ struct kvm_run {
> 		struct {
> 			__u64 gprs[32];
> 		} osi;
> +		struct {
> +			__u64 nr;
> +			__u64 ret;
> +			__u64 args[9];
> +		} papr_hcall;

This needs some information in the documentation.


Alex

PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list, so people interested in kvm on ppc get your patches as well :).


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-12  9:07     ` Avi Kivity
@ 2011-05-16  1:07       ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-16  1:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: linuxppc-dev, kvm, Alexander Graf

On Thu, May 12, 2011 at 12:07:17PM +0300, Avi Kivity wrote:
> On 05/11/2011 01:44 PM, Paul Mackerras wrote:

> >--- a/include/linux/kvm.h
> >+++ b/include/linux/kvm.h
> >@@ -161,6 +161,7 @@ struct kvm_pit_config {
> >  #define KVM_EXIT_NMI              16
> >  #define KVM_EXIT_INTERNAL_ERROR   17
> >  #define KVM_EXIT_OSI              18
> >+#define KVM_EXIT_PAPR_HCALL	  19
> >
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  #define KVM_INTERNAL_ERROR_EMULATION 1
> >@@ -264,6 +265,11 @@ struct kvm_run {
> >  		struct {
> >  			__u64 gprs[32];
> >  		} osi;
> >+		struct {
> >+			__u64 nr;
> >+			__u64 ret;
> >+			__u64 args[9];
> >+		} papr_hcall;
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	};
> 
> Please document this in Documentation/kvm/api.txt.

I'll add a description of the basic calling convention in the next
version of the patches.  The full description of all the possible
hypercalls is in the PAPR version 2.4 document (826 pages) on the
www.power.org website.  You have to become a power.org member to
download it, but membership is free for individual developers.
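
For api.txt, the userspace side of the convention will look roughly
like this (sketch only -- handle_hcall() here is just a stand-in for
whatever qemu ends up doing with the call; the number is in r3 and the
arguments in r4 and up on the guest side):

	case KVM_EXIT_PAPR_HCALL:
		/* hcall number and arguments as supplied by the guest */
		run->papr_hcall.ret = handle_hcall(run->papr_hcall.nr,
						   run->papr_hcall.args);
		/* on KVM_RUN re-entry the kernel puts ret in r3 and
		 * args[0..5] back in r4-r9, as in the powerpc.c hunk above */
		break;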

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-15 21:58     ` Alexander Graf
  (?)
@ 2011-05-16  5:58       ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-16  5:58 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Linuxppc-dev, KVM list, kvm-ppc

On Sun, May 15, 2011 at 11:58:12PM +0200, Alexander Graf wrote:
> 
> On 11.05.2011, at 12:44, Paul Mackerras wrote:

> > +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 
> I really liked how you called the .c file _pr - why call it NONHV now?

I agree, CONFIG_KVM_BOOK3S_PR would be better, I'll change it.

> > diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> > index 7412676..8dba5f6 100644
> > --- a/arch/powerpc/include/asm/paca.h
> > +++ b/arch/powerpc/include/asm/paca.h
> > @@ -149,6 +149,16 @@ struct paca_struct {
> > #ifdef CONFIG_KVM_BOOK3S_HANDLER
> > 	/* We use this to store guest state in */
> > 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	struct kvm_vcpu *kvm_vcpu;
> > +	u64 dabr;
> > +	u64 host_mmcr[3];
> > +	u32 host_pmc[6];
> > +	u64 host_purr;
> > +	u64 host_spurr;
> > +	u64 host_dscr;
> > +	u64 dec_expires;
> 
> Hrm. I'd say either push those into shadow_vcpu for HV mode or get
> rid of the shadow_vcpu reference. I'd probably prefer the former.

These are fields that are pieces of host state that we need to save
and restore across execution of a guest; they don't apply to any
specific guest or vcpu.  That's why I didn't put them in shadow_vcpu,
which is specifically for one vcpu in one guest.  

Given that book3s_pr copies the shadow_vcpu into and out of the paca,
I thought it best not to add fields to it that are only live while we
are in the guest.  True, these fields only exist for book3s_hv, but if
we later on make it possible to select book3s_pr vs. book3s_hv at
runtime, we won't want to be copying these fields into and out of the
paca when book3s_pr is active.

Maybe we need another struct, kvm_host_state or something like that,
to save this sort of state.
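
Roughly this, i.e. just the fields from the paca hunk above pulled out
into one place (name as per the suggestion above, nothing final):

	struct kvm_host_state {
		u64	dabr;
		u64	host_mmcr[3];
		u32	host_pmc[6];
		u64	host_purr;
		u64	host_spurr;
		u64	host_dscr;
		u64	dec_expires;
	};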

> > @@ -65,6 +98,7 @@ config KVM_440
> > 	bool "KVM support for PowerPC 440 processors"
> > 	depends on EXPERIMENTAL && 44x
> > 	select KVM
> > +	select KVM_MMIO
> 
> e500 should also select MMIO, no?

Good point, I'll fix that.

> > +long kvmppc_alloc_hpt(struct kvm *kvm)
> > +{
> > +	unsigned long hpt;
> > +	unsigned long lpid;
> > +
> > +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> > +			       HPT_ORDER - PAGE_SHIFT);
> 
> This would end up failing quite often, no?

In practice it seems to be OK, possibly because the machines we're
testing this on have plenty of memory.  Maybe we should get qemu to
allocate the HPT using hugetlbfs so the memory will come from the
reserved page pool.  It does need to be physically contiguous and
aligned on a multiple of its size -- that's a hardware requirement.

> > +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
> > +	kvm->arch.lpid = lpid;
> > +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
> > +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
> > +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
> 
> How do these correlate with the guest's hv mmu? I'd like to keep the
> code clean enough so we can potentially use it for PR mode as well :). 

The host SDR1 and LPID are different from the guest's.  That is, the
guest has its own HPT which is quite separate from the host's.  The
host values could be saved in global variables, though; there's no
real need for each VM to have its own copy, except that doing it this
way simplifies the low-level assembly code a little.

> > +	/* First see what page size we have */
> > +	psize = user_page_size(mem->userspace_addr);
> > +	/* For now, only allow 16MB pages */
> 
> The reason to go for 16MB pages is because of the host mmu code, not
> the guest hv mmu. So please at least #ifdef the code to HV so we
> document that correlation. 

I'm not sure what you mean by that.  The reason for going for 16MB
pages initially is for performance (this way the guest can use 16MB
pages for its linear mapping) and to avoid having to deal with the
pages being paged or swapped on the host side.  Could you explain the
"because of the host mmu code" part of your comment further?

What would adding #ifdef CONFIG_KVM_BOOK3S_64_HV achieve in a file
whose name ends in _hv.c and which only gets compiled when
CONFIG_KVM_BOOK3S_64_HV=y?

> > +int kvmppc_mmu_hv_init(void)
> > +{
> > +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> > +		return 0;
> 
> Return 0 for "it doesn't work" might not be the right exit code ;).

Good point, I'll make it -ENXIO or something.

> > +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> > +				struct kvmppc_pte *gpte, bool data)
> > +{
> > +	return -ENOENT;
> 
> Can't you just call the normal book3s_64 mmu code here? Without
> xlate, doing ppc_ld doesn't work, which means that reading out the
> faulting guest instruction breaks. We'd also need it for PR mode :). 

With book3s_hv we currently have no situations where we need to read
out the faulting guest instruction.

We could use the normal code here if we had the guest HPT mapped into
the qemu address space, which it currently isn't -- but should be.  It
hasn't been a priority to fix precisely because we never need to read
out the faulting instruction at the moment.

> > +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> > +{
> > +	vcpu->arch.pvr = pvr;
> > +	kvmppc_mmu_book3s_hv_init(vcpu);
> 
> No need for you to do it depending on pvr. You can just do the mmu
> initialization on normal init :).

OK, I'll do that.

> > +	case BOOK3S_INTERRUPT_PROGRAM:
> > +	{
> > +		ulong flags;
> > +		/*
> > +		 * Normally program interrupts are delivered directly
> > +		 * to the guest by the hardware, but we can get here
> > +		 * as a result of a hypervisor emulation interrupt
> > +		 * (e40) getting turned into a 700 by BML RTAS.
> 
> Not sure I fully understand what's going on there. Mind to explain? :)

Recent versions of the architecture don't actually deliver a 0x700
interrupt to the OS on an illegal instruction; instead they generate
an 0xe40 interrupt to the hypervisor in case the hypervisor wants to
emulate the instruction.  If the hypervisor doesn't want to do that
it's supposed to synthesize a 0x700 interrupt to the OS.

When we're running this stuff under our BML (Bare Metal Linux)
framework in the lab, we use a small RTAS implementation that gets
loaded by the BML loader, and one of the things that this RTAS does is
to patch the 0xe40 vector to make the 0xe40 interrupt come in via the
0x700 vector, in case the kernel you're running under BML hasn't been
updated to have an 0xe40 handler.

I could just remove that case, in fact.

> > +	case BOOK3S_INTERRUPT_SYSCALL:
> > +	{
> > +		/* hcall - punt to userspace */
> > +		int i;
> > +
> 
> Do we really want to accept sc from pr=1? I'd just reject them straight away.

Good idea, I'll do that.

> > +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> > +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> > +		vcpu->arch.dear = vcpu->arch.fault_dar;
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> > +		r = RESUME_GUEST;
> > +		break;
> > +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> > +					0x08000000);
> 
> What do these do? I thought the guest gets DSI and ISI directly?

It does for accesses with relocation (IR/DR) on, but because we have
enabled VRMA mode (Virtualized Real Mode Area) in the LPCR, we get
these interrupts to the hypervisor if the guest does a bad real-mode
access.  If we also turned on Virtualized Partition Memory (VPM) mode
in the LPCR, then all ISI/DSI in the guest come through to the
hypervisor using these vectors in case the hypervisor wants to do any
paging/swapping of guest memory.  I plan to do that later when we
support using 4k/64k pages for guest memory.

> > +	/* default to book3s_64 (power7) */
> > +	vcpu->arch.pvr = 0x3f0200;
> 
> Wouldn't it make sense to simply default to the host pvr? Not sure -
> maybe it's not :).

Sounds sensible, actually.  In fact the hypervisor can't spoof the PVR
for the guest, that is, the guest can read the real PVR value and
there's no way the hypervisor can stop it.
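
(Concretely, something like this in the vcpu creation path instead of
the hard-coded value:

	vcpu->arch.pvr = mfspr(SPRN_PVR);	/* default to the host PVR */

with userspace still able to override it later via kvmppc_set_pvr().)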

> > +	flush_fp_to_thread(current);
> 
> Do those work fine with preemption enabled?

Yes.  If a preemption happens, it will flush the FP/VMX/VSX registers
out to the thread_struct, and then any of these explicit calls that
happen after the preemption will do nothing.
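
For reference, the helper is essentially (simplified; the real one in
arch/powerpc/kernel/process.c also has an SMP sanity check):

	void flush_fp_to_thread(struct task_struct *tsk)
	{
		if (tsk->thread.regs) {
			preempt_disable();
			if (tsk->thread.regs->msr & MSR_FP)
				giveup_fpu(tsk);   /* dumps FP state to the thread_struct */
			preempt_enable();
		}
	}

so a second call after an intervening preemption just sees MSR_FP
clear and does nothing.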

> > +	/*
> > +	 * Put whatever is in the decrementer into the
> > +	 * hypervisor decrementer.
> > +	 */
> > +	mfspr	r8,SPRN_DEC
> > +	mftb	r7
> > +	mtspr	SPRN_HDEC,r8
> 
> Can't we just always use HDEC on the host? That'd save us from all
> the swapping here.

The problem is that there is only one HDEC per core, so using it would
become complicated when the host is running in SMT4 or SMT2 mode.

> > +	extsw	r8,r8
> > +	add	r8,r8,r7
> > +	std	r8,PACA_KVM_DECEXP(r13)
> 
> Why is dec+hdec on vcpu_run decexp? What exactly does this store?

R7 here is the current (or very recent, anyway) timebase value, so
this stores the timebase value at which the host decrementer would get
to zero.
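
In C terms that little sequence is just:

	/* timebase tick at which the host DEC will hit zero;
	 * this is what lands in PACA_KVM_DECEXP */
	dec_expires = get_tb() + (s64)(s32)mfspr(SPRN_DEC);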

> > +	lwz	r3, PACA_HOST_PMC(r13)
> > +	lwz	r4, PACA_HOST_PMC + 4(r13)
> > +	lwz	r5, PACA_HOST_PMC + 4(r13)
> 
> copy&paste error?

Yes, thanks.

> > +.global kvmppc_handler_lowmem_trampoline
> > +kvmppc_handler_lowmem_trampoline:
> > +	cmpwi	r12,0x500
> > +	beq	1f
> > +	cmpwi	r12,0x980
> > +	beq	1f
> 
> What?
> 
> 1) use the macros please

OK

> 2) why?

The external interrupt and hypervisor decrementer interrupt handlers
expect the interrupted PC and MSR to be in HSRR0/1 rather than
SRR0/1.  I could remove the case for 0x980 since we don't call the
linux handler for HDEC interrupts any more.

> > +	/* Check if HDEC expires soon */
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,10
> > +	li	r12,0x980
> 
> define

OK.

> > +	mr	r9,r4
> > +	blt	hdec_soon
> 
> Is it faster to do the check than to save the check and take the
> odds? Also, maybe we should rather do the check in preemptible
> code that could just directly pass the time slice on.

I do the check there because I was having problems where, if the HDEC
goes negative before we do the partition switch, we would occasionally
not get the HDEC interrupt at all until the next time HDEC went
negative, ~ 8.4 seconds later.

> > +	/* See if this is a leftover HDEC interrupt */
> > +	cmpwi	r12,0x980
> 
> define

OK.

> > +	bne	2f
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,0
> > +	bge	ignore_hdec
> > +2:
> > +
> > +	/* Check for mediated interrupts (could be done earlier really ...) */
> > +	cmpwi	r12,0x500
> 
> define

OK.

> > +	bne+	1f
> > +	ld	r5,VCPU_LPCR(r9)
> > +	andi.	r0,r11,MSR_EE
> > +	beq	1f
> > +	andi.	r0,r5,LPCR_MER
> > +	bne	bounce_ext_interrupt
> 
> So there's no need for the external check that directly goes into
> the Linux handler code on full-fledged exits?

No, we still need that; ordinary external interrupts come to the
hypervisor regardless of the guest's MSR.EE setting.

The MER bit (Mediated External Request) is a way for the hypervisor to
know when the guest sets its MSR.EE bit.  If an event happens that
means the host wants to give a guest vcpu an external interrupt, but
the guest vcpu has MSR.EE = 0, then the host can't deliver the
simulated external interrupt.  Instead it sets LPCR.MER, which has the
effect that the hardware will deliver an external interrupt (to the
hypervisor) when running in the guest and it has MSR.EE = 1.

So, when we get an external interrupt, LPCR.MER = 1 and MSR.EE = 1,
we need to synthesize an external interrupt in the guest.  Doing it
here means that we don't need to do the full partition switch out to
the host and back again.
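
In C the test in that exit path boils down to (guest_msr here stands
for the guest MSR sitting in r11 at that point, and bounce_ext_interrupt
is the asm label from the patch):

	if (vcpu->arch.trap == BOOK3S_INTERRUPT_EXTERNAL &&
	    (guest_msr & MSR_EE) && (vcpu->arch.lpcr & LPCR_MER))
		bounce_ext_interrupt(vcpu);	/* reflect a 0x500 straight into the guest */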

> > +	/* Read the guest SLB and save it away */
> > +	li	r6,0
> > +	addi	r7,r9,VCPU_SLB
> > +	li	r5,0
> > +1:	slbmfee	r8,r6
> > +	andis.	r0,r8,SLB_ESID_V@h
> > +	beq	2f
> > +	add	r8,r8,r6		/* put index in */
> > +	slbmfev	r3,r6
> > +	std	r8,VCPU_SLB_E(r7)
> > +	std	r3,VCPU_SLB_V(r7)
> > +	addi	r7,r7,VCPU_SLB_SIZE
> > +	addi	r5,r5,1
> > +2:	addi	r6,r6,1
> > +	cmpwi	r6,32
> 
> I don't like how the 32 is hardcoded here. Better create a define
> for it and use the same in the init code.

Sure.  In fact I probably should use vcpu->arch.slb_nr here.

> > +hdec_soon:
> > +	/* Switch back to host partition */
> > +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> > +	ld	r6,KVM_HOST_SDR1(r4)
> > +	lwz	r7,KVM_HOST_LPID(r4)
> > +	li	r8,0x3ff		/* switch to reserved LPID */
> 
> is it reserved by ISA? Either way, hard-coding the constant without
> a name is not nice :).

Actually, it just has to be an LPID value that isn't ever used for
running a real guest (or the host).  I'll make a name for it.

> > +	lis	r8,0x7fff
> 
> INT_MAX@h might be more readable.

OK.

> > int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> > {
> > -	return !(v->arch.shared->msr & MSR_WE) ||
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> > +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> > 	       !!(v->arch.pending_exceptions);
> > +#else
> > +	return 1;
> 
> So what happens if the guest sets MSR_WE? It just stays in guest
> context idling? That'd be pretty bad for scheduling on the host.

The MSR_WE bit doesn't exist on POWER7 (or any of POWER[4567], in
fact).  If the guest wants to idle it calls the H_CEDE hcall.

> > @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 	case KVM_CAP_PPC_IRQ_LEVEL:
> > 	case KVM_CAP_ENABLE_CAP:
> > 	case KVM_CAP_PPC_OSI:
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> 
> You also don't do OSI on HV :).

Good point, I'll fix that.

> > @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> > 	if (waitqueue_active(&vcpu->wq)) {
> > 		wake_up_interruptible(&vcpu->wq);
> > 		vcpu->stat.halt_wakeup++;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	} else if (vcpu->cpu != -1) {
> > +		smp_send_reschedule(vcpu->cpu);
> 
> Shouldn't this be done for non-HV too? The only reason we don't do
> it yet is because we don't do SMP, no?

I didn't know why you didn't do it for non-HV, so I didn't change it
for that case.  If you say it's OK, I'll change it (we'll need to set
vcpu->cpu in the vcpu_load/unload code for book3s_pr too, then).
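
That is, roughly this in the book3s_pr load/put paths (sketch only, on
top of whatever they already do):

	void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	{
		/* ... existing load code ... */
		vcpu->cpu = cpu;
	}

	void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
	{
		/* ... existing put code ... */
		vcpu->cpu = -1;
	}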

> > +#endif
> > 	}
> > -
> 
> eh... :)

OK. :)

> > +		struct {
> > +			__u64 nr;
> > +			__u64 ret;
> > +			__u64 args[9];
> > +		} papr_hcall;
> 
> This needs some information in the documentation.

Yes, Avi commented on that too.

> PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list,
> so people interested in kvm on ppc get your patches as well :).

Sure, will do.

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
@ 2011-05-16  5:58       ` Paul Mackerras
  0 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-16  5:58 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Linuxppc-dev, kvm-ppc, KVM list

On Sun, May 15, 2011 at 11:58:12PM +0200, Alexander Graf wrote:
> 
> On 11.05.2011, at 12:44, Paul Mackerras wrote:

> > +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 
> I really liked how you called the .c file _pr - why call it NONHV now?

I agree, CONFIG_KVM_BOOK3S_PR would be better, I'll change it.

> > diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> > index 7412676..8dba5f6 100644
> > --- a/arch/powerpc/include/asm/paca.h
> > +++ b/arch/powerpc/include/asm/paca.h
> > @@ -149,6 +149,16 @@ struct paca_struct {
> > #ifdef CONFIG_KVM_BOOK3S_HANDLER
> > 	/* We use this to store guest state in */
> > 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	struct kvm_vcpu *kvm_vcpu;
> > +	u64 dabr;
> > +	u64 host_mmcr[3];
> > +	u32 host_pmc[6];
> > +	u64 host_purr;
> > +	u64 host_spurr;
> > +	u64 host_dscr;
> > +	u64 dec_expires;
> 
> Hrm. I'd say either push those into shadow_vcpu for HV mode or get
> rid of the shadow_vcpu reference. I'd probably prefer the former.

These are fields that are pieces of host state that we need to save
and restore across execution of a guest; they don't apply to any
specific guest or vcpu.  That's why I didn't put them in shadow_vcpu,
which is specifically for one vcpu in one guest.  

Given that book3s_pr copies the shadow_vcpu into and out of the paca,
I thought it best not to add fields to it that are only live while we
are in the guest.  True, these fields only exist for book3s_hv, but if
we later on make it possible to select book3s_pr vs. book3s_hv at
runtime, we won't want to be copying these fields into and out of the
paca when book3s_pr is active.

Maybe we need another struct, kvm_host_state or something like that,
to save this sort of state.

> > @@ -65,6 +98,7 @@ config KVM_440
> > 	bool "KVM support for PowerPC 440 processors"
> > 	depends on EXPERIMENTAL && 44x
> > 	select KVM
> > +	select KVM_MMIO
> 
> e500 should also select MMIO, no?

Good point, I'll fix that.

> > +long kvmppc_alloc_hpt(struct kvm *kvm)
> > +{
> > +	unsigned long hpt;
> > +	unsigned long lpid;
> > +
> > +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> > +			       HPT_ORDER - PAGE_SHIFT);
> 
> This would end up failing quite often, no?

In practice it seems to be OK, possibly because the machines we're
testing this on have plenty of memory.  Maybe we should get qemu to
allocate the HPT using hugetlbfs so the memory will come from the
reserved page pool.  It does need to be physically contiguous and
aligned on a multiple of its size -- that's a hardware requirement.

> > +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
> > +	kvm->arch.lpid = lpid;
> > +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
> > +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
> > +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
> 
> How do these correlate with the guest's hv mmu? I'd like to keep the
> code clean enough so we can potentially use it for PR mode as well :). 

The host SDR1 and LPID are different from the guest's.  That is, the
guest has its own HPT which is quite separate from the host's.  The
host values could be saved in global variables, though; there's no
real need for each VM to have its own copy, except that doing it this
way simplifies the low-level assembly code a little.

> > +	/* First see what page size we have */
> > +	psize = user_page_size(mem->userspace_addr);
> > +	/* For now, only allow 16MB pages */
> 
> The reason to go for 16MB pages is because of the host mmu code, not
> the guest hv mmu. So please at least #ifdef the code to HV so we
> document that correlation. 

I'm not sure what you mean by that.  The reason for going for 16MB
pages initially is for performance (this way the guest can use 16MB
pages for its linear mapping) and to avoid having to deal with the
pages being paged or swapped on the host side.  Could you explain the
"because of the host mmu code" part of your comment further?

What would adding #ifdef CONFIG_KVM_BOOK3S_64_HV achieve in a file
whose name ends in _hv.c and which only gets compiled when
CONFIG_KVM_BOOK3S_64_HV=y?

> > +int kvmppc_mmu_hv_init(void)
> > +{
> > +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> > +		return 0;
> 
> Return 0 for "it doesn't work" might not be the right exit code ;).

Good point, I'll make it -ENXIO or something.

> > +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> > +				struct kvmppc_pte *gpte, bool data)
> > +{
> > +	return -ENOENT;
> 
> Can't you just call the normal book3s_64 mmu code here? Without
> xlate, doing ppc_ld doesn't work, which means that reading out the
> faulting guest instruction breaks. We'd also need it for PR mode :). 

With book3s_hv we currently have no situations where we need to read
out the faulting guest instruction.  

We could use the normal code here if we had the guest HPT mapped into
the qemu address space, which it currently isn't -- but should be.  It
hasn't been a priority to fix because with book3s_hv we currently have
no situations where we need to read out the faulting guest
instruction.

> > +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> > +{
> > +	vcpu->arch.pvr = pvr;
> > +	kvmppc_mmu_book3s_hv_init(vcpu);
> 
> No need for you to do it depending on pvr. You can just do the mmu
> initialization on normal init :).

OK, I'll do that.

> > +	case BOOK3S_INTERRUPT_PROGRAM:
> > +	{
> > +		ulong flags;
> > +		/*
> > +		 * Normally program interrupts are delivered directly
> > +		 * to the guest by the hardware, but we can get here
> > +		 * as a result of a hypervisor emulation interrupt
> > +		 * (e40) getting turned into a 700 by BML RTAS.
> 
> Not sure I fully understand what's going on there. Mind to explain? :)

Recent versions of the architecture don't actually deliver a 0x700
interrupt to the OS on an illegal instruction; instead they generate
an 0xe40 interrupt to the hypervisor in case the hypervisor wants to
emulate the instruction.  If the hypervisor doesn't want to do that
it's supposed to synthesize a 0x700 interrupt to the OS.

When we're running this stuff under our BML (Bare Metal Linux)
framework in the lab, we use a small RTAS implementation that gets
loaded by the BML loader, and one of the things that this RTAS does is
to patch the 0xe40 vector to make the 0xe40 interrupt come in via the
0x700 vector, in case the kernel you're running under BML hasn't been
updated to have an 0xe40 handler.

I could just remove that case, in fact.

> > +	case BOOK3S_INTERRUPT_SYSCALL:
> > +	{
> > +		/* hcall - punt to userspace */
> > +		int i;
> > +
> 
> Do we really want to accept sc from pr=1? I'd just reject them straight away.

Good idea, I'll do that.

> > +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> > +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> > +		vcpu->arch.dear = vcpu->arch.fault_dar;
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> > +		r = RESUME_GUEST;
> > +		break;
> > +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> > +					0x08000000);
> 
> What do these do? I thought the guest gets DSI and ISI directly?

It does for accesses with relocation (IR/DR) on, but because we have
enabled VRMA mode (Virtualized Real Mode Area) in the LPCR, we get
these interrupts to the hypervisor if the guest does a bad real-mode
access.  If we also turned on Virtualized Partition Memory (VPM) mode
in the LPCR, then all ISI/DSI in the guest come through to the
hypervisor using these vectors in case the hypervisor wants to do any
paging/swapping of guest memory.  I plan to do that later when we
support using 4k/64k pages for guest memory.

> > +	/* default to book3s_64 (power7) */
> > +	vcpu->arch.pvr = 0x3f0200;
> 
> Wouldn't it make sense to simply default to the host pvr? Not sure -
> maybe it's not :).

Sounds sensible, actually.  In fact the hypervisor can't spoof the PVR
for the guest, that is, the guest can read the real PVR value and
there's no way the hypervisor can stop it.

> > +	flush_fp_to_thread(current);
> 
> Do those with fine with preemption enabled?

Yes.  If a preemption happens, it will flush the FP/VMX/VSX registers
out to the thread_struct, and then any of these explicit calls that
happen after the preemption will do nothing.

> > +	/*
> > +	 * Put whatever is in the decrementer into the
> > +	 * hypervisor decrementer.
> > +	 */
> > +	mfspr	r8,SPRN_DEC
> > +	mftb	r7
> > +	mtspr	SPRN_HDEC,r8
> 
> Can't we just always use HDEC on the host? That's save us from all
> the swapping here.

The problem is that there is only one HDEC per core, so using it would
become complicated when the host is running in SMT4 or SMT2 mode.

> > +	extsw	r8,r8
> > +	add	r8,r8,r7
> > +	std	r8,PACA_KVM_DECEXP(r13)
> 
> Why is dec+hdec on vcpu_run decexp? What exactly does this store?

R7 here is the current (or very recent, anyway) timebase value, so
this stores the timebase value at which the host decrementer would get
to zero.

> > +	lwz	r3, PACA_HOST_PMC(r13)
> > +	lwz	r4, PACA_HOST_PMC + 4(r13)
> > +	lwz	r5, PACA_HOST_PMC + 4(r13)
> 
> copy&paste error?

Yes, thanks.

> > +.global kvmppc_handler_lowmem_trampoline
> > +kvmppc_handler_lowmem_trampoline:
> > +	cmpwi	r12,0x500
> > +	beq	1f
> > +	cmpwi	r12,0x980
> > +	beq	1f
> 
> What?
> 
> 1) use the macros please

OK

> 2) why?

The external interrupt and hypervisor decrementer interrupt handlers
expect the interrupted PC and MSR to be in HSRR0/1 rather than
SRR0/1.  I could remove the case for 0x980 since we don't call the
linux handler for HDEC interrupts any more.

> > +	/* Check if HDEC expires soon */
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,10
> > +	li	r12,0x980
> 
> define

OK.

> > +	mr	r9,r4
> > +	blt	hdec_soon
> 
> Is it faster to do the check than to save the check and take the
> odds? Also, maybe we should rather do the check in preemptible
> code that could just directly pass the time slice on.

I do the check there because I was having problems where, if the HDEC
goes negative before we do the partition switch, we would occasionally
not get the HDEC interrupt at all until the next time HDEC went
negative, ~ 8.4 seconds later.

> > +	/* See if this is a leftover HDEC interrupt */
> > +	cmpwi	r12,0x980
> 
> define

OK.

> > +	bne	2f
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,0
> > +	bge	ignore_hdec
> > +2:
> > +
> > +	/* Check for mediated interrupts (could be done earlier really ...) */
> > +	cmpwi	r12,0x500
> 
> define

OK.

> > +	bne+	1f
> > +	ld	r5,VCPU_LPCR(r9)
> > +	andi.	r0,r11,MSR_EE
> > +	beq	1f
> > +	andi.	r0,r5,LPCR_MER
> > +	bne	bounce_ext_interrupt
> 
> So there's no need for the external check that directly goes into
> the Linux handler code on full-fledged exits?

No, we still need that; ordinary external interrupts come to the
hypervisor regardless of the guest's MSR.EE setting.

The MER bit (Mediated External Request) is a way for the hypervisor to
know when the guest sets its MSR.EE bit.  If an event happens that
means the host wants to give a guest vcpu an external interrupt, but
the guest vcpu has MSR.EE = 0, then the host can't deliver the
simulated external interrupt.  Instead it sets LPCR.MER, which has the
effect that the hardware will deliver an external interrupt (to the
hypervisor) when running in the guest and it has MSR.EE = 1.

So, when we get an external interrupt, LPCR.MER = 1 and MSR.EE = 1,
we need to synthesize an external interrupt in the guest.  Doing it
here means that we don't need to do the full partition switch out to
the host and back again.

> > +	/* Read the guest SLB and save it away */
> > +	li	r6,0
> > +	addi	r7,r9,VCPU_SLB
> > +	li	r5,0
> > +1:	slbmfee	r8,r6
> > +	andis.	r0,r8,SLB_ESID_V@h
> > +	beq	2f
> > +	add	r8,r8,r6		/* put index in */
> > +	slbmfev	r3,r6
> > +	std	r8,VCPU_SLB_E(r7)
> > +	std	r3,VCPU_SLB_V(r7)
> > +	addi	r7,r7,VCPU_SLB_SIZE
> > +	addi	r5,r5,1
> > +2:	addi	r6,r6,1
> > +	cmpwi	r6,32
> 
> I don't like how the 32 is hardcoded here. Better create a define
> for it and use the same in the init code.

Sure.  In fact I probably should use vcpu->arch.slb_nr here.

> > +hdec_soon:
> > +	/* Switch back to host partition */
> > +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> > +	ld	r6,KVM_HOST_SDR1(r4)
> > +	lwz	r7,KVM_HOST_LPID(r4)
> > +	li	r8,0x3ff		/* switch to reserved LPID */
> 
> is it reserved by ISA? Either way, hard-coding the constant without
> a name is not nice :).

Actually, it just has to be an LPID value that isn't ever used for
running a real guest (or the host).  I'll make a name for it.

> > +	lis	r8,0x7fff
> 
> INT_MAX@h might be more readable.

OK.

> > int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> > {
> > -	return !(v->arch.shared->msr & MSR_WE) ||
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> > +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> > 	       !!(v->arch.pending_exceptions);
> > +#else
> > +	return 1;
> 
> So what happens if the guest sets MSR_WE? It just stays in guest
> context idling? That'd be pretty bad for scheduling on the host.

The MSR_WE bit doesn't exist on POWER7 (or any of POWER[4567], in
fact).  If the guest wants to idle it calls the H_CEDE hcall.

> > @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 	case KVM_CAP_PPC_IRQ_LEVEL:
> > 	case KVM_CAP_ENABLE_CAP:
> > 	case KVM_CAP_PPC_OSI:
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> 
> You also don't do OSI on HV :).

Good point, I'll fix that.

> > @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> > 	if (waitqueue_active(&vcpu->wq)) {
> > 		wake_up_interruptible(&vcpu->wq);
> > 		vcpu->stat.halt_wakeup++;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	} else if (vcpu->cpu != -1) {
> > +		smp_send_reschedule(vcpu->cpu);
> 
> Shouldn't this be done for non-HV too? The only reason we don't do
> it yet is because we don't do SMP, no?

I didn't know why you didn't do it for non-HV, so I didn't change it
for that case.  If you say it's OK, I'll change it (we'll need to set
vcpu->cpu in the vcpu_load/unload code for book3s_pr too, then).

> > +#endif
> > 	}
> > -
> 
> eh... :)

OK. :)

> > +		struct {
> > +			__u64 nr;
> > +			__u64 ret;
> > +			__u64 args[9];
> > +		} papr_hcall;
> 
> This needs some information in the documentation.

Yes, Avi commented on that too.

> PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list,
> so people interested in kvm on ppc get your patches as well :).

Sure, will do.

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in
@ 2011-05-16  5:58       ` Paul Mackerras
  0 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-16  5:58 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Linuxppc-dev, KVM list, kvm-ppc

On Sun, May 15, 2011 at 11:58:12PM +0200, Alexander Graf wrote:
> 
> On 11.05.2011, at 12:44, Paul Mackerras wrote:

> > +#ifdef CONFIG_KVM_BOOK3S_NONHV
> 
> I really liked how you called the .c file _pr - why call it NONHV now?

I agree, CONFIG_KVM_BOOK3S_PR would be better, I'll change it.

> > diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> > index 7412676..8dba5f6 100644
> > --- a/arch/powerpc/include/asm/paca.h
> > +++ b/arch/powerpc/include/asm/paca.h
> > @@ -149,6 +149,16 @@ struct paca_struct {
> > #ifdef CONFIG_KVM_BOOK3S_HANDLER
> > 	/* We use this to store guest state in */
> > 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	struct kvm_vcpu *kvm_vcpu;
> > +	u64 dabr;
> > +	u64 host_mmcr[3];
> > +	u32 host_pmc[6];
> > +	u64 host_purr;
> > +	u64 host_spurr;
> > +	u64 host_dscr;
> > +	u64 dec_expires;
> 
> Hrm. I'd say either push those into shadow_vcpu for HV mode or get
> rid of the shadow_vcpu reference. I'd probably prefer the former.

These are fields that are pieces of host state that we need to save
and restore across execution of a guest; they don't apply to any
specific guest or vcpu.  That's why I didn't put them in shadow_vcpu,
which is specifically for one vcpu in one guest.  

Given that book3s_pr copies the shadow_vcpu into and out of the paca,
I thought it best not to add fields to it that are only live while we
are in the guest.  True, these fields only exist for book3s_hv, but if
we later on make it possible to select book3s_pr vs. book3s_hv at
runtime, we won't want to be copying these fields into and out of the
paca when book3s_pr is active.

Maybe we need another struct, kvm_host_state or something like that,
to save this sort of state.

> > @@ -65,6 +98,7 @@ config KVM_440
> > 	bool "KVM support for PowerPC 440 processors"
> > 	depends on EXPERIMENTAL && 44x
> > 	select KVM
> > +	select KVM_MMIO
> 
> e500 should also select MMIO, no?

Good point, I'll fix that.

> > +long kvmppc_alloc_hpt(struct kvm *kvm)
> > +{
> > +	unsigned long hpt;
> > +	unsigned long lpid;
> > +
> > +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
> > +			       HPT_ORDER - PAGE_SHIFT);
> 
> This would end up failing quite often, no?

In practice it seems to be OK, possibly because the machines we're
testing this on have plenty of memory.  Maybe we should get qemu to
allocate the HPT using hugetlbfs so the memory will come from the
reserved page pool.  It does need to be physically contiguous and
aligned on a multiple of its size -- that's a hardware requirement.

> > +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
> > +	kvm->arch.lpid = lpid;
> > +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
> > +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
> > +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
> 
> How do these correlate with the guest's hv mmu? I'd like to keep the
> code clean enough so we can potentially use it for PR mode as well :). 

The host SDR1 and LPID are different from the guest's.  That is, the
guest has its own HPT which is quite separate from the host's.  The
host values could be saved in global variables, though; there's no
real need for each VM to have its own copy, except that doing it this
way simplifies the low-level assembly code a little.

> > +	/* First see what page size we have */
> > +	psize = user_page_size(mem->userspace_addr);
> > +	/* For now, only allow 16MB pages */
> 
> The reason to go for 16MB pages is because of the host mmu code, not
> the guest hv mmu. So please at least #ifdef the code to HV so we
> document that correlation. 

I'm not sure what you mean by that.  The reason for going for 16MB
pages initially is for performance (this way the guest can use 16MB
pages for its linear mapping) and to avoid having to deal with the
pages being paged or swapped on the host side.  Could you explain the
"because of the host mmu code" part of your comment further?

What would adding #ifdef CONFIG_KVM_BOOK3S_64_HV achieve in a file
whose name ends in _hv.c and which only gets compiled when
CONFIG_KVM_BOOK3S_64_HV=y?

> > +int kvmppc_mmu_hv_init(void)
> > +{
> > +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> > +		return 0;
> 
> Return 0 for "it doesn't work" might not be the right exit code ;).

Good point, I'll make it -ENXIO or something.

> > +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
> > +				struct kvmppc_pte *gpte, bool data)
> > +{
> > +	return -ENOENT;
> 
> Can't you just call the normal book3s_64 mmu code here? Without
> xlate, doing ppc_ld doesn't work, which means that reading out the
> faulting guest instruction breaks. We'd also need it for PR mode :). 

With book3s_hv we currently have no situations where we need to read
out the faulting guest instruction.  

We could use the normal code here if we had the guest HPT mapped into
the qemu address space, which it currently isn't -- but should be.  It
hasn't been a priority to fix because with book3s_hv we currently have
no situations where we need to read out the faulting guest
instruction.

> > +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
> > +{
> > +	vcpu->arch.pvr = pvr;
> > +	kvmppc_mmu_book3s_hv_init(vcpu);
> 
> No need for you to do it depending on pvr. You can just do the mmu
> initialization on normal init :).

OK, I'll do that.

> > +	case BOOK3S_INTERRUPT_PROGRAM:
> > +	{
> > +		ulong flags;
> > +		/*
> > +		 * Normally program interrupts are delivered directly
> > +		 * to the guest by the hardware, but we can get here
> > +		 * as a result of a hypervisor emulation interrupt
> > +		 * (e40) getting turned into a 700 by BML RTAS.
> 
> Not sure I fully understand what's going on there. Mind to explain? :)

Recent versions of the architecture don't actually deliver a 0x700
interrupt to the OS on an illegal instruction; instead they generate
an 0xe40 interrupt to the hypervisor in case the hypervisor wants to
emulate the instruction.  If the hypervisor doesn't want to do that
it's supposed to synthesize a 0x700 interrupt to the OS.

When we're running this stuff under our BML (Bare Metal Linux)
framework in the lab, we use a small RTAS implementation that gets
loaded by the BML loader, and one of the things that this RTAS does is
to patch the 0xe40 vector to make the 0xe40 interrupt come in via the
0x700 vector, in case the kernel you're running under BML hasn't been
updated to have an 0xe40 handler.

I could just remove that case, in fact.

> > +	case BOOK3S_INTERRUPT_SYSCALL:
> > +	{
> > +		/* hcall - punt to userspace */
> > +		int i;
> > +
> 
> Do we really want to accept sc from pr=1? I'd just reject them straight away.

Good idea, I'll do that.

> > +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> > +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
> > +		vcpu->arch.dear = vcpu->arch.fault_dar;
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
> > +		r = RESUME_GUEST;
> > +		break;
> > +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> > +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
> > +					0x08000000);
> 
> What do these do? I thought the guest gets DSI and ISI directly?

It does for accesses with relocation (IR/DR) on, but because we have
enabled VRMA mode (Virtualized Real Mode Area) in the LPCR, we get
these interrupts to the hypervisor if the guest does a bad real-mode
access.  If we also turned on Virtualized Partition Memory (VPM) mode
in the LPCR, then all ISI/DSI in the guest come through to the
hypervisor using these vectors in case the hypervisor wants to do any
paging/swapping of guest memory.  I plan to do that later when we
support using 4k/64k pages for guest memory.

> > +	/* default to book3s_64 (power7) */
> > +	vcpu->arch.pvr = 0x3f0200;
> 
> Wouldn't it make sense to simply default to the host pvr? Not sure -
> maybe it's not :).

Sounds sensible, actually.  In fact the hypervisor can't spoof the PVR
for the guest, that is, the guest can read the real PVR value and
there's no way the hypervisor can stop it.

> > +	flush_fp_to_thread(current);
> 
> Do those with fine with preemption enabled?

Yes.  If a preemption happens, it will flush the FP/VMX/VSX registers
out to the thread_struct, and then any of these explicit calls that
happen after the preemption will do nothing.
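
For reference, flush_fp_to_thread() is roughly this shape (paraphrased
from memory rather than quoted, but it shows why calling it with
preemption enabled is safe):

	void flush_fp_to_thread(struct task_struct *tsk)
	{
		if (tsk->thread.regs) {
			preempt_disable();
			if (tsk->thread.regs->msr & MSR_FP)
				giveup_fpu(tsk); /* save FP state to thread_struct */
			preempt_enable();
		}
	}

If a context switch has already saved the state, MSR_FP is clear in
the saved regs and the call is a no-op; the preempt_disable/enable
pair covers the window where it isn't.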

> > +	/*
> > +	 * Put whatever is in the decrementer into the
> > +	 * hypervisor decrementer.
> > +	 */
> > +	mfspr	r8,SPRN_DEC
> > +	mftb	r7
> > +	mtspr	SPRN_HDEC,r8
> 
> Can't we just always use HDEC on the host? That's save us from all
> the swapping here.

The problem is that there is only one HDEC per core, so using it would
become complicated when the host is running in SMT4 or SMT2 mode.

> > +	extsw	r8,r8
> > +	add	r8,r8,r7
> > +	std	r8,PACA_KVM_DECEXP(r13)
> 
> Why is dec+hdec on vcpu_run decexp? What exactly does this store?

R7 here is the current (or very recent, anyway) timebase value, so
this stores the timebase value at which the host decrementer would get
to zero.
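
In C terms the three instructions above compute (just a restatement of
the assembly; the variable name is only for the example):

	/* timebase value at which the host decrementer reaches zero */
	u64 host_dec_expiry = get_tb() + (s64)(s32)mfspr(SPRN_DEC);

and that is the value stored at PACA_KVM_DECEXP.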

> > +	lwz	r3, PACA_HOST_PMC(r13)
> > +	lwz	r4, PACA_HOST_PMC + 4(r13)
> > +	lwz	r5, PACA_HOST_PMC + 4(r13)
> 
> copy&paste error?

Yes, thanks.

> > +.global kvmppc_handler_lowmem_trampoline
> > +kvmppc_handler_lowmem_trampoline:
> > +	cmpwi	r12,0x500
> > +	beq	1f
> > +	cmpwi	r12,0x980
> > +	beq	1f
> 
> What?
> 
> 1) use the macros please

OK

> 2) why?

The external interrupt and hypervisor decrementer interrupt handlers
expect the interrupted PC and MSR to be in HSRR0/1 rather than
SRR0/1.  I could remove the case for 0x980 since we don't call the
linux handler for HDEC interrupts any more.

> > +	/* Check if HDEC expires soon */
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,10
> > +	li	r12,0x980
> 
> define

OK.

> > +	mr	r9,r4
> > +	blt	hdec_soon
> 
> Is it faster to do the check than to save the check and take the
> odds? Also, maybe we should rather do the check in preemptible
> code that could just directly pass the time slice on.

I do the check there because I was having problems where, if the HDEC
goes negative before we do the partition switch, we would occasionally
not get the HDEC interrupt at all until the next time HDEC went
negative, ~ 8.4 seconds later.

> > +	/* See if this is a leftover HDEC interrupt */
> > +	cmpwi	r12,0x980
> 
> define

OK.

> > +	bne	2f
> > +	mfspr	r3,SPRN_HDEC
> > +	cmpwi	r3,0
> > +	bge	ignore_hdec
> > +2:
> > +
> > +	/* Check for mediated interrupts (could be done earlier really ...) */
> > +	cmpwi	r12,0x500
> 
> define

OK.

> > +	bne+	1f
> > +	ld	r5,VCPU_LPCR(r9)
> > +	andi.	r0,r11,MSR_EE
> > +	beq	1f
> > +	andi.	r0,r5,LPCR_MER
> > +	bne	bounce_ext_interrupt
> 
> So there's no need for the external check that directly goes into
> the Linux handler code on full-fledged exits?

No, we still need that; ordinary external interrupts come to the
hypervisor regardless of the guest's MSR.EE setting.

The MER bit (Mediated External Request) is a way for the hypervisor to
know when the guest sets its MSR.EE bit.  If an event happens that
means the host wants to give a guest vcpu an external interrupt, but
the guest vcpu has MSR.EE = 0, then the host can't deliver the
simulated external interrupt.  Instead it sets LPCR.MER, which has the
effect that the hardware will deliver an external interrupt (to the
hypervisor) when running in the guest and it has MSR.EE = 1.

So, when we get an external interrupt while LPCR.MER = 1 and the
guest's MSR.EE = 1, we need to synthesize an external interrupt in the
guest.  Doing it here means that we don't need to do the full
partition switch out to the host and back again.
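
As (hypothetical) C pseudo-code -- the helper names here are invented
for the example, they aren't the functions in the patch:

	/* host wants to deliver an external interrupt to a guest vcpu */
	if (vcpu->arch.msr & MSR_EE)
		deliver_external_now(vcpu);		/* invented name */
	else
		vcpu->arch.lpcr |= LPCR_MER;		/* hardware interrupts us
							 * once the guest sets EE */

	/* later, on a 0x500 (external) exit from the guest: */
	if ((vcpu->arch.msr & MSR_EE) && (vcpu->arch.lpcr & LPCR_MER))
		bounce_ext_interrupt_to_guest(vcpu);	/* invented name; no
							 * partition switch needed */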

> > +	/* Read the guest SLB and save it away */
> > +	li	r6,0
> > +	addi	r7,r9,VCPU_SLB
> > +	li	r5,0
> > +1:	slbmfee	r8,r6
> > +	andis.	r0,r8,SLB_ESID_V@h
> > +	beq	2f
> > +	add	r8,r8,r6		/* put index in */
> > +	slbmfev	r3,r6
> > +	std	r8,VCPU_SLB_E(r7)
> > +	std	r3,VCPU_SLB_V(r7)
> > +	addi	r7,r7,VCPU_SLB_SIZE
> > +	addi	r5,r5,1
> > +2:	addi	r6,r6,1
> > +	cmpwi	r6,32
> 
> I don't like how the 32 is hardcoded here. Better create a define
> for it and use the same in the init code.

Sure.  In fact I probably should use vcpu->arch.slb_nr here.

> > +hdec_soon:
> > +	/* Switch back to host partition */
> > +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> > +	ld	r6,KVM_HOST_SDR1(r4)
> > +	lwz	r7,KVM_HOST_LPID(r4)
> > +	li	r8,0x3ff		/* switch to reserved LPID */
> 
> is it reserved by ISA? Either way, hard-coding the constant without
> a name is not nice :).

Actually, it just has to be an LPID value that isn't ever used for
running a real guest (or the host).  I'll make a name for it.

> > +	lis	r8,0x7fff
> 
> INT_MAX@h might be more readable.

OK.

> > int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> > {
> > -	return !(v->arch.shared->msr & MSR_WE) ||
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> > +	return !(kvmppc_get_msr(v) & MSR_WE) ||
> > 	       !!(v->arch.pending_exceptions);
> > +#else
> > +	return 1;
> 
> So what happens if the guest sets MSR_WE? It just stays in guest
> context idling? That'd be pretty bad for scheduling on the host.

The MSR_WE bit doesn't exist on POWER7 (or any of POWER[4567], in
fact).  If the guest wants to idle it calls the H_CEDE hcall.

> > @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 	case KVM_CAP_PPC_IRQ_LEVEL:
> > 	case KVM_CAP_ENABLE_CAP:
> > 	case KVM_CAP_PPC_OSI:
> > +#ifndef CONFIG_KVM_BOOK3S_64_HV
> 
> You also don't do OSI on HV :).

Good point, I'll fix that.

> > @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
> > 	if (waitqueue_active(&vcpu->wq)) {
> > 		wake_up_interruptible(&vcpu->wq);
> > 		vcpu->stat.halt_wakeup++;
> > +#ifdef CONFIG_KVM_BOOK3S_64_HV
> > +	} else if (vcpu->cpu != -1) {
> > +		smp_send_reschedule(vcpu->cpu);
> 
> Shouldn't this be done for non-HV too? The only reason we don't do
> it yet is because we don't do SMP, no?

I didn't know why you didn't do it for non-HV, so I didn't change it
for that case.  If you say it's OK, I'll change it (we'll need to set
vcpu->cpu in the vcpu_load/unload code for book3s_pr too, then).
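
i.e. something along these lines in the book3s_pr load/put hooks
(sketch only, not tested):

	void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	{
		/* ... existing load code ... */
		vcpu->cpu = cpu;
	}

	void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
	{
		/* ... existing put code ... */
		vcpu->cpu = -1;
	}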

> > +#endif
> > 	}
> > -
> 
> eh... :)

OK. :)

> > +		struct {
> > +			__u64 nr;
> > +			__u64 ret;
> > +			__u64 args[9];
> > +		} papr_hcall;
> 
> This needs some information in the documentation.

Yes, Avi commented on that too.

> PS: I CC'ed kvm-ppc@vger. Please make sure to CC that mailing list,
> so people interested in kvm on ppc get your patches as well :).

Sure, will do.

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel
  2011-05-11 10:45 ` [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel Paul Mackerras
@ 2011-05-17  7:54     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  7:54 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


On 11.05.2011, at 12:45, Paul Mackerras wrote:

> This adds the infrastructure for handling PAPR hcalls in the kernel,
> either early in the guest exit path while we are still in real mode,
> or later once the MMU has been turned back on and we are in the full
> kernel context.  The advantage of handling hcalls in real mode if
> possible is that we avoid two partition switches -- and this will
> become more important when we support SMT4 guests, since a partition
> switch means we have to pull all of the threads in the core out of
> the guest.  The disadvantage is that we can only access the kernel
> linear mapping, not anything vmalloced or ioremapped, since the MMU
> is off.
> 
> This also adds code to handle the following hcalls in real mode:
> 
> H_ENTER       Add an HPTE to the hashed page table
> H_REMOVE      Remove an HPTE from the hashed page table
> H_READ        Read HPTEs from the hashed page table
> H_PROTECT     Change the protection bits in an HPTE
> H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table
> H_SET_DABR    Set the data address breakpoint register
> 
> Plus code to handle the following hcalls in the kernel:
> 
> H_CEDE        Idle the vcpu until an interrupt or H_PROD hcall arrives
> H_PROD        Wake up a ceded vcpu
> H_REGISTER_VPA Register a virtual processor area (VPA)
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/hvcall.h       |    5 +
> arch/powerpc/include/asm/kvm_host.h     |   11 +
> arch/powerpc/include/asm/kvm_ppc.h      |    1 +
> arch/powerpc/kernel/asm-offsets.c       |    2 +
> arch/powerpc/kvm/book3s_64_mmu_hv.c     |  342 +++++++++++++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c            |  170 +++++++++++++++-
> arch/powerpc/kvm/book3s_hv_rmhandlers.S |  150 +++++++++++++-
> arch/powerpc/kvm/powerpc.c              |    2 +-
> 8 files changed, 679 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 8edec71..a226002 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -29,6 +29,10 @@
> #define H_LONG_BUSY_ORDER_100_SEC	9905  /* Long busy, hint that 100sec \
> 						 is a good time to retry */
> #define H_LONG_BUSY_END_RANGE		9905  /* End of long busy range */
> +
> +/* Hacked in for HV-aware kvm support */
> +#define H_TOO_HARD	9999

Not sure I like the name - when is it used? :)
Also, if it's not in the PAPR, the guest should never receive it, right?

> +
> #define H_HARDWARE	-1	/* Hardware error */
> #define H_FUNCTION	-2	/* Function not supported */
> #define H_PRIVILEGE	-3	/* Caller not privileged */
> @@ -100,6 +104,7 @@
> #define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
> #define H_AVPN			(1UL<<(63-32))	/* An avpn is provided as a sanity test */
> #define H_ANDCOND		(1UL<<(63-33))
> +#define H_LOCAL			(1UL<<(63-35))
> #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored for IO pages) */
> #define H_ICACHE_SYNCHRONIZE	(1UL<<(63-41))	/* dcbst, icbi, etc (ignored for IO pages */
> #define H_ZERO_PAGE		(1UL<<(63-48))	/* zero the page before mapping (ignored for IO pages) */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index ec62365..af6703e 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -59,6 +59,10 @@ struct kvm;
> struct kvm_run;
> struct kvm_vcpu;
> 
> +struct lppaca;
> +struct slb_shadow;
> +struct dtl;
> +
> struct kvm_vm_stat {
> 	u32 remote_tlb_flush;
> };
> @@ -341,7 +345,14 @@ struct kvm_vcpu_arch {
> 	u64 dec_expires;
> 	unsigned long pending_exceptions;
> 	u16 last_cpu;
> +	u8 ceded;
> +	u8 prodded;
> 	u32 last_inst;
> +
> +	struct lppaca *vpa;
> +	struct slb_shadow *slb_shadow;
> +	struct dtl *dtl;
> +	struct dtl *dtl_end;
> 	int trap;
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index cd9ad96..b4ee11a 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -116,6 +116,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
> 				struct kvm_userspace_memory_region *mem);
> extern void kvmppc_map_vrma(struct kvm *kvm,
> 			    struct kvm_userspace_memory_region *mem);
> +extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 49e97fd..fd56f14 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -189,6 +189,7 @@ int main(void)
> 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int));
> 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use));
> 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
> +	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
> 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
> #endif /* CONFIG_PPC_STD_MMU_64 */
> 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
> @@ -467,6 +468,7 @@ int main(void)
> 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
> 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
> 	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
> +	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
> 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
> 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
> 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 52d1be1..623caae 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -219,6 +219,348 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
> 	}
> }
> 
> +#define HPTE_V_HVLOCK	0x40UL
> +
> +static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
> +{
> +	unsigned long tmp, old;
> +
> +	asm volatile("	ldarx	%0,0,%2\n"
> +		     "	and.	%1,%0,%3\n"
> +		     "	bne	2f\n"
> +		     "	ori	%0,%0,%4\n"
> +		     "  stdcx.	%0,0,%2\n"
> +		     "	beq+	2f\n"
> +		     "	li	%1,%3\n"
> +		     "2:	isync"
> +		     : "=&r" (tmp), "=&r" (old)
> +		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
> +		     : "cc", "memory");
> +	return old == 0;
> +}
> +
> +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
> +		    long pte_index, unsigned long pteh, unsigned long ptel)
> +{
> +	unsigned long porder;
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long i, lpn, pa;
> +	unsigned long *hpte;
> +
> +	/* only handle 4k, 64k and 16M pages for now */
> +	porder = 12;
> +	if (pteh & HPTE_V_LARGE) {
> +		if ((ptel & 0xf000) == 0x1000) {
> +			/* 64k page */
> +			porder = 16;
> +		} else if ((ptel & 0xff000) == 0) {
> +			/* 16M page */
> +			porder = 24;
> +			/* lowest AVA bit must be 0 for 16M pages */
> +			if (pteh & 0x80)
> +				return H_PARAMETER;
> +		} else
> +			return H_PARAMETER;
> +	}
> +	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
> +	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
> +		return H_PARAMETER;
> +	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
> +	if (!pa)
> +		return H_PARAMETER;
> +	/* Check WIMG */
> +	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
> +	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
> +		return H_PARAMETER;
> +	pteh &= ~0x60UL;
> +	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
> +	ptel |= pa;
> +	if (pte_index >= (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	if (likely((flags & H_EXACT) == 0)) {
> +		pte_index &= ~7UL;
> +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +		for (i = 0; ; ++i) {
> +			if (i == 8)
> +				return H_PTEG_FULL;
> +			if ((*hpte & HPTE_V_VALID) == 0 &&
> +			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
> +				break;
> +			hpte += 2;
> +		}
> +	} else {
> +		i = 0;
> +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
> +			return H_PTEG_FULL;
> +	}
> +	hpte[1] = ptel;
> +	eieio();
> +	hpte[0] = pteh;
> +	asm volatile("ptesync" : : : "memory");
> +	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
> +	vcpu->arch.gpr[4] = pte_index + i;
> +	return H_SUCCESS;
> +}
> +
> +static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
> +				      unsigned long pte_index)
> +{
> +	unsigned long rb, va_low;
> +
> +	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	va_low = pte_index >> 3;
> +	if (v & HPTE_V_SECONDARY)
> +		va_low = ~va_low;
> +	/* xor vsid from AVA */
> +	if (!(v & HPTE_V_1TB_SEG))
> +		va_low ^= v >> 12;
> +	else
> +		va_low ^= v >> 24;
> +	va_low &= 0x7ff;
> +	if (v & HPTE_V_LARGE) {
> +		rb |= 1;			/* L field */
> +		if (r & 0xff000) {
> +			/* non-16MB large page, must be 64k */
> +			/* (masks depend on page size) */
> +			rb |= 0x1000;		/* page encoding in LP field */
> +			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> +			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
> +		}
> +	} else {
> +		/* 4kB page */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +	}
> +	rb |= (v >> 54) & 0x300;		/* B field */
> +	return rb;
> +}
> +
> +#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
> +
> +static inline int try_lock_tlbie(unsigned int *lock)
> +{
> +	unsigned int tmp, old;
> +	unsigned int token = LOCK_TOKEN;
> +
> +	asm volatile("1:lwarx	%1,0,%2\n"
> +		     "	cmpwi	cr0,%1,0\n"
> +		     "	bne	2f\n"
> +		     "  stwcx.	%3,0,%2\n"
> +		     "	bne-	1b\n"
> +		     "  isync\n"
> +		     "2:"
> +		     : "=&r" (tmp), "=&r" (old)
> +		     : "r" (lock), "r" (token)
> +		     : "cc", "memory");
> +	return old == 0;
> +}
> +
> +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
> +		     unsigned long pte_index, unsigned long avpn,
> +		     unsigned long va)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long *hpte;
> +	unsigned long v, r, rb;
> +
> +	if (pte_index >= (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
> +		cpu_relax();
> +	if ((hpte[0] & HPTE_V_VALID) == 0 ||
> +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
> +	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
> +		hpte[0] &= ~HPTE_V_HVLOCK;
> +		return H_NOT_FOUND;
> +	}
> +	if (atomic_read(&kvm->online_vcpus) == 1)
> +		flags |= H_LOCAL;
> +	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
> +	vcpu->arch.gpr[5] = r = hpte[1];
> +	rb = compute_tlbie_rb(v, r, pte_index);
> +	hpte[0] = 0;
> +	if (!(flags & H_LOCAL)) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
> +			     : : "r" (rb), "r" (kvm->arch.lpid));
> +		asm volatile("ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock = 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile("tlbiel %0" : : "r" (rb));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long *args = &vcpu->arch.gpr[4];
> +	unsigned long *hp, tlbrb[4];
> +	long int i, found;
> +	long int n_inval = 0;
> +	unsigned long flags, req, pte_index;
> +	long int local = 0;
> +	long int ret = H_SUCCESS;
> +
> +	if (atomic_read(&kvm->online_vcpus) == 1)
> +		local = 1;
> +	for (i = 0; i < 4; ++i) {
> +		pte_index = args[i * 2];
> +		flags = pte_index >> 56;
> +		pte_index &= ((1ul << 56) - 1);
> +		req = flags >> 6;
> +		flags &= 3;
> +		if (req == 3)
> +			break;
> +		if (req != 1 || flags == 3 ||
> +		    pte_index >= (HPT_NPTEG << 3)) {
> +			/* parameter error */
> +			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
> +			ret = H_PARAMETER;
> +			break;
> +		}
> +		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +		while (!lock_hpte(hp, HPTE_V_HVLOCK))
> +			cpu_relax();
> +		found = 0;
> +		if (hp[0] & HPTE_V_VALID) {
> +			switch (flags & 3) {
> +			case 0:		/* absolute */
> +				found = 1;
> +				break;
> +			case 1:		/* andcond */
> +				if (!(hp[0] & args[i * 2 + 1]))
> +					found = 1;
> +				break;
> +			case 2:		/* AVPN */
> +				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
> +					found = 1;
> +				break;
> +			}
> +		}
> +		if (!found) {
> +			hp[0] &= ~HPTE_V_HVLOCK;
> +			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
> +			continue;
> +		}
> +		/* insert R and C bits from PTE */
> +		flags |= (hp[1] >> 5) & 0x0c;
> +		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
> +		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
> +		hp[0] = 0;
> +	}
> +	if (n_inval == 0)
> +		return ret;
> +
> +	if (!local) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		for (i = 0; i < n_inval; ++i)
> +			asm volatile(PPC_TLBIE(%1,%0)
> +				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
> +		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock = 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		for (i = 0; i < n_inval; ++i)
> +			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	return ret;
> +}
> +
> +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> +		      unsigned long pte_index, unsigned long avpn,
> +		      unsigned long va)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long *hpte;
> +	unsigned long v, r, rb;
> +
> +	if (pte_index >= (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
> +		cpu_relax();
> +	if ((hpte[0] & HPTE_V_VALID) == 0 ||
> +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
> +		hpte[0] &= ~HPTE_V_HVLOCK;
> +		return H_NOT_FOUND;
> +	}
> +	if (atomic_read(&kvm->online_vcpus) == 1)
> +		flags |= H_LOCAL;
> +	v = hpte[0];
> +	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
> +			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
> +	r |= (flags << 55) & HPTE_R_PP0;
> +	r |= (flags << 48) & HPTE_R_KEY_HI;
> +	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
> +	rb = compute_tlbie_rb(v, r, pte_index);
> +	hpte[0] = v & ~HPTE_V_VALID;
> +	if (!(flags & H_LOCAL)) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
> +			     : : "r" (rb), "r" (kvm->arch.lpid));
> +		asm volatile("ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock = 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile("tlbiel %0" : : "r" (rb));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	hpte[1] = r;
> +	eieio();
> +	hpte[0] = v & ~HPTE_V_HVLOCK;
> +	asm volatile("ptesync" : : : "memory");
> +	return H_SUCCESS;
> +}
> +
> +static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
> +{
> +	long int i;
> +	unsigned long offset, rpn;
> +
> +	offset = realaddr & (kvm->arch.ram_psize - 1);
> +	rpn = (realaddr - offset) >> PAGE_SHIFT;
> +	for (i = 0; i < kvm->arch.ram_npages; ++i)
> +		if (rpn == kvm->arch.ram_pginfo[i].pfn)
> +			return (i << PAGE_SHIFT) + offset;
> +	return HPTE_R_RPN;	/* all 1s in the RPN field */
> +}
> +
> +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
> +		   unsigned long pte_index)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long *hpte, r;
> +	int i, n = 1;
> +
> +	if (pte_index >= (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	if (flags & H_READ_4) {
> +		pte_index &= ~3;
> +		n = 4;
> +	}
> +	for (i = 0; i < n; ++i, ++pte_index) {
> +		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
> +		r = hpte[1];
> +		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
> +			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
> +				(r & ~HPTE_R_RPN);
> +		vcpu->arch.gpr[4 + i * 2] = hpte[0];
> +		vcpu->arch.gpr[5 + i * 2] = r;
> +	}
> +	return H_SUCCESS;
> +}
> +
> int kvmppc_mmu_hv_init(void)
> {
> 	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f6b7cd1..377a35a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -126,6 +126,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
> 	       vcpu->arch.last_inst);
> }
> 
> +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
> +{
> +	int r;
> +	struct kvm_vcpu *v, *ret = NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(r, v, kvm) {
> +		if (v->vcpu_id == id) {
> +			ret = v;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&kvm->lock);
> +	return ret;
> +}
> +
> +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
> +{
> +	vpa->shared_proc = 1;
> +	vpa->yield_count = 1;
> +}
> +
> +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
> +				       unsigned long flags,
> +				       unsigned long vcpuid, unsigned long vpa)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long pg_index, ra, len;
> +	unsigned long pg_offset;
> +	void *va;
> +	struct kvm_vcpu *tvcpu;
> +
> +	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
> +	if (!tvcpu)
> +		return H_PARAMETER;
> +
> +	flags >>= 63 - 18;
> +	flags &= 7;
> +	if (flags == 0 || flags == 4)
> +		return H_PARAMETER;
> +	if (flags < 4) {
> +		if (vpa & 0x7f)
> +			return H_PARAMETER;
> +		/* registering new area; convert logical addr to real */
> +		pg_index = vpa >> kvm->arch.ram_porder;
> +		pg_offset = vpa & (kvm->arch.ram_psize - 1);
> +		if (pg_index >= kvm->arch.ram_npages)
> +			return H_PARAMETER;
> +		if (kvm->arch.ram_pginfo[pg_index].pfn == 0)
> +			return H_PARAMETER;
> +		ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
> +		ra |= pg_offset;
> +		va = __va(ra);
> +		if (flags <= 1)
> +			len = *(unsigned short *)(va + 4);
> +		else
> +			len = *(unsigned int *)(va + 4);
> +		if (pg_offset + len > kvm->arch.ram_psize)
> +			return H_PARAMETER;
> +		switch (flags) {
> +		case 1:		/* register VPA */
> +			if (len < 640)
> +				return H_PARAMETER;
> +			tvcpu->arch.vpa = va;
> +			init_vpa(vcpu, va);
> +			break;
> +		case 2:		/* register DTL */
> +			if (len < 48)
> +				return H_PARAMETER;
> +			if (!tvcpu->arch.vpa)
> +				return H_RESOURCE;
> +			len -= len % 48;
> +			tvcpu->arch.dtl = va;
> +			tvcpu->arch.dtl_end = va + len;
> +			break;
> +		case 3:		/* register SLB shadow buffer */
> +			if (len < 8)
> +				return H_PARAMETER;
> +			if (!tvcpu->arch.vpa)
> +				return H_RESOURCE;
> +			tvcpu->arch.slb_shadow = va;
> +			len = (len - 16) / 16;
> +			tvcpu->arch.slb_shadow = va;
> +			break;
> +		}
> +	} else {
> +		switch (flags) {
> +		case 5:		/* unregister VPA */
> +			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
> +				return H_RESOURCE;
> +			tvcpu->arch.vpa = NULL;
> +			break;
> +		case 6:		/* unregister DTL */
> +			tvcpu->arch.dtl = NULL;
> +			break;
> +		case 7:		/* unregister SLB shadow buffer */
> +			tvcpu->arch.slb_shadow = NULL;
> +			break;
> +		}
> +	}
> +	return H_SUCCESS;
> +}
> +
> +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long req = kvmppc_get_gpr(vcpu, 3);
> +	unsigned long target, ret = H_SUCCESS;
> +	struct kvm_vcpu *tvcpu;
> +
> +	switch (req) {
> +	case H_CEDE:
> +		vcpu->arch.msr |= MSR_EE;
> +		vcpu->arch.ceded = 1;
> +		smp_mb();
> +		if (!vcpu->arch.prodded)
> +			kvmppc_vcpu_block(vcpu);
> +		else
> +			vcpu->arch.prodded = 0;
> +		smp_mb();
> +		vcpu->arch.ceded = 0;
> +		break;
> +	case H_PROD:
> +		target = kvmppc_get_gpr(vcpu, 4);
> +		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
> +		if (!tvcpu) {
> +			ret = H_PARAMETER;
> +			break;
> +		}
> +		tvcpu->arch.prodded = 1;
> +		smp_mb();
> +		if (vcpu->arch.ceded) {
> +			if (waitqueue_active(&vcpu->wq)) {
> +				wake_up_interruptible(&vcpu->wq);
> +				vcpu->stat.halt_wakeup++;
> +			}
> +		}
> +		break;
> +	case H_CONFER:
> +		break;
> +	case H_REGISTER_VPA:
> +		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
> +					kvmppc_get_gpr(vcpu, 5),
> +					kvmppc_get_gpr(vcpu, 6));
> +		break;
> +	default:
> +		return RESUME_HOST;
> +	}
> +	kvmppc_set_gpr(vcpu, 3, ret);
> +	vcpu->arch.hcall_needed = 0;
> +	return RESUME_GUEST;
> +}
> +
> static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
> 			      struct task_struct *tsk)
> {
> @@ -307,7 +459,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> 
> extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> 
> -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
> {
> 	u64 now;
> 
> @@ -338,6 +490,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> 	return kvmppc_handle_exit(run, vcpu, current);
> }
> 
> +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +{
> +	int r;
> +
> +	do {
> +		r = kvmppc_run_vcpu(run, vcpu);
> +
> +		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
> +		    !(vcpu->arch.msr & MSR_PR)) {
> +			r = kvmppc_pseries_do_hcall(vcpu);
> +			kvmppc_core_deliver_interrupts(vcpu);
> +		}
> +	} while (r == RESUME_GUEST);
> +	return r;
> +}
> +
> int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> 				struct kvm_userspace_memory_region *mem)
> {
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 813b01c..e8a8f3c 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -195,6 +195,14 @@ kvmppc_handler_trampoline_enter:
> 	/* Save R1 in the PACA */
> 	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
> 
> +	/* Increment yield count if they have a VPA */
> +	ld	r3, VCPU_VPA(r4)
> +	cmpdi	r3, 0
> +	beq	25f
> +	lwz	r5, LPPACA_YIELDCOUNT(r3)
> +	addi	r5, r5, 1
> +	stw	r5, LPPACA_YIELDCOUNT(r3)
> +25:
> 	/* Load up DAR and DSISR */
> 	ld	r5, VCPU_DAR(r4)
> 	lwz	r6, VCPU_DSISR(r4)
> @@ -432,6 +440,10 @@ kvmppc_interrupt:
> 	cmpwi	r3,0
> 	bge	ignore_hdec
> 2:
> +	/* See if this is something we can handle in real mode */
> +	cmpwi	r12,0xc00

use the define please

> +	beq	hcall_real_mode

This is simply a hcall helper, as the name suggests. So the comment is slightly misleading - it should rather read like "Try to handle hypercalls in real mode".

> +hcall_real_cont:
> 
> 	/* Check for mediated interrupts (could be done earlier really ...) */
> 	cmpwi	r12,0x500
> @@ -607,13 +619,28 @@ hdec_soon:
> 	std	r5, VCPU_SPRG2(r9)
> 	std	r6, VCPU_SPRG3(r9)
> 
> -	/* Save PMU registers */
> +	/* Increment yield count if they have a VPA */
> +	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
> +	cmpdi	r8, 0
> +	beq	25f
> +	lwz	r3, LPPACA_YIELDCOUNT(r8)
> +	addi	r3, r3, 1
> +	stw	r3, LPPACA_YIELDCOUNT(r8)
> +25:
> +	/* Save PMU registers if requested */
> +	/* r8 and cr0.eq are live here */
> 	li	r3, 1
> 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
> 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> 	isync
> -	mfspr	r5, SPRN_MMCR1
> +	beq	21f			/* if no VPA, save PMU stuff anyway */
> +	lbz	r7, LPPACA_PMCINUSE(r8)
> +	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
> +	bne	21f
> +	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
> +	b	22f
> +21:	mfspr	r5, SPRN_MMCR1
> 	mfspr	r6, SPRN_MMCRA
> 	std	r4, VCPU_MMCR(r9)
> 	std	r5, VCPU_MMCR + 8(r9)
> @@ -650,6 +677,119 @@ kvmppc_handler_trampoline_exit_end:
> 	mfspr	r7,SPRN_HDSISR
> 	b	7b
> 
> +	.globl	hcall_real_mode
> +hcall_real_mode:
> +	ld	r3,VCPU_GPR(r3)(r9)
> +	andi.	r0,r11,MSR_PR
> +	bne	hcall_real_cont
> +	clrrdi	r3,r3,2
> +	cmpldi	r3,hcall_real_table_end - hcall_real_table
> +	bge	hcall_real_cont
> +	LOAD_REG_ADDR(r4, hcall_real_table)
> +	lwzx	r3,r3,r4
> +	cmpwi	r3,0
> +	beq	hcall_real_cont
> +	add	r3,r3,r4
> +	mtctr	r3
> +	mr	r3,r9		/* get vcpu pointer */
> +	ld	r4,VCPU_GPR(r4)(r9)
> +	bctrl
> +	cmpdi	r3,H_TOO_HARD
> +	beq	hcall_real_fallback

Ah, very good. Please mark the constant as "for internal use only" then, as that's certainly fine :).
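
e.g. something along these lines (just a suggestion for the comment):

	/* Internal return value used by HV KVM to say "punt this hcall
	 * to the virtual-mode / qemu path"; never returned to the guest
	 * and not a PAPR-defined code.
	 */
	#define H_TOO_HARD	9999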

> +	ld	r4,PACA_KVM_VCPU(r13)
> +	std	r3,VCPU_GPR(r3)(r4)
> +	ld	r10,VCPU_PC(r4)
> +	ld	r11,VCPU_MSR(r4)
> +	b	fast_guest_return
> +
> +	/* We've attempted a real mode hcall, but it's punted it back
> +	 * to userspace.  We need to restore some clobbered volatiles
> +	 * before resuming the pass-it-to-qemu path */
> +hcall_real_fallback:
> +	li	r12,0xc00

use the define please :)


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel
@ 2011-05-17  7:54     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  7:54 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


On 11.05.2011, at 12:45, Paul Mackerras wrote:

> This adds the infrastructure for handling PAPR hcalls in the kernel,
> either early in the guest exit path while we are still in real mode,
> or later once the MMU has been turned back on and we are in the full
> kernel context.  The advantage of handling hcalls in real mode if
> possible is that we avoid two partition switches -- and this will
> become more important when we support SMT4 guests, since a partition
> switch means we have to pull all of the threads in the core out of
> the guest.  The disadvantage is that we can only access the kernel
> linear mapping, not anything vmalloced or ioremapped, since the MMU
> is off.
>=20
> This also adds code to handle the following hcalls in real mode:
>=20
> H_ENTER       Add an HPTE to the hashed page table
> H_REMOVE      Remove an HPTE from the hashed page table
> H_READ        Read HPTEs from the hashed page table
> H_PROTECT     Change the protection bits in an HPTE
> H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table
> H_SET_DABR    Set the data address breakpoint register
>=20
> Plus code to handle the following hcalls in the kernel:
>=20
> H_CEDE        Idle the vcpu until an interrupt or H_PROD hcall arrives
> H_PROD        Wake up a ceded vcpu
> H_REGISTER_VPA Register a virtual processor area (VPA)
>=20
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/hvcall.h       |    5 +
> arch/powerpc/include/asm/kvm_host.h     |   11 +
> arch/powerpc/include/asm/kvm_ppc.h      |    1 +
> arch/powerpc/kernel/asm-offsets.c       |    2 +
> arch/powerpc/kvm/book3s_64_mmu_hv.c     |  342 =
+++++++++++++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c            |  170 +++++++++++++++-
> arch/powerpc/kvm/book3s_hv_rmhandlers.S |  150 +++++++++++++-
> arch/powerpc/kvm/powerpc.c              |    2 +-
> 8 files changed, 679 insertions(+), 4 deletions(-)
>=20
> diff --git a/arch/powerpc/include/asm/hvcall.h =
b/arch/powerpc/include/asm/hvcall.h
> index 8edec71..a226002 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -29,6 +29,10 @@
> #define H_LONG_BUSY_ORDER_100_SEC	9905  /* Long busy, hint that =
100sec \
> 						 is a good time to retry =
*/
> #define H_LONG_BUSY_END_RANGE		9905  /* End of long busy range =
*/
> +
> +/* Hacked in for HV-aware kvm support */
> +#define H_TOO_HARD	9999

Not sure I like the name - when is it used? :)
Also, if it's not in the PAPR, the guest should never receive it, right?

> +
> #define H_HARDWARE	-1	/* Hardware error */
> #define H_FUNCTION	-2	/* Function not supported */
> #define H_PRIVILEGE	-3	/* Caller not privileged */
> @@ -100,6 +104,7 @@
> #define H_PAGE_SET_ACTIVE	H_PAGE_STATE_CHANGE
> #define H_AVPN			(1UL<<(63-32))	/* An avpn is =
provided as a sanity test */
> #define H_ANDCOND		(1UL<<(63-33))
> +#define H_LOCAL			(1UL<<(63-35))
> #define H_ICACHE_INVALIDATE	(1UL<<(63-40))	/* icbi, etc.  (ignored =
for IO pages) */
> #define H_ICACHE_SYNCHRONIZE	(1UL<<(63-41))	/* dcbst, icbi, etc =
(ignored for IO pages */
> #define H_ZERO_PAGE		(1UL<<(63-48))	/* zero the page before =
mapping (ignored for IO pages) */
> diff --git a/arch/powerpc/include/asm/kvm_host.h =
b/arch/powerpc/include/asm/kvm_host.h
> index ec62365..af6703e 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -59,6 +59,10 @@ struct kvm;
> struct kvm_run;
> struct kvm_vcpu;
>=20
> +struct lppaca;
> +struct slb_shadow;
> +struct dtl;
> +
> struct kvm_vm_stat {
> 	u32 remote_tlb_flush;
> };
> @@ -341,7 +345,14 @@ struct kvm_vcpu_arch {
> 	u64 dec_expires;
> 	unsigned long pending_exceptions;
> 	u16 last_cpu;
> +	u8 ceded;
> +	u8 prodded;
> 	u32 last_inst;
> +
> +	struct lppaca *vpa;
> +	struct slb_shadow *slb_shadow;
> +	struct dtl *dtl;
> +	struct dtl *dtl_end;
> 	int trap;
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page =
to */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h =
b/arch/powerpc/include/asm/kvm_ppc.h
> index cd9ad96..b4ee11a 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -116,6 +116,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
> 				struct kvm_userspace_memory_region =
*mem);
> extern void kvmppc_map_vrma(struct kvm *kvm,
> 			    struct kvm_userspace_memory_region *mem);
> +extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/kernel/asm-offsets.c =
b/arch/powerpc/kernel/asm-offsets.c
> index 49e97fd..fd56f14 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -189,6 +189,7 @@ int main(void)
> 	DEFINE(LPPACADECRINT, offsetof(struct lppaca, =
int_dword.fields.decr_int));
> 	DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, =
pmcregs_in_use));
> 	DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx));
> +	DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count));
> 	DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx));
> #endif /* CONFIG_PPC_STD_MMU_64 */
> 	DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp));
> @@ -467,6 +468,7 @@ int main(void)
> 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
> 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, =
arch.dec_expires));
> 	DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr));
> +	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa));
> 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
> 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
> 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c =
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 52d1be1..623caae 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -219,6 +219,348 @@ void kvmppc_map_vrma(struct kvm *kvm, struct =
kvm_userspace_memory_region *mem)
> 	}
> }
>=20
> +#define HPTE_V_HVLOCK	0x40UL
> +
> +static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
> +{
> +	unsigned long tmp, old;
> +
> +	asm volatile("	ldarx	%0,0,%2\n"
> +		     "	and.	%1,%0,%3\n"
> +		     "	bne	2f\n"
> +		     "	ori	%0,%0,%4\n"
> +		     "  stdcx.	%0,0,%2\n"
> +		     "	beq+	2f\n"
> +		     "	li	%1,%3\n"
> +		     "2:	isync"
> +		     : "=3D&r" (tmp), "=3D&r" (old)
> +		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
> +		     : "cc", "memory");
> +	return old =3D=3D 0;
> +}
> +
> +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
> +		    long pte_index, unsigned long pteh, unsigned long =
ptel)
> +{
> +	unsigned long porder;
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long i, lpn, pa;
> +	unsigned long *hpte;
> +
> +	/* only handle 4k, 64k and 16M pages for now */
> +	porder =3D 12;
> +	if (pteh & HPTE_V_LARGE) {
> +		if ((ptel & 0xf000) =3D=3D 0x1000) {
> +			/* 64k page */
> +			porder =3D 16;
> +		} else if ((ptel & 0xff000) =3D=3D 0) {
> +			/* 16M page */
> +			porder =3D 24;
> +			/* lowest AVA bit must be 0 for 16M pages */
> +			if (pteh & 0x80)
> +				return H_PARAMETER;
> +		} else
> +			return H_PARAMETER;
> +	}
> +	lpn =3D (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
> +	if (lpn >=3D kvm->arch.ram_npages || porder > =
kvm->arch.ram_porder)
> +		return H_PARAMETER;
> +	pa =3D kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
> +	if (!pa)
> +		return H_PARAMETER;
> +	/* Check WIMG */
> +	if ((ptel & HPTE_R_WIMG) !=3D HPTE_R_M &&
> +	    (ptel & HPTE_R_WIMG) !=3D (HPTE_R_W | HPTE_R_I | HPTE_R_M))
> +		return H_PARAMETER;
> +	pteh &=3D ~0x60UL;
> +	ptel &=3D ~(HPTE_R_PP0 - kvm->arch.ram_psize);
> +	ptel |=3D pa;
> +	if (pte_index >=3D (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	if (likely((flags & H_EXACT) =3D=3D 0)) {
> +		pte_index &=3D ~7UL;
> +		hpte =3D (unsigned long *)(kvm->arch.hpt_virt + =
(pte_index << 4));
> +		for (i =3D 0; ; ++i) {
> +			if (i =3D=3D 8)
> +				return H_PTEG_FULL;
> +			if ((*hpte & HPTE_V_VALID) =3D=3D 0 &&
> +			    lock_hpte(hpte, HPTE_V_HVLOCK | =
HPTE_V_VALID))
> +				break;
> +			hpte +=3D 2;
> +		}
> +	} else {
> +		i =3D 0;
> +		hpte =3D (unsigned long *)(kvm->arch.hpt_virt + =
(pte_index << 4));
> +		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
> +			return H_PTEG_FULL;
> +	}
> +	hpte[1] =3D ptel;
> +	eieio();
> +	hpte[0] =3D pteh;
> +	asm volatile("ptesync" : : : "memory");
> +	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
> +	vcpu->arch.gpr[4] =3D pte_index + i;
> +	return H_SUCCESS;
> +}
> +
> +static unsigned long compute_tlbie_rb(unsigned long v, unsigned long =
r,
> +				      unsigned long pte_index)
> +{
> +	unsigned long rb, va_low;
> +
> +	rb =3D (v & ~0x7fUL) << 16;		/* AVA field */
> +	va_low =3D pte_index >> 3;
> +	if (v & HPTE_V_SECONDARY)
> +		va_low =3D ~va_low;
> +	/* xor vsid from AVA */
> +	if (!(v & HPTE_V_1TB_SEG))
> +		va_low ^=3D v >> 12;
> +	else
> +		va_low ^=3D v >> 24;
> +	va_low &=3D 0x7ff;
> +	if (v & HPTE_V_LARGE) {
> +		rb |=3D 1;			/* L field */
> +		if (r & 0xff000) {
> +			/* non-16MB large page, must be 64k */
> +			/* (masks depend on page size) */
> +			rb |=3D 0x1000;		/* page encoding in LP =
field */
> +			rb |=3D (va_low & 0x7f) << 16; /* 7b of VA in =
AVA/LP field */
> +			rb |=3D (va_low & 0xfe);	/* AVAL field =
(P7 doesn't seem to care) */
> +		}
> +	} else {
> +		/* 4kB page */
> +		rb |=3D (va_low & 0x7ff) << 12;	/* remaining 11b of VA =
*/
> +	}
> +	rb |=3D (v >> 54) & 0x300;		/* B field */
> +	return rb;
> +}
> +
> +#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
> +
> +static inline int try_lock_tlbie(unsigned int *lock)
> +{
> +	unsigned int tmp, old;
> +	unsigned int token =3D LOCK_TOKEN;
> +
> +	asm volatile("1:lwarx	%1,0,%2\n"
> +		     "	cmpwi	cr0,%1,0\n"
> +		     "	bne	2f\n"
> +		     "  stwcx.	%3,0,%2\n"
> +		     "	bne-	1b\n"
> +		     "  isync\n"
> +		     "2:"
> +		     : "=3D&r" (tmp), "=3D&r" (old)
> +		     : "r" (lock), "r" (token)
> +		     : "cc", "memory");
> +	return old =3D=3D 0;
> +}
> +
> +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
> +		     unsigned long pte_index, unsigned long avpn,
> +		     unsigned long va)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long *hpte;
> +	unsigned long v, r, rb;
> +
> +	if (pte_index >=3D (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	hpte =3D (unsigned long *)(kvm->arch.hpt_virt + (pte_index << =
4));
> +	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
> +		cpu_relax();
> +	if ((hpte[0] & HPTE_V_VALID) =3D=3D 0 ||
> +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) !=3D avpn) ||
> +	    ((flags & H_ANDCOND) && (hpte[0] & avpn) !=3D 0)) {
> +		hpte[0] &=3D ~HPTE_V_HVLOCK;
> +		return H_NOT_FOUND;
> +	}
> +	if (atomic_read(&kvm->online_vcpus) =3D=3D 1)
> +		flags |=3D H_LOCAL;
> +	vcpu->arch.gpr[4] =3D v =3D hpte[0] & ~HPTE_V_HVLOCK;
> +	vcpu->arch.gpr[5] =3D r =3D hpte[1];
> +	rb =3D compute_tlbie_rb(v, r, pte_index);
> +	hpte[0] =3D 0;
> +	if (!(flags & H_LOCAL)) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
> +			     : : "r" (rb), "r" (kvm->arch.lpid));
> +		asm volatile("ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock =3D 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile("tlbiel %0" : : "r" (rb));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long *args =3D &vcpu->arch.gpr[4];
> +	unsigned long *hp, tlbrb[4];
> +	long int i, found;
> +	long int n_inval =3D 0;
> +	unsigned long flags, req, pte_index;
> +	long int local =3D 0;
> +	long int ret =3D H_SUCCESS;
> +
> +	if (atomic_read(&kvm->online_vcpus) =3D=3D 1)
> +		local =3D 1;
> +	for (i =3D 0; i < 4; ++i) {
> +		pte_index =3D args[i * 2];
> +		flags =3D pte_index >> 56;
> +		pte_index &=3D ((1ul << 56) - 1);
> +		req =3D flags >> 6;
> +		flags &=3D 3;
> +		if (req =3D=3D 3)
> +			break;
> +		if (req !=3D 1 || flags =3D=3D 3 ||
> +		    pte_index >=3D (HPT_NPTEG << 3)) {
> +			/* parameter error */
> +			args[i * 2] =3D ((0xa0 | flags) << 56) + =
pte_index;
> +			ret =3D H_PARAMETER;
> +			break;
> +		}
> +		hp =3D (unsigned long *)(kvm->arch.hpt_virt + (pte_index =
<< 4));
> +		while (!lock_hpte(hp, HPTE_V_HVLOCK))
> +			cpu_relax();
> +		found =3D 0;
> +		if (hp[0] & HPTE_V_VALID) {
> +			switch (flags & 3) {
> +			case 0:		/* absolute */
> +				found =3D 1;
> +				break;
> +			case 1:		/* andcond */
> +				if (!(hp[0] & args[i * 2 + 1]))
> +					found =3D 1;
> +				break;
> +			case 2:		/* AVPN */
> +				if ((hp[0] & ~0x7fUL) =3D=3D args[i * 2 =
+ 1])
> +					found =3D 1;
> +				break;
> +			}
> +		}
> +		if (!found) {
> +			hp[0] &=3D ~HPTE_V_HVLOCK;
> +			args[i * 2] =3D ((0x90 | flags) << 56) + =
pte_index;
> +			continue;
> +		}
> +		/* insert R and C bits from PTE */
> +		flags |=3D (hp[1] >> 5) & 0x0c;
> +		args[i * 2] =3D ((0x80 | flags) << 56) + pte_index;
> +		tlbrb[n_inval++] =3D compute_tlbie_rb(hp[0], hp[1], =
pte_index);
> +		hp[0] =3D 0;
> +	}
> +	if (n_inval =3D=3D 0)
> +		return ret;
> +
> +	if (!local) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		for (i =3D 0; i < n_inval; ++i)
> +			asm volatile(PPC_TLBIE(%1,%0)
> +				     : : "r" (tlbrb[i]), "r" =
(kvm->arch.lpid));
> +		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock =3D 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		for (i =3D 0; i < n_inval; ++i)
> +			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	return ret;
> +}
> +
> +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> +		      unsigned long pte_index, unsigned long avpn,
> +		      unsigned long va)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long *hpte;
> +	unsigned long v, r, rb;
> +
> +	if (pte_index >=3D (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	hpte =3D (unsigned long *)(kvm->arch.hpt_virt + (pte_index << =
4));
> +	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
> +		cpu_relax();
> +	if ((hpte[0] & HPTE_V_VALID) =3D=3D 0 ||
> +	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) !=3D avpn)) {
> +		hpte[0] &=3D ~HPTE_V_HVLOCK;
> +		return H_NOT_FOUND;
> +	}
> +	if (atomic_read(&kvm->online_vcpus) =3D=3D 1)
> +		flags |=3D H_LOCAL;
> +	v =3D hpte[0];
> +	r =3D hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
> +			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
> +	r |=3D (flags << 55) & HPTE_R_PP0;
> +	r |=3D (flags << 48) & HPTE_R_KEY_HI;
> +	r |=3D flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
> +	rb =3D compute_tlbie_rb(v, r, pte_index);
> +	hpte[0] =3D v & ~HPTE_V_VALID;
> +	if (!(flags & H_LOCAL)) {
> +		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
> +			cpu_relax();
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
> +			     : : "r" (rb), "r" (kvm->arch.lpid));
> +		asm volatile("ptesync" : : : "memory");
> +		kvm->arch.tlbie_lock =3D 0;
> +	} else {
> +		asm volatile("ptesync" : : : "memory");
> +		asm volatile("tlbiel %0" : : "r" (rb));
> +		asm volatile("ptesync" : : : "memory");
> +	}
> +	hpte[1] =3D r;
> +	eieio();
> +	hpte[0] =3D v & ~HPTE_V_HVLOCK;
> +	asm volatile("ptesync" : : : "memory");
> +	return H_SUCCESS;
> +}
> +
> +static unsigned long reverse_xlate(struct kvm *kvm, unsigned long =
realaddr)
> +{
> +	long int i;
> +	unsigned long offset, rpn;
> +
> +	offset =3D realaddr & (kvm->arch.ram_psize - 1);
> +	rpn =3D (realaddr - offset) >> PAGE_SHIFT;
> +	for (i =3D 0; i < kvm->arch.ram_npages; ++i)
> +		if (rpn =3D=3D kvm->arch.ram_pginfo[i].pfn)
> +			return (i << PAGE_SHIFT) + offset;
> +	return HPTE_R_RPN;	/* all 1s in the RPN field */
> +}
> +
> +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
> +		   unsigned long pte_index)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long *hpte, r;
> +	int i, n =3D 1;
> +
> +	if (pte_index >=3D (HPT_NPTEG << 3))
> +		return H_PARAMETER;
> +	if (flags & H_READ_4) {
> +		pte_index &=3D ~3;
> +		n =3D 4;
> +	}
> +	for (i =3D 0; i < n; ++i, ++pte_index) {
> +		hpte =3D (unsigned long *)(kvm->arch.hpt_virt + =
(pte_index << 4));
> +		r =3D hpte[1];
> +		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
> +			r =3D reverse_xlate(kvm, r & HPTE_R_RPN) |
> +				(r & ~HPTE_R_RPN);
> +		vcpu->arch.gpr[4 + i * 2] =3D hpte[0];
> +		vcpu->arch.gpr[5 + i * 2] =3D r;
> +	}
> +	return H_SUCCESS;
> +}
> +
> int kvmppc_mmu_hv_init(void)
> {
> 	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
> diff --git a/arch/powerpc/kvm/book3s_hv.c =
b/arch/powerpc/kvm/book3s_hv.c
> index f6b7cd1..377a35a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -126,6 +126,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
> 	       vcpu->arch.last_inst);
> }
>=20
> +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
> +{
> +	int r;
> +	struct kvm_vcpu *v, *ret =3D NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	kvm_for_each_vcpu(r, v, kvm) {
> +		if (v->vcpu_id =3D=3D id) {
> +			ret =3D v;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&kvm->lock);
> +	return ret;
> +}
> +
> +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
> +{
> +	vpa->shared_proc =3D 1;
> +	vpa->yield_count =3D 1;
> +}
> +
> +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
> +				       unsigned long flags,
> +				       unsigned long vcpuid, unsigned =
long vpa)
> +{
> +	struct kvm *kvm =3D vcpu->kvm;
> +	unsigned long pg_index, ra, len;
> +	unsigned long pg_offset;
> +	void *va;
> +	struct kvm_vcpu *tvcpu;
> +
> +	tvcpu =3D kvmppc_find_vcpu(kvm, vcpuid);
> +	if (!tvcpu)
> +		return H_PARAMETER;
> +
> +	flags >>=3D 63 - 18;
> +	flags &=3D 7;
> +	if (flags =3D=3D 0 || flags =3D=3D 4)
> +		return H_PARAMETER;
> +	if (flags < 4) {
> +		if (vpa & 0x7f)
> +			return H_PARAMETER;
> +		/* registering new area; convert logical addr to real */
> +		pg_index =3D vpa >> kvm->arch.ram_porder;
> +		pg_offset =3D vpa & (kvm->arch.ram_psize - 1);
> +		if (pg_index >=3D kvm->arch.ram_npages)
> +			return H_PARAMETER;
> +		if (kvm->arch.ram_pginfo[pg_index].pfn =3D=3D 0)
> +			return H_PARAMETER;
> +		ra =3D kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT;
> +		ra |=3D pg_offset;
> +		va =3D __va(ra);
> +		if (flags <=3D 1)
> +			len =3D *(unsigned short *)(va + 4);
> +		else
> +			len =3D *(unsigned int *)(va + 4);
> +		if (pg_offset + len > kvm->arch.ram_psize)
> +			return H_PARAMETER;
> +		switch (flags) {
> +		case 1:		/* register VPA */
> +			if (len < 640)
> +				return H_PARAMETER;
> +			tvcpu->arch.vpa =3D va;
> +			init_vpa(vcpu, va);
> +			break;
> +		case 2:		/* register DTL */
> +			if (len < 48)
> +				return H_PARAMETER;
> +			if (!tvcpu->arch.vpa)
> +				return H_RESOURCE;
> +			len -=3D len % 48;
> +			tvcpu->arch.dtl =3D va;
> +			tvcpu->arch.dtl_end =3D va + len;
> +			break;
> +		case 3:		/* register SLB shadow buffer */
> +			if (len < 8)
> +				return H_PARAMETER;
> +			if (!tvcpu->arch.vpa)
> +				return H_RESOURCE;
> +			tvcpu->arch.slb_shadow =3D va;
> +			len =3D (len - 16) / 16;
> +			tvcpu->arch.slb_shadow =3D va;
> +			break;
> +		}
> +	} else {
> +		switch (flags) {
> +		case 5:		/* unregister VPA */
> +			if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl)
> +				return H_RESOURCE;
> +			tvcpu->arch.vpa =3D NULL;
> +			break;
> +		case 6:		/* unregister DTL */
> +			tvcpu->arch.dtl =3D NULL;
> +			break;
> +		case 7:		/* unregister SLB shadow buffer */
> +			tvcpu->arch.slb_shadow =3D NULL;
> +			break;
> +		}
> +	}
> +	return H_SUCCESS;
> +}
> +
> +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long req =3D kvmppc_get_gpr(vcpu, 3);
> +	unsigned long target, ret =3D H_SUCCESS;
> +	struct kvm_vcpu *tvcpu;
> +
> +	switch (req) {
> +	case H_CEDE:
> +		vcpu->arch.msr |=3D MSR_EE;
> +		vcpu->arch.ceded =3D 1;
> +		smp_mb();
> +		if (!vcpu->arch.prodded)
> +			kvmppc_vcpu_block(vcpu);
> +		else
> +			vcpu->arch.prodded =3D 0;
> +		smp_mb();
> +		vcpu->arch.ceded =3D 0;
> +		break;
> +	case H_PROD:
> +		target =3D kvmppc_get_gpr(vcpu, 4);
> +		tvcpu =3D kvmppc_find_vcpu(vcpu->kvm, target);
> +		if (!tvcpu) {
> +			ret =3D H_PARAMETER;
> +			break;
> +		}
> +		tvcpu->arch.prodded =3D 1;
> +		smp_mb();
> +		if (vcpu->arch.ceded) {
> +			if (waitqueue_active(&vcpu->wq)) {
> +				wake_up_interruptible(&vcpu->wq);
> +				vcpu->stat.halt_wakeup++;
> +			}
> +		}
> +		break;
> +	case H_CONFER:
> +		break;
> +	case H_REGISTER_VPA:
> +		ret =3D do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
> +					kvmppc_get_gpr(vcpu, 5),
> +					kvmppc_get_gpr(vcpu, 6));
> +		break;
> +	default:
> +		return RESUME_HOST;
> +	}
> +	kvmppc_set_gpr(vcpu, 3, ret);
> +	vcpu->arch.hcall_needed =3D 0;
> +	return RESUME_GUEST;
> +}
> +
> static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu =
*vcpu,
> 			      struct task_struct *tsk)
> {
> @@ -307,7 +459,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>=20
> extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct =
kvm_vcpu *vcpu);
>=20
> -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu =
*vcpu)
> {
> 	u64 now;
>=20
> @@ -338,6 +490,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct =
kvm_vcpu *vcpu)
> 	return kvmppc_handle_exit(run, vcpu, current);
> }
>=20
> +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +{
> +	int r;
> +
> +	do {
> +		r =3D kvmppc_run_vcpu(run, vcpu);
> +
> +		if (run->exit_reason =3D=3D KVM_EXIT_PAPR_HCALL &&
> +		    !(vcpu->arch.msr & MSR_PR)) {
> +			r =3D kvmppc_pseries_do_hcall(vcpu);
> +			kvmppc_core_deliver_interrupts(vcpu);
> +		}
> +	} while (r =3D=3D RESUME_GUEST);
> +	return r;
> +}
> +
> int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> 				struct kvm_userspace_memory_region *mem)
> {
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S =
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 813b01c..e8a8f3c 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -195,6 +195,14 @@ kvmppc_handler_trampoline_enter:
> 	/* Save R1 in the PACA */
> 	std	r1, PACA_KVM_SVCPU + SVCPU_HOST_R1(r13)
>=20
> +	/* Increment yield count if they have a VPA */
> +	ld	r3, VCPU_VPA(r4)
> +	cmpdi	r3, 0
> +	beq	25f
> +	lwz	r5, LPPACA_YIELDCOUNT(r3)
> +	addi	r5, r5, 1
> +	stw	r5, LPPACA_YIELDCOUNT(r3)
> +25:
> 	/* Load up DAR and DSISR */
> 	ld	r5, VCPU_DAR(r4)
> 	lwz	r6, VCPU_DSISR(r4)
> @@ -432,6 +440,10 @@ kvmppc_interrupt:
> 	cmpwi	r3,0
> 	bge	ignore_hdec
> 2:
> +	/* See if this is something we can handle in real mode */
> +	cmpwi	r12,0xc00

use the define please

> +	beq	hcall_real_mode

This is simply a hcall helper, as the name suggests. So the comment is =
slightly misleading - it should rather read like "Try to handle =
hypercalls in real mode".

> +hcall_real_cont:
> 
> 	/* Check for mediated interrupts (could be done earlier really ...) */
> 	cmpwi	r12,0x500
> @@ -607,13 +619,28 @@ hdec_soon:
> 	std	r5, VCPU_SPRG2(r9)
> 	std	r6, VCPU_SPRG3(r9)
> 
> -	/* Save PMU registers */
> +	/* Increment yield count if they have a VPA */
> +	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
> +	cmpdi	r8, 0
> +	beq	25f
> +	lwz	r3, LPPACA_YIELDCOUNT(r8)
> +	addi	r3, r3, 1
> +	stw	r3, LPPACA_YIELDCOUNT(r8)
> +25:
> +	/* Save PMU registers if requested */
> +	/* r8 and cr0.eq are live here */
> 	li	r3, 1
> 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
> 	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
> 	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
> 	isync
> -	mfspr	r5, SPRN_MMCR1
> +	beq	21f			/* if no VPA, save PMU stuff anyway */
> +	lbz	r7, LPPACA_PMCINUSE(r8)
> +	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
> +	bne	21f
> +	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
> +	b	22f
> +21:	mfspr	r5, SPRN_MMCR1
> 	mfspr	r6, SPRN_MMCRA
> 	std	r4, VCPU_MMCR(r9)
> 	std	r5, VCPU_MMCR + 8(r9)
> @@ -650,6 +677,119 @@ kvmppc_handler_trampoline_exit_end:
> 	mfspr	r7,SPRN_HDSISR
> 	b	7b
> 
> +	.globl	hcall_real_mode
> +hcall_real_mode:
> +	ld	r3,VCPU_GPR(r3)(r9)
> +	andi.	r0,r11,MSR_PR
> +	bne	hcall_real_cont
> +	clrrdi	r3,r3,2
> +	cmpldi	r3,hcall_real_table_end - hcall_real_table
> +	bge	hcall_real_cont
> +	LOAD_REG_ADDR(r4, hcall_real_table)
> +	lwzx	r3,r3,r4
> +	cmpwi	r3,0
> +	beq	hcall_real_cont
> +	add	r3,r3,r4
> +	mtctr	r3
> +	mr	r3,r9		/* get vcpu pointer */
> +	ld	r4,VCPU_GPR(r4)(r9)
> +	bctrl
> +	cmpdi	r3,H_TOO_HARD
> +	beq	hcall_real_fallback

Ah, very good. Please mark the constant as "for internal use only" then, as that's certainly fine :).
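
(Roughly what the hcall_real_mode dispatch above amounts to, rendered as C. A sketch only: "hcall_real_handlers" and "last_real_hcall" are made-up names, and the real table holds 32-bit branch offsets rather than function pointers so it stays addressable in real mode.)

	/* Sketch only: C rendering of the real-mode hcall dispatch above. */
	typedef long (*hcall_fn_t)(struct kvm_vcpu *vcpu, unsigned long arg);

	static long try_hcall_in_real_mode(struct kvm_vcpu *vcpu)
	{
		unsigned long req = kvmppc_get_gpr(vcpu, 3);
		hcall_fn_t fn;

		if (vcpu->arch.msr & MSR_PR)	/* hcall attempted from guest userspace */
			return H_TOO_HARD;	/* punt to the virtual-mode/qemu path */
		if (req > last_real_hcall || !(fn = hcall_real_handlers[req / 4]))
			return H_TOO_HARD;	/* no real-mode handler for this hcall */
		return fn(vcpu, kvmppc_get_gpr(vcpu, 4));
	}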

> +	ld	r4,PACA_KVM_VCPU(r13)
> +	std	r3,VCPU_GPR(r3)(r4)
> +	ld	r10,VCPU_PC(r4)
> +	ld	r11,VCPU_MSR(r4)
> +	b	fast_guest_return
> +
> +	/* We've attempted a real mode hcall, but it's punted it back
> +	 * to userspace.  We need to restore some clobbered volatiles
> +	 * before resuming the pass-it-to-qemu path */
> +hcall_real_fallback:
> +	li	r12,0xc00

use the define please :)


Alex

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-11 10:46 ` [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode Paul Mackerras
@ 2011-05-17  8:01     ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  8:01 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


On 11.05.2011, at 12:46, Paul Mackerras wrote:

> From: David Gibson <dwg@au1.ibm.com>
> 
> This improves I/O performance for guests using the PAPR paravirtualization
> interface by making the H_PUT_TCE hcall faster, by implementing it in
> real mode.  H_PUT_TCE is used for updating virtual IOMMU tables, and is
> used both for virtual I/O and for real I/O in the PAPR interface.
> 
> Since this moves the IOMMU tables into the kernel, we define a new
> KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables.
> The ioctl returns a file descriptor which can be used to mmap the
> newly created table.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/kvm.h           |    9 +++
> arch/powerpc/include/asm/kvm_book3s_64.h |    2 +
> arch/powerpc/include/asm/kvm_host.h      |    9 +++
> arch/powerpc/include/asm/kvm_ppc.h       |    2 +
> arch/powerpc/kvm/Makefile                |    3 +-
> arch/powerpc/kvm/book3s_64_vio_hv.c      |   73 +++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c             |  116 +++++++++++++++++++++++++++++-
> arch/powerpc/kvm/book3s_hv_rmhandlers.S  |    2 +-
> arch/powerpc/kvm/powerpc.c               |   18 +++++
> include/linux/kvm.h                      |    5 ++

This one definitely needs documentation :).

> 10 files changed, 236 insertions(+), 3 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c
> 
> diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
> index 18ea696..a9e641b 100644
> --- a/arch/powerpc/include/asm/kvm.h
> +++ b/arch/powerpc/include/asm/kvm.h
> @@ -22,6 +22,9 @@
> 
> #include <linux/types.h>
> 
> +/* Select powerpc specific features in <linux/kvm.h> */
> +#define __KVM_HAVE_SPAPR_TCE
> +
> struct kvm_regs {
> 	__u64 pc;
> 	__u64 cr;
> @@ -88,4 +91,10 @@ struct kvm_guest_debug_arch {
> #define KVM_INTERRUPT_UNSET	-2U
> #define KVM_INTERRUPT_SET_LEVEL	-3U
> 
> +/* for KVM_CAP_SPAPR_TCE */
> +struct kvm_create_spapr_tce {
> +	__u64 liobn;
> +	__u32 window_size;
> +};
> +
> #endif /* __LINUX_KVM_POWERPC_H */
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 4cadd61..e1a096b 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -25,4 +25,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu)
> 	return &get_paca()->shadow_vcpu;
> }
> 
> +#define SPAPR_TCE_SHIFT		12
> +
> #endif /* __ASM_KVM_BOOK3S_64_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index af6703e..cda183e 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -144,6 +144,14 @@ struct kvmppc_pginfo {
> 	atomic_t refcnt;
> };
> 
> +struct kvmppc_spapr_tce_table {
> +	struct list_head list;
> +	struct kvm *kvm;
> +	u64 liobn;
> +	u32 window_size;
> +	struct page *pages[0];
> +};
> +
> struct kvm_arch {
> 	unsigned long hpt_virt;
> 	unsigned long ram_npages;
> @@ -157,6 +165,7 @@ struct kvm_arch {
> 	unsigned long host_sdr1;
> 	int tlbie_lock;
> 	unsigned short last_vcpu[NR_CPUS];
> +	struct list_head spapr_tce_tables;
> };
> 
> struct kvmppc_pte {
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index b4ee11a..de683fa 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -117,6 +117,8 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
> extern void kvmppc_map_vrma(struct kvm *kvm,
> 			    struct kvm_userspace_memory_region *mem);
> extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
> +extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> +				struct kvm_create_spapr_tce *args);
> extern int kvmppc_core_init_vm(struct kvm *kvm);
> extern void kvmppc_core_destroy_vm(struct kvm *kvm);
> extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 37c1a60..8ba062f 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -59,7 +59,8 @@ kvm-book3s_64_hv-objs := \
> 	book3s.o \
> 	book3s_hv.o \
> 	book3s_hv_interrupts.o \
> -	book3s_64_mmu_hv.o
> +	book3s_64_mmu_hv.o \
> +	book3s_64_vio_hv.o
> kvm-objs-$(CONFIG_KVM_BOOK3S_64_HV) := $(kvm-book3s_64_hv-objs)
> 
> kvm-book3s_32-objs := \
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> new file mode 100644
> index 0000000..ea0f8c5
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -0,0 +1,73 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
> + *
> + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
> + * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <linux/kvm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/highmem.h>
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/hugetlb.h>
> +#include <linux/list.h>
> +
> +#include <asm/tlbflush.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/mmu-hash64.h>
> +#include <asm/hvcall.h>
> +#include <asm/synch.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/kvm_host.h>
> +#include <asm/udbg.h>
> +
> +#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
> +

It would be great to somehow mark code that runs in real mode as such - either by an attribute in the function header or by a simple comment.
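For instance, something as simple as this above each such function would already help (purely illustrative, not an existing annotation):

	/*
	 * WARNING: runs in real mode (MMU off).  Only linear-mapping addresses
	 * may be dereferenced here (no vmalloc/ioremap memory), and the
	 * function must not sleep or take sleeping locks.
	 */
	long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
			      unsigned long ioba, unsigned long tce)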

> +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
> +		      unsigned long ioba, unsigned long tce)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvmppc_spapr_tce_table *stt;
> +
> +	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
> +	/* 	    liobn, ioba, tce); */
> +
> +	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> +		if (stt->liobn == liobn) {
> +			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
> +			struct page *page;
> +			u64 *tbl;
> +
> +			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
> +			/* 	    liobn, stt, stt->window_size); */
> +			if (ioba >= stt->window_size)
> +				return H_PARAMETER;
> +
> +			page = stt->pages[idx / TCES_PER_PAGE];
> +			tbl = (u64 *)page_address(page);
> +
> +			/* FIXME: Need to validate the TCE itself */
> +			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
> +			tbl[idx % TCES_PER_PAGE] = tce;
> +			return H_SUCCESS;
> +		}
> +	}
> +
> +	/* Didn't find the liobn, punt it to userspace */
> +	return H_TOO_HARD;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 377a35a..eed2c10 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -506,6 +506,116 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> 	return r;
> }
> 
> +static long kvmppc_stt_npages(unsigned long window_size)
> +{
> +	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
> +		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
> +}
> +
> +static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
> +{
> +	struct kvm *kvm = stt->kvm;
> +	int i;
> +
> +	mutex_lock(&kvm->lock);
> +	list_del(&stt->list);
> +	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
> +		__free_page(stt->pages[i]);
> +	kfree(stt);
> +	mutex_unlock(&kvm->lock);
> +
> +	kvm_put_kvm(kvm);
> +}
> +
> +static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
> +	struct page *page;
> +
> +	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
> +		return VM_FAULT_SIGBUS;
> +
> +	page = stt->pages[vmf->pgoff];
> +	get_page(page);
> +	vmf->page = page;
> +	return 0;
> +}
> +
> +static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
> +	.fault = kvm_spapr_tce_fault,
> +};
> +
> +static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	vma->vm_ops = &kvm_spapr_tce_vm_ops;
> +	return 0;
> +}
> +
> +static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
> +{
> +	struct kvmppc_spapr_tce_table *stt = filp->private_data;
> +
> +	release_spapr_tce_table(stt);
> +	return 0;
> +}
> +
> +static struct file_operations kvm_spapr_tce_fops = {
> +	.mmap           = kvm_spapr_tce_mmap,
> +	.release	= kvm_spapr_tce_release,
> +};
> +
> +long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> +				   struct kvm_create_spapr_tce *args)
> +{
> +	struct kvmppc_spapr_tce_table *stt = NULL;
> +	long npages;
> +	int ret = -ENOMEM;
> +	int i;
> +
> +	/* Check this LIOBN hasn't been previously allocated */
> +	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> +		if (stt->liobn == args->liobn)
> +			return -EBUSY;
> +	}
> +
> +	npages = kvmppc_stt_npages(args->window_size);
> +
> +	stt = kzalloc(sizeof(*stt) + npages* sizeof(struct page *),
> +		      GFP_KERNEL);
> +	if (!stt)
> +		goto fail;
> +
> +	stt->liobn = args->liobn;
> +	stt->window_size = args->window_size;
> +	stt->kvm = kvm;
> +
> +	for (i = 0; i < npages; i++) {
> +		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +		if (!stt->pages[i])
> +			goto fail;
> +	}
> +
> +	kvm_get_kvm(kvm);
> +
> +	mutex_lock(&kvm->lock);
> +	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
> +
> +	mutex_unlock(&kvm->lock);
> +
> +	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
> +				stt, O_RDONLY);
> +
> +fail:
> +	if (stt) {
> +		for (i = 0; i < npages; i++)
> +			if (stt->pages[i])
> +				__free_page(stt->pages[i]);
> +
> +		kfree(stt);
> +	}
> +	return ret;
> +}
> +
> int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> 				struct kvm_userspace_memory_region *mem)
> {
> @@ -527,13 +637,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
> 
> 	/* Allocate hashed page table */
> 	r = kvmppc_alloc_hpt(kvm);
> +	if (r)
> +		return r;
> 
> -	return r;
> +	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
> +	return 0;
> }
> 
> void kvmppc_core_destroy_vm(struct kvm *kvm)
> {
> 	kvmppc_free_hpt(kvm);
> +	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
> }
> 
> /* These are stubs for now */
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index e8a8f3c..95f6386 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -722,7 +722,7 @@ hcall_real_table:
> 	.long	0		/* 0x14 - H_CLEAR_REF */
> 	.long	.kvmppc_h_protect - hcall_real_table
> 	.long	0		/* 0x1c - H_GET_TCE */
> -	.long	0		/* 0x20 - H_SET_TCE */
> +	.long	.kvmppc_h_put_tce - hcall_real_table
> 	.long	0		/* 0x24 - H_SET_SPRG0 */
> 	.long	.kvmppc_h_set_dabr - hcall_real_table
> 	.long	0		/* 0x2c */
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 7bfe413..10f777a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -196,6 +196,11 @@ int kvm_dev_ioctl_check_extension(long ext)
> 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
> 		break;
> #endif
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	case KVM_CAP_SPAPR_TCE:
> +		r = 1;
> +		break;
> +#endif
> 	default:
> 		r = 0;
> 		break;
> @@ -628,6 +633,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
> 
> 		break;
> 	}
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	case KVM_CREATE_SPAPR_TCE: {
> +		struct kvm_create_spapr_tce create_tce;
> +		struct kvm *kvm = filp->private_data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
> +			goto out;
> +		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
> +		goto out;
> +	}

I'm not sure I fully understand how this is supposed to work. If the tables are kept inside the kernel, how does userspace get to know where to DMA to?
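
(For context, the commit message implies roughly the flow below - a sketch under that assumption, not code taken from qemu; vm_fd and liobn are placeholders and error handling is omitted.)

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	/* Sketch: create an in-kernel TCE table and map it to observe the
	 * TCE entries the guest puts with H_PUT_TCE. */
	static uint64_t *map_tce_table(int vm_fd, uint64_t liobn)
	{
		struct kvm_create_spapr_tce args = {
			.liobn       = liobn,
			.window_size = 256 << 20,	/* e.g. a 256 MiB DMA window */
		};
		int tce_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE, &args);
		size_t len = (args.window_size >> 12) * sizeof(uint64_t);

		/* The fd comes back O_RDONLY, so map it shared and read-only. */
		return mmap(NULL, len, PROT_READ, MAP_SHARED, tce_fd, 0);
	}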


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-11 10:46 ` [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes Paul Mackerras
@ 2011-05-17  8:21     ` Alexander Graf
  2011-05-17  8:21     ` Alexander Graf
  1 sibling, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  8:21 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


On 11.05.2011, at 12:46, Paul Mackerras wrote:

> This lifts the restriction that book3s_hv guests can only run one
> hardware thread per core, and allows them to use up to 4 threads
> per core on POWER7.  The host still has to run single-threaded.
> 
> This capability is advertised to qemu through a new KVM_CAP_PPC_SMT
> capability.
> 
> To use this, the host kernel should be booted with all threads
> active, and then all the secondary threads should be offlined.
> This will put the secondary threads into nap mode.  KVM will then
> wake them from nap mode and use them for running guest code (while
> they are still offline).  To wake the secondary threads, we send
> them an IPI using a new xics_wake_cpu() function, implemented in
> arch/powerpc/sysdev/xics/icp-native.c.  In other words, at this stage
> we assume that the platform has a XICS interrupt controller and
> we are using icp-native.c to drive it.  Since the woken thread will
> need to acknowledge and clear the IPI, we also export the base
> physical address of the XICS registers using kvmppc_set_xics_phys()
> for use in the low-level KVM book3s code.
> 
> When a vcpu is created, it is assigned to a virtual core (vcore).
> The vcore number is obtained by dividing the vcpu number by 4,
> since we assume at most 4 threads per core.  Thus, if qemu wishes
> to run the guest in single-threaded mode, it should make all vcpu
> numbers be multiples of 4.
> 
> We distinguish three states of a vcpu: runnable (i.e., ready to execute
> the guest), blocked (that is, idle), and busy in host.  We currently
> implement a policy that the vcore can run only when all its threads
> are runnable or blocked.  This way, if a vcpu needs to execute elsewhere
> in the kernel or in qemu, it can do so without being starved of CPU
> by the other vcpus.
> 
> When a vcore starts to run, it executes in the context of one of the
> vcpu threads.  The other vcpu threads all go to sleep and stay asleep
> until something happens requiring the vcpu thread to return to qemu,
> or to wake up to run the vcore (this can happen when another vcpu
> thread goes from busy in host state to blocked).
> 
> It can happen that a vcpu goes from blocked to runnable state (e.g.
> because of an interrupt), and the vcore it belongs to is already
> running.  In that case it can start to run immediately as long as
> none of the vcpus in the vcore have started to exit the guest.
> We send the next free thread in the vcore an IPI to get it to start
> to execute the guest.  It synchronizes with the other threads via
> the vcore->entry_exit_count field to make sure that it doesn't go
> into the guest if the other vcpus are exiting by the time that it
> is ready to actually enter the guest.
> 
> Note that there is no fixed relationship between the hardware thread
> number and the vcpu number.  Hardware threads are assigned to vcpus
> as they become runnable, so we will always use the lower-numbered
> hardware threads in preference to higher-numbered threads if not all
> the vcpus in the vcore are runnable, regardless of which vcpus are
> runnable.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
> arch/powerpc/include/asm/kvm.h          |    1 +
> arch/powerpc/include/asm/kvm_host.h     |   45 +++++-
> arch/powerpc/include/asm/kvm_ppc.h      |   13 ++
> arch/powerpc/include/asm/paca.h         |    2 +
> arch/powerpc/kernel/asm-offsets.c       |    6 +
> arch/powerpc/kernel/exceptions-64s.S    |   31 +++-
> arch/powerpc/kernel/idle_power7.S       |    2 -
> arch/powerpc/kvm/Kconfig                |    2 +-
> arch/powerpc/kvm/book3s_hv.c            |  266 +++++++++++++++++++++++++++++--
> arch/powerpc/kvm/book3s_hv_rmhandlers.S |  157 ++++++++++++++++++-
> arch/powerpc/sysdev/xics/icp-native.c   |    7 +
> include/linux/kvm.h                     |    3 +
> 12 files changed, 506 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
> index a9e641b..624c872 100644
> --- a/arch/powerpc/include/asm/kvm.h
> +++ b/arch/powerpc/include/asm/kvm.h
> @@ -24,6 +24,7 @@
> 
> /* Select powerpc specific features in <linux/kvm.h> */
> #define __KVM_HAVE_SPAPR_TCE
> +#define __KVM_HAVE_PPC_SMT
> 
> struct kvm_regs {
> 	__u64 pc;
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index cda183e..a2085da 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -25,10 +25,15 @@
> #include <linux/interrupt.h>
> #include <linux/types.h>
> #include <linux/kvm_types.h>
> +#include <linux/threads.h>
> +#include <linux/spinlock.h>
> #include <linux/kvm_para.h>
> #include <asm/kvm_asm.h>
> +#include <asm/processor.h>
> 
> -#define KVM_MAX_VCPUS 1
> +#define KVM_MAX_VCPUS		NR_CPUS
> +#define KVM_THREADS_PER_CORE	4

So what if POWER8 (or whatever it will be called) comes along with 8 threads per core? Would that change the userspace interface?
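
(One way to keep the interface stable across chips would be for the capability check to return the actual thread count rather than just 0/1 - a sketch only, reusing the existing threads_per_core variable; whether that is the right answer is of course up to the author:)

	#ifdef CONFIG_KVM_BOOK3S_64_HV
	case KVM_CAP_PPC_SMT:
		r = threads_per_core;	/* 4 on POWER7, possibly more later */
		break;
	#endif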

> +#define KVM_MAX_VCORES		(KVM_MAX_VCPUS / KVM_THREADS_PER_CORE)
> #define KVM_MEMORY_SLOTS 32
> /* memory slots that does not exposed to userspace */
> #define KVM_PRIVATE_MEM_SLOTS 4
> @@ -165,9 +170,36 @@ struct kvm_arch {
> 	unsigned long host_sdr1;
> 	int tlbie_lock;
> 	unsigned short last_vcpu[NR_CPUS];
> +	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> 	struct list_head spapr_tce_tables;
> };
> 
> +/*
> + * Struct for a virtual core.
> + * Note: entry_exit_count combines an entry count in the bottom 8 bits
> + * and an exit count in the next 8 bits.  This is so that we can
> + * atomically increment the entry count iff the exit count is 0
> + * without taking the lock.
> + */
> +struct kvmppc_vcore {
> +	int n_runnable;
> +	int n_blocked;
> +	int num_threads;
> +	int entry_exit_count;
> +	int n_woken;
> +	int nap_count;
> +	u16 pcpu;
> +	u8 vcore_running;
> +	u8 in_guest;
> +	struct kvm_vcpu *runnable_threads[KVM_THREADS_PER_CORE];
> +	struct task_struct *run_task[KVM_THREADS_PER_CORE];
> +	struct kvm_run *kvm_run[KVM_THREADS_PER_CORE];
> +	spinlock_t lock;
> +};
> +
> +#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
> +#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
> +
> struct kvmppc_pte {
> 	ulong eaddr;
> 	u64 vpage;
> @@ -362,10 +394,21 @@ struct kvm_vcpu_arch {
> 	struct slb_shadow *slb_shadow;
> 	struct dtl *dtl;
> 	struct dtl *dtl_end;
> +
> +	struct kvmppc_vcore *vcore;
> +	int ret;
> 	int trap;
> +	int state;
> +	int ptid;
> +	wait_queue_head_t cpu_run;
> +
> 	struct kvm_vcpu_arch_shared *shared;
> 	unsigned long magic_page_pa; /* phys addr to map the magic page to */
> 	unsigned long magic_page_ea; /* effect. addr to map the magic page to */
> };
> 
> +#define KVMPPC_VCPU_BUSY_IN_HOST	0
> +#define KVMPPC_VCPU_BLOCKED		1
> +#define KVMPPC_VCPU_RUNNABLE		2
> +
> #endif /* __POWERPC_KVM_HOST_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index de683fa..ed75975 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -33,6 +33,9 @@
> #else
> #include <asm/kvm_booke.h>
> #endif
> +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> +#include <asm/paca.h>
> +#endif
> 
> enum emulation_result {
> 	EMULATE_DONE,         /* no further processing */
> @@ -159,4 +162,14 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
> 	return r;
> }
> 
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> +{
> +	paca[cpu].xics_phys = addr;
> +}
> +#else
> +static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> +{}
> +#endif
> +
> #endif /* __POWERPC_KVM_PPC_H__ */
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 8dba5f6..8b6628c 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -151,6 +151,8 @@ struct paca_struct {
> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> #ifdef CONFIG_KVM_BOOK3S_64_HV
> 	struct kvm_vcpu *kvm_vcpu;
> +	struct kvmppc_vcore *kvm_vcore;
> +	unsigned long xics_phys;
> 	u64 dabr;
> 	u64 host_mmcr[3];
> 	u32 host_pmc[6];
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index fd56f14..1fee2cf 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -204,12 +204,14 @@ int main(void)
> 	DEFINE(PACA_KVM_SVCPU, offsetof(struct paca_struct, shadow_vcpu));
> #ifdef CONFIG_KVM_BOOK3S_64_HV
> 	DEFINE(PACA_KVM_VCPU, offsetof(struct paca_struct, kvm_vcpu));
> +	DEFINE(PACA_KVM_VCORE, offsetof(struct paca_struct, kvm_vcore));
> 	DEFINE(PACA_HOST_MMCR, offsetof(struct paca_struct, host_mmcr));
> 	DEFINE(PACA_HOST_PMC, offsetof(struct paca_struct, host_pmc));
> 	DEFINE(PACA_HOST_PURR, offsetof(struct paca_struct, host_purr));
> 	DEFINE(PACA_HOST_SPURR, offsetof(struct paca_struct, host_spurr));
> 	DEFINE(PACA_HOST_DSCR, offsetof(struct paca_struct, host_dscr));
> 	DEFINE(PACA_DABR, offsetof(struct paca_struct, dabr));
> +	DEFINE(PACA_XICS_PHYS, offsetof(struct paca_struct, xics_phys));
> 	DEFINE(PACA_KVM_DECEXP, offsetof(struct paca_struct, dec_expires));
> #endif
> #endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
> @@ -478,6 +480,10 @@ int main(void)
> 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
> 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
> 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
> +	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
> +	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
> +	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
> +	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
> 	DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) -
> 			   offsetof(struct kvmppc_vcpu_book3s, vcpu));
> 	DEFINE(SVCPU_HOST_R1, offsetof(struct kvmppc_book3s_shadow_vcpu, host_r1));
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 80c6456..803dcff 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -49,19 +49,32 @@ BEGIN_FTR_SECTION
> 	 * state loss at this time.
> 	 */
> 	mfspr	r13,SPRN_SRR1
> -	rlwinm	r13,r13,47-31,30,31
> -	cmpwi	cr0,r13,1
> -	bne	1f
> -	b	.power7_wakeup_noloss
> -1:	cmpwi	cr0,r13,2
> -	bne	1f
> -	b	.power7_wakeup_loss
> +	rlwinm.	r13,r13,47-31,30,31
> +	beq	9f
> +
> +	/* waking up from powersave (nap) state */
> +	cmpwi	cr1,r13,2
> 	/* Total loss of HV state is fatal, we could try to use the
> 	 * PIR to locate a PACA, then use an emergency stack etc...
> 	 * but for now, let's just stay stuck here
> 	 */
> -1:	cmpwi	cr0,r13,3
> -	beq	.
> +	bgt	cr1,.
> +	GET_PACA(r13)
> +
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +	lbz	r0,PACAPROCSTART(r13)
> +	cmpwi	r0,0x80
> +	bne	1f
> +	li	r0,0
> +	stb	r0,PACAPROCSTART(r13)
> +	b	kvm_start_guest
> +1:
> +#endif
> +
> +	beq	cr1,2f
> +	b	.power7_wakeup_noloss
> +2:	b	.power7_wakeup_loss
> +9:
> END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206)
> #endif /* CONFIG_PPC_P7_NAP */
> 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
> diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
> index f8f0bc7..3a70845 100644
> --- a/arch/powerpc/kernel/idle_power7.S
> +++ b/arch/powerpc/kernel/idle_power7.S
> @@ -73,7 +73,6 @@ _GLOBAL(power7_idle)
> 	b	.
> 
> _GLOBAL(power7_wakeup_loss)
> -	GET_PACA(r13)
> 	ld	r1,PACAR1(r13)
> 	REST_NVGPRS(r1)
> 	REST_GPR(2, r1)
> @@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss)
> 	rfid
> 
> _GLOBAL(power7_wakeup_noloss)
> -	GET_PACA(r13)
> 	ld	r1,PACAR1(r13)
> 	ld	r4,_MSR(r1)
> 	ld	r5,_NIP(r1)
> diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
> index 6ff191b..27d8e36 100644
> --- a/arch/powerpc/kvm/Kconfig
> +++ b/arch/powerpc/kvm/Kconfig
> @@ -58,7 +58,7 @@ config KVM_BOOK3S_64
> 
> config KVM_BOOK3S_64_HV
> 	bool "KVM support for POWER7 using hypervisor mode in host"
> -	depends on EXPERIMENTAL && PPC_BOOK3S_64
> +	depends on EXPERIMENTAL && PPC_BOOK3S_64 && PPC_PSERIES
> 	select KVM
> 	select KVM_BOOK3S_64
> 	---help---
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index eed2c10..ce006c0 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -50,6 +50,7 @@
> void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> 	local_paca->kvm_vcpu = vcpu;
> +	local_paca->kvm_vcore = vcpu->arch.vcore;
> 	vcpu->cpu = cpu;
> }
> 
> @@ -58,6 +59,9 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
> 	vcpu->cpu = -1;
> }
> 
> +static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu);
> +static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu);
> +
> void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
> {
> 	u64 now;
> @@ -75,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu)
> 			      HRTIMER_MODE_REL);
> 	}
> 
> +	kvmppc_vcpu_blocked(vcpu);
> +
> 	kvm_vcpu_block(vcpu);
> 	vcpu->stat.halt_wakeup++;
> 
> 	if (vcpu->arch.dec_expires != ~(u64)0)
> 		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
> +
> +	kvmppc_vcpu_unblocked(vcpu);
> }
> 
> void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
> @@ -415,6 +423,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> {
> 	struct kvm_vcpu *vcpu;
> 	int err = -ENOMEM;
> +	int core;
> +	struct kvmppc_vcore *vcore;
> 	unsigned long lpcr;
> 
> 	vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
> @@ -443,6 +453,37 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> 	lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE;
> 	vcpu->arch.lpcr = lpcr;
> 
> +	/*
> +	 * Some vcpus may start out in stopped state.  If we initialize
> +	 * them to busy-in-host state they will stop other vcpus in the
> +	 * vcore from running.  Instead we initialize them to blocked
> +	 * state, effectively considering them to be stopped until we
> +	 * see the first run ioctl for them.
> +	 */
> +	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
> +
> +	init_waitqueue_head(&vcpu->arch.cpu_run);
> +	core = id / KVM_THREADS_PER_CORE;
> +
> +	mutex_lock(&kvm->lock);
> +	vcore = kvm->arch.vcores[core];
> +	if (!vcore) {
> +		vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
> +		if (vcore)
> +			spin_lock_init(&vcore->lock);
> +		kvm->arch.vcores[core] = vcore;
> +	}
> +	mutex_unlock(&kvm->lock);
> +
> +	if (!vcore)
> +		goto free_vcpu;
> +
> +	spin_lock(&vcore->lock);
> +	++vcore->num_threads;
> +	++vcore->n_blocked;
> +	spin_unlock(&vcore->lock);
> +	vcpu->arch.vcore = vcore;
> +
> 	return vcpu;
> 
> free_vcpu:
> @@ -457,37 +498,238 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> 	kfree(vcpu);
> }
> 
> +static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	spin_lock(&vc->lock);
> +	vcpu->arch.state = KVMPPC_VCPU_BLOCKED;
> +	++vc->n_blocked;
> +	if (vc->n_runnable > 0 &&
> +	    vc->n_runnable + vc->n_blocked == vc->num_threads)
> +		wake_up(&vc->runnable_threads[0]->arch.cpu_run);
> +	spin_unlock(&vc->lock);
> +}
> +
> +static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	spin_lock(&vc->lock);
> +	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
> +	--vc->n_blocked;
> +	spin_unlock(&vc->lock);
> +}
> +
> extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> +extern void xics_wake_cpu(int cpu);
> +
> +static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, int ptid)
> +{
> +	int i;
> +	struct kvm_vcpu *vcpu;
> +
> +	vcpu = vc->runnable_threads[ptid];
> +	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
> +		return;
> +	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
> +	--vc->n_runnable;
> +	for (i = ptid; i < vc->n_runnable; ++i) {
> +		vcpu = vc->runnable_threads[i+1];
> +		vc->runnable_threads[i] = vcpu;
> +		vcpu->arch.ptid = i;
> +		vc->run_task[i] = vc->run_task[i+1];
> +		vc->kvm_run[i] = vc->kvm_run[i+1];
> +	}
> +}
> +
> +static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
> +{
> +	int cpu;
> +	struct paca_struct *tpaca;
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	cpu = vc->pcpu + vcpu->arch.ptid;
> +	tpaca = &paca[cpu];
> +	tpaca->kvm_vcpu = vcpu;
> +	tpaca->kvm_vcore = vc;
> +	smp_wmb();
> +	if (vcpu->arch.ptid) {
> +		tpaca->cpu_start = 0x80;
> +		tpaca->shadow_vcpu.in_guest = KVM_GUEST_MODE_GUEST;
> +		wmb();
> +		xics_wake_cpu(cpu);
> +		++vc->n_woken;
> +	}
> +}
> +
> +static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
> +{
> +	int i;
> +
> +	HMT_low();
> +	i = 0;
> +	while (vc->nap_count < vc->n_woken) {
> +		if (++i >= 1000000) {
> +			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
> +			       vc->nap_count, vc->n_woken);
> +			break;
> +		}
> +		cpu_relax();
> +	}
> +	HMT_medium();
> +}
> 
> -static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu)
> +static int kvmppc_run_core(struct kvmppc_vcore *vc)
> {
> +	struct kvm_vcpu *vcpu;
> +	long i, ret;
> 	u64 now;
> 
> +	/* don't start if any threads have a signal pending */
> +	for (i = 0; i < vc->n_runnable; ++i) {
> +		if (signal_pending(vc->run_task[i]))
> +			return 0;
> +	}
> +
> +	vc->n_woken = 0;
> +	vc->nap_count = 0;
> +	vc->entry_exit_count = 0;
> +	vc->vcore_running = 1;
> +	vc->in_guest = 0;
> +	vc->pcpu = smp_processor_id();
> +	for (i = 0; i < vc->n_runnable; ++i)
> +		kvmppc_start_thread(vc->runnable_threads[i]);
> +	vcpu = vc->runnable_threads[0];
> +
> +	spin_unlock(&vc->lock);
> +
> +	kvm_guest_enter();
> +	__kvmppc_vcore_entry(NULL, vcpu);
> +
> +	/* wait for secondary threads to get back to nap mode */
> +	spin_lock(&vc->lock);
> +	if (vc->nap_count < vc->n_woken)
> +		kvmppc_wait_for_nap(vc);

So you're taking the vcore-wide lock and waiting for other CPUs to set themselves to nap? Not sure I fully understand this. Why would another thread want to go to nap mode when it's 100% busy?

> +	vc->vcore_running = 2;
> +	spin_unlock(&vc->lock);
> +
> +	/* make sure updates to secondary vcpu structs are visible now */
> +	smp_mb();
> +	kvm_guest_exit();
> +
> +	preempt_enable();
> +	kvm_resched(vcpu);
> +
> +	now = get_tb();
> +	for (i = 0; i < vc->n_runnable; ++i) {
> +		vcpu = vc->runnable_threads[i];
> +		/* cancel pending dec exception if dec is positive */
> +		if (now < vcpu->arch.dec_expires &&
> +		    kvmppc_core_pending_dec(vcpu))
> +			kvmppc_core_dequeue_dec(vcpu);
> +		if (!vcpu->arch.trap) {
> +			if (signal_pending(vc->run_task[i])) {
> +				vc->kvm_run[i]->exit_reason = KVM_EXIT_INTR;
> +				vcpu->arch.ret = -EINTR;
> +			}
> +			continue;		/* didn't get to run */
> +		}
> +		ret = kvmppc_handle_exit(vc->kvm_run[i], vcpu, vc->run_task[i]);
> +		vcpu->arch.ret = ret;
> +		vcpu->arch.trap = 0;
> +	}
> +
> +	preempt_disable();
> +	spin_lock(&vc->lock);
> +
> +	vc->vcore_running = 0;
> +	for (i = 0; i < vc->n_runnable; ) {
> +		vcpu = vc->runnable_threads[i];
> +		if (vcpu->arch.ret != RESUME_GUEST) {
> +			kvmppc_remove_runnable(vc, i);
> +			wake_up(&vcpu->arch.cpu_run);
> +		} else
> +			++i;
> +	}
> +
> +	return 1;
> +}
> +
> +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> +{
> +	int ptid;
> +	int wait_state;
> +	struct kvmppc_vcore *vc;
> +	DEFINE_WAIT(wait);
> +
> +	/* No need to go into the guest when all we do is going out */
> 	if (signal_pending(current)) {
> -		run->exit_reason = KVM_EXIT_INTR;
> +		kvm_run->exit_reason = KVM_EXIT_INTR;
> 		return -EINTR;
> 	}
> 
> +	kvm_run->exit_reason = 0;
> +	vcpu->arch.ret = RESUME_GUEST;
> +	vcpu->arch.trap = 0;
> +
> 	flush_fp_to_thread(current);
> 	flush_altivec_to_thread(current);
> 	flush_vsx_to_thread(current);
> 	preempt_disable();
> 
> -	kvm_guest_enter();
> +	/*
> +	 * Synchronize with other threads in this virtual core
> +	 */
> +	vc = vcpu->arch.vcore;
> +	spin_lock(&vc->lock);
> +	/* This happens the first time this is called for a vcpu */
> +	if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED)
> +		--vc->n_blocked;
> +	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
> +	ptid = vc->n_runnable;
> +	vc->runnable_threads[ptid] = vcpu;
> +	vc->run_task[ptid] = current;
> +	vc->kvm_run[ptid] = kvm_run;
> +	vcpu->arch.ptid = ptid;
> +	++vc->n_runnable;
> +
> +	wait_state = TASK_INTERRUPTIBLE;
> +	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
> +		if (signal_pending(current)) {
> +			if (!vc->vcore_running) {
> +				kvm_run->exit_reason = KVM_EXIT_INTR;
> +				vcpu->arch.ret = -EINTR;
> +				break;
> +			}
> +			/* have to wait for vcore to stop executing guest */
> +			wait_state = TASK_UNINTERRUPTIBLE;
> +			smp_send_reschedule(vc->pcpu);
> +		}
> 
> -	__kvmppc_vcore_entry(NULL, vcpu);
> +		if (!vc->vcore_running &&
> +		    vc->n_runnable + vc->n_blocked == vc->num_threads) {
> +			/* we can run now */
> +			if (kvmppc_run_core(vc))
> +				continue;
> +		}
> 
> -	kvm_guest_exit();
> +		if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0)
> +			kvmppc_start_thread(vcpu);
> 
> -	preempt_enable();
> -	kvm_resched(vcpu);
> +		/* wait for other threads to come in, or wait for vcore */
> +		prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
> +		spin_unlock(&vc->lock);
> +		schedule();
> +		finish_wait(&vcpu->arch.cpu_run, &wait);
> +		spin_lock(&vc->lock);
> +	}
> 
> -	now = get_tb();
> -	/* cancel pending dec exception if dec is positive */
> -	if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> -		kvmppc_core_dequeue_dec(vcpu);
> +	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
> +		kvmppc_remove_runnable(vc, vcpu->arch.ptid);
> +	spin_unlock(&vc->lock);
> 
> -	return kvmppc_handle_exit(run, vcpu, current);
> +	return vcpu->arch.ret;
> }
> 
> int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 95f6386..f1d3779 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -111,6 +111,32 @@ kvmppc_trampoline_enter:
>  *                                                                            *
>  *****************************************************************************/
> 
> +#define XICS_XIRR		4
> +#define XICS_QIRR		0xc
> +
> +/*
> + * We come in here when wakened from nap mode on a secondary hw thread.
> + * Relocation is off and most register values are lost.
> + * r13 points to the PACA.
> + */
> +	.globl	kvm_start_guest
> +kvm_start_guest:
> +	ld	r1,PACAEMERGSP(r13)
> +	subi	r1,r1,STACK_FRAME_OVERHEAD
> +
> +	/* get vcpu pointer */
> +	ld	r4, PACA_KVM_VCPU(r13)
> +
> +	/* We got here with an IPI; clear it */
> +	ld	r5, PACA_XICS_PHYS(r13)
> +	li	r0, 0xff
> +	li	r6, XICS_QIRR
> +	li	r7, XICS_XIRR
> +	lwzcix	r8, r5, r7		/* ack the interrupt */
> +	sync
> +	stbcix	r0, r5, r6		/* clear it */
> +	stwcix	r8, r5, r7		/* EOI it */
> +
> .global kvmppc_handler_trampoline_enter
> kvmppc_handler_trampoline_enter:
> 
> @@ -229,7 +255,20 @@ kvmppc_handler_trampoline_enter:
> 	slbia
> 	ptesync
> 
> -	/* Switch to guest partition. */
> +	/* Increment entry count iff exit count is zero. */
> + 	ld	r5,PACA_KVM_VCORE(r13)
> +	addi	r9,r5,VCORE_ENTRY_EXIT
> +21:	lwarx	r3,0,r9
> +	cmpwi	r3,0x100		/* any threads starting to exit? */
> +	bge	secondary_too_late	/* if so we're too late to the party */
> +	addi	r3,r3,1
> +	stwcx.	r3,0,r9
> +	bne	21b
> +
> +	/* Primary thread switches to guest partition. */
> +	lwz	r6,VCPU_PTID(r4)
> +	cmpwi	r6,0
> +	bne	20f
> 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> 	ld	r6,KVM_SDR1(r9)
> 	lwz	r7,KVM_LPID(r9)
> @@ -239,7 +278,15 @@ kvmppc_handler_trampoline_enter:
> 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> 	mtspr	SPRN_LPID,r7
> 	isync
> -	ld	r8,VCPU_LPCR(r4)
> +	li	r0,1
> +	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
> +	b	10f
> +
> +	/* Secondary threads wait for primary to have done partition switch */
> +20:	lbz	r0,VCORE_IN_GUEST(r5)
> +	cmpwi	r0,0
> +	beq	20b
> +10:	ld	r8,VCPU_LPCR(r4)
> 	mtspr	SPRN_LPCR,r8
> 	isync
> 
> @@ -254,10 +301,12 @@ kvmppc_handler_trampoline_enter:
> 	 * Invalidate the TLB if we could possibly have stale TLB
> 	 * entries for this partition on this core due to the use
> 	 * of tlbiel.
> +	 * XXX maybe only need this on primary thread?
> 	 */
> 	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
> 	lwz	r5,VCPU_VCPUID(r4)
> 	lhz	r6,PACAPACAINDEX(r13)
> +	rldimi	r6,r5,0,62		/* XXX map as if threads 1:1 p:v */
> 	lhz	r8,VCPU_LAST_CPU(r4)
> 	sldi	r7,r6,1			/* see if this is the same vcpu */
> 	add	r7,r7,r9		/* as last ran on this pcpu */
> @@ -540,8 +589,51 @@ hcall_real_cont:
> 	ptesync
> 
> hdec_soon:
> -	/* Switch back to host partition */
> +	/* Increment the threads-exiting-guest count in the 0xff00
> +	   bits of vcore->entry_exit_count */
> +	lwsync
> +	ld	r5,PACA_KVM_VCORE(r13)
> +	addi	r6,r5,VCORE_ENTRY_EXIT
> +41:	lwarx	r3,0,r6
> +	addi	r0,r3,0x100
> +	stwcx.	r0,0,r6
> +	bne	41b
> +
> +	/* If this is not a HDEC interrupt and there are other threads,
> +	   and we were the first thread to take an interrupt,
> +	   set HDEC to 0 to pull the other threads out of the guest. */
> +	cmpwi	r12,0x980
> +	beq	40f
> +	cmpwi	r3,0x100

good old "use the define" comment :)
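
(Incidentally, the entry/exit accounting is easier to follow in C. A sketch only, mirroring the comment on struct kvmppc_vcore - the real code necessarily does this with lwarx/stwcx. in the entry and exit paths:)

	/* Sketch: entry count in bits 0-7 of entry_exit_count, exit count in
	 * bits 8-15.  A thread may join the guest only while no thread has
	 * started to exit; the compare-and-swap loop enforces that atomically. */
	static int try_enter_guest(struct kvmppc_vcore *vc)
	{
		int old, new;

		do {
			old = vc->entry_exit_count;
			if (old >= 0x100)	/* some thread is already exiting */
				return 0;
			new = old + 1;		/* one more thread entering */
		} while (cmpxchg(&vc->entry_exit_count, old, new) != old);
		return 1;
	}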

> +	bge	40f
> +	cmpwi	r3,1
> +	ble	40f
> +	li	r0,0
> +	mtspr	SPRN_HDEC,r0
> +40:
> +
> +	/* Secondary threads wait for primary to do partition switch */
> 	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
> +	ld	r5,PACA_KVM_VCORE(r13)
> +	lwz	r3,VCPU_PTID(r9)
> +	cmpwi	r3,0
> +	beq	15f
> +	HMT_LOW
> +13:	lbz	r3,VCORE_IN_GUEST(r5)
> +	cmpwi	r3,0
> +	bne	13b
> +	HMT_MEDIUM
> +	b	16f
> +
> +	/* Primary thread waits for all the secondaries to exit guest */
> +15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
> +	srwi	r0,r3,8
> +	clrldi	r3,r3,56
> +	cmpw	r3,r0
> +	bne	15b
> +	isync
> +
> +	/* Primary thread switches back to host partition */
> 	ld	r6,KVM_HOST_SDR1(r4)
> 	lwz	r7,KVM_HOST_LPID(r4)
> 	li	r8,0x3ff		/* switch to reserved LPID */
> @@ -550,10 +642,12 @@ hdec_soon:
> 	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
> 	mtspr	SPRN_LPID,r7
> 	isync
> +	li	r0,0
> +	stb	r0,VCORE_IN_GUEST(r5)
> 	lis	r8,0x7fff
> 	mtspr	SPRN_HDEC,r8
> 
> -	ld	r8,KVM_HOST_LPCR(r4)
> +16:	ld	r8,KVM_HOST_LPCR(r4)
> 	mtspr	SPRN_LPCR,r8
> 	isync
> 
> @@ -662,6 +756,11 @@ hdec_soon:
> 	mr	r3, r9
> 	bl	.kvmppc_save_fp
> 
> +	/* Secondary threads go off to take a nap */
> +	lwz	r0,VCPU_PTID(r3)
> +	cmpwi	r0,0
> +	bne	secondary_nap
> +
> 	/* RFI into the highmem handler */
> 	mfmsr	r7
> 	ori	r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME	/* Enable paging */
> @@ -807,3 +906,53 @@ _GLOBAL(kvmppc_h_set_dabr)
> 	mtspr	SPRN_DABR,r4
> 	li	r3,0
> 	blr
> +
> +secondary_too_late:
> +	ld	r5,PACA_KVM_VCORE(r13)
> +	HMT_LOW
> +13:	lbz	r3,VCORE_IN_GUEST(r5)
> +	cmpwi	r3,0
> +	bne	13b
> +	HMT_MEDIUM
> +	ld	r11,PACA_SLBSHADOWPTR(r13)
> +
> +	.rept	SLB_NUM_BOLTED
> +	ld	r5,SLBSHADOW_SAVEAREA(r11)
> +	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
> +	andis.	r7,r5,SLB_ESID_V@h
> +	beq	1f
> +	slbmte	r6,r5
> +1:	addi	r11,r11,16
> +	.endr
> +	b	50f
> +
> +secondary_nap:
> +	/* Clear any pending IPI */
> +50:	ld	r5, PACA_XICS_PHYS(r13)
> +	li	r0, 0xff
> +	li	r6, XICS_QIRR
> +	stbcix	r0, r5, r6
> +
> +	/* increment the nap count and then go to nap mode */
> +	ld	r4, PACA_KVM_VCORE(r13)
> +	addi	r4, r4, VCORE_NAP_COUNT
> +	lwsync				/* make previous updates visible */
> +51:	lwarx	r3, 0, r4
> +	addi	r3, r3, 1
> +	stwcx.	r3, 0, r4
> +	bne	51b
> +	isync
> +
> +	mfspr	r4, SPRN_LPCR
> +	li	r0, LPCR_PECE
> +	andc	r4, r4, r0
> +	ori	r4, r4, LPCR_PECE0	/* exit nap on interrupt */
> +	mtspr	SPRN_LPCR, r4
> +	li	r0, 0
> +	std	r0, SHADOW_VCPU_OFF + SVCPU_SCRATCH0(r13)
> +	ptesync
> +	ld	r0, SHADOW_VCPU_OFF + SVCPU_SCRATCH0(r13)
> +1:	cmpd	r0, r0
> +	bne	1b
> +	nap
> +	b	.
> diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
> index be5e3d7..01149c0 100644
> --- a/arch/powerpc/sysdev/xics/icp-native.c
> +++ b/arch/powerpc/sysdev/xics/icp-native.c
> @@ -23,6 +23,7 @@
> #include <asm/irq.h>
> #include <asm/errno.h>
> #include <asm/xics.h>
> +#include <asm/kvm_ppc.h>
> 
> struct icp_ipl {
> 	union {
> @@ -142,6 +143,11 @@ static inline void icp_native_do_message(int cpu, int msg)
> 	icp_native_set_qirr(cpu, IPI_PRIORITY);
> }
> 
> +void xics_wake_cpu(int cpu)
> +{
> +	icp_native_set_qirr(cpu, IPI_PRIORITY);
> +}
> +
> static void icp_native_message_pass(int target, int msg)
> {
> 	unsigned int i;
> @@ -204,6 +210,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr,
> 	}
> 
> 	icp_native_regs[cpu] = ioremap(addr, size);
> +	kvmppc_set_xics_phys(cpu, addr);
> 	if (!icp_native_regs[cpu]) {
> 		pr_warning("icp_native: Failed ioremap for CPU %d, "
> 			   "interrupt server #0x%x, addr %#lx\n",
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 3d3cdf1..952d556 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -550,6 +550,9 @@ struct kvm_ppc_pvinfo {
> #ifdef __KVM_HAVE_SPAPR_TCE
> #define KVM_CAP_SPAPR_TCE 60
> #endif
> +#ifdef __KVM_HAVE_PPC_SMT
> +#define KVM_CAP_PPC_SMT 61
> +#endif
> 
> #ifdef KVM_CAP_IRQ_ROUTING
> 
> -- 
> 1.7.4.4
> 

Maybe I also missed the point here, but how does this correlate with Linux threads? Is each vcpu running in its own Linux thread? How does the scheduling happen? IIUC the host only sees a single thread per core and then distributes the vcpus to the respective host threads.

Alex



^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-17  8:01     ` Alexander Graf
@ 2011-05-17  9:11       ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 103+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-17  9:11 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Paul Mackerras, linuxppc-dev, kvm

On Tue, 2011-05-17 at 10:01 +0200, Alexander Graf wrote:
> I'm not sure I fully understand how this is supposed to work. If the
> tables are kept inside the kernel, how does userspace get to know
> where to DMA to?

The guest gets a dma range from the device-tree which is the range of
device-side dma addresses it can use that correspond to the table.

The guest kernel uses the normal linux iommu space allocator to allocate
space in that region and uses H_PUT_TCE to populate the corresponding
table entries.

This is the same interface that is used for "real" iommu's with PCI
devices btw.
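For the curious, the guest-side flow described above looks roughly like this
(a sketch only -- the TCE flag constants and helper name are illustrative,
not the actual Linux iommu code):

#include <linux/types.h>
#include <asm/hvcall.h>
#include <asm/page.h>

#define TCE_RD	0x1UL		/* illustrative access bits */
#define TCE_WR	0x2UL

/* Install one 4k page at I/O bus address "ioba" inside the DMA window */
static long guest_map_page(unsigned long liobn, unsigned long ioba,
			   unsigned long real_addr, bool writable)
{
	unsigned long tce = (real_addr & PAGE_MASK) | TCE_RD;

	if (writable)
		tce |= TCE_WR;
	return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tce);
}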

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-17  9:11       ` Benjamin Herrenschmidt
@ 2011-05-17  9:31         ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  9:31 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Paul Mackerras, linuxppc-dev, kvm


On 17.05.2011, at 11:11, Benjamin Herrenschmidt wrote:

> On Tue, 2011-05-17 at 10:01 +0200, Alexander Graf wrote:
>> I'm not sure I fully understand how this is supposed to work. If the
>> tables are kept inside the kernel, how does userspace get to know
>> where to DMA to?
> 
> The guest gets a dma range from the device-tree which is the range of
> device-side dma addresses it can use that correspond to the table.
> 
> The guest kernel uses the normal linux iommu space allocator to allocate
> space in that region and uses H_PUT_TCE to populate the corresponding
> table entries.
> 
> This is the same interface that is used for "real" iommu's with PCI
> devices btw.

I'm still slightly puzzled here :). IIUC the main point of an IOMMU is for the kernel to change where device accesses actually go to. So device DMAs address A, goes through the IOMMU, in reality accesses address B.

Now, how do we tell the devices implemented in qemu that they're supposed to DMA to address B instead of A if the mapping table is kept in-kernel?


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-17  9:31         ` Alexander Graf
@ 2011-05-17  9:35           ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 103+ messages in thread
From: Benjamin Herrenschmidt @ 2011-05-17  9:35 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Paul Mackerras, linuxppc-dev, kvm

On Tue, 2011-05-17 at 11:31 +0200, Alexander Graf wrote:
> On 17.05.2011, at 11:11, Benjamin Herrenschmidt wrote:
> 
> > On Tue, 2011-05-17 at 10:01 +0200, Alexander Graf wrote:
> >> I'm not sure I fully understand how this is supposed to work. If the
> >> tables are kept inside the kernel, how does userspace get to know
> >> where to DMA to?
> > 
> > The guest gets a dma range from the device-tree which is the range of
> > device-side dma addresses it can use that correspond to the table.
> > 
> > The guest kernel uses the normal linux iommu space allocator to allocate
> > space in that region and uses H_PUT_TCE to populate the corresponding
> > table entries.
> > 
> > This is the same interface that is used for "real" iommu's with PCI
> > devices btw.
> 
> I'm still slightly puzzled here :). IIUC the main point of an IOMMU is for the kernel
> to change where device accesses actually go to. So device DMAs address A, goes through
> the IOMMU, in reality accesses address B.

Right :-)

> Now, how do we tell the devices implemented in qemu that they're supposed to DMA to
> address B instead of A if the mapping table is kept in-kernel?

Oh, because qemu mmaps the table :-)

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode
  2011-05-17  9:35           ` Benjamin Herrenschmidt
@ 2011-05-17  9:39             ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  9:39 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Paul Mackerras, linuxppc-dev, kvm


On 17.05.2011, at 11:35, Benjamin Herrenschmidt wrote:

> On Tue, 2011-05-17 at 11:31 +0200, Alexander Graf wrote:
>> On 17.05.2011, at 11:11, Benjamin Herrenschmidt wrote:
>> 
>>> On Tue, 2011-05-17 at 10:01 +0200, Alexander Graf wrote:
>>>> I'm not sure I fully understand how this is supposed to work. If the
>>>> tables are kept inside the kernel, how does userspace get to know
>>>> where to DMA to?
>>> 
>>> The guest gets a dma range from the device-tree which is the range of
>>> device-side dma addresses it can use that correspond to the table.
>>> 
>>> The guest kernel uses the normal linux iommu space allocator to allocate
>>> space in that region and uses H_PUT_TCE to populate the corresponding
>>> table entries.
>>> 
>>> This is the same interface that is used for "real" iommu's with PCI
>>> devices btw.
>> 
>> I'm still slightly puzzled here :). IIUC the main point of an IOMMU is for the kernel
>> to change where device accesses actually go to. So device DMAs address A, goes through
>> the IOMMU, in reality accesses address B.
> 
> Right :-)
> 
>> Now, how do we tell the devices implemented in qemu that they're supposed to DMA to
>> address B instead of A if the mapping table is kept in-kernel?
> 
> Oh, bcs qemu mmaps the table :-)

That's the piece of the puzzle I was missing. Please document that interface properly - it needs to be rock stable :)
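For illustration only, this is the kind of lookup qemu could then do against
the mmap'ed table when an emulated device DMAs -- how qemu actually obtains
that mapping is exactly the interface that needs documenting; the names here
are assumptions:

#include <stdint.h>

/* Sketch: tce_table is the mmap'ed in-kernel table, window_start is the
 * base of the guest's DMA window; both are assumptions for this example. */
static uint64_t dma_to_guest_phys(const uint64_t *tce_table,
				  uint64_t window_start, uint64_t dma_addr)
{
	uint64_t tce = tce_table[(dma_addr - window_start) >> 12];	/* 4k TCEs */

	return (tce & ~0xfffULL) | (dma_addr & 0xfffULL);
}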


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
@ 2011-05-17  9:46   ` Alexander Graf
  2011-05-11 10:38 ` [PATCH 02/13] kvm/powerpc: Fix kvmppc_core_pending_dec Paul Mackerras
                     ` (12 subsequent siblings)
  13 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17  9:46 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm


On 11.05.2011, at 12:34, Paul Mackerras wrote:

> The following series of patches enable KVM to exploit the hardware
> hypervisor mode on 64-bit Power ISA Book3S machines.  At present only
> POWER7 is supported, but it would be easy to add other processors.
> 
> Running the KVM host in hypervisor mode means that the guest can use
> both supervisor mode and user mode.  That means that the guest can
> execute supervisor-privilege instructions and access supervisor-
> privilege registers.  In addition the hardware directs most exceptions
> to the guest.  Thus we don't need to emulate any instructions in the
> host.  Generally, the only times we need to exit the guest are when it
> does a hypercall or when an external interrupt or host timer
> (decrementer) interrupt occurs.
> 
> The focus of this KVM implementation is to run guests that use the
> PAPR (Power Architecture Platform Requirements) paravirtualization
> interface, which is the interface supplied by PowerVM on IBM pSeries
> machines.
> 
> These patches are against Ben Herrenschmidt's next branch in his tree
> at git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc.git.

Very nice patches indeed :). Is there any way I can test them? I don't like pulling code that I can't run anywhere yet.


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-16  5:58       ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
  (?)
@ 2011-05-17 10:17         ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17 10:17 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc


On 16.05.2011, at 07:58, Paul Mackerras wrote:

> On Sun, May 15, 2011 at 11:58:12PM +0200, Alexander Graf wrote:
>> 
>> On 11.05.2011, at 12:44, Paul Mackerras wrote:
> 
>>> +#ifdef CONFIG_KVM_BOOK3S_NONHV
>> 
>> I really liked how you called the .c file _pr - why call it NONHV now?
> 
> I agree, CONFIG_KVM_BOOK3S_PR would be better, I'll change it.
> 
>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>> index 7412676..8dba5f6 100644
>>> --- a/arch/powerpc/include/asm/paca.h
>>> +++ b/arch/powerpc/include/asm/paca.h
>>> @@ -149,6 +149,16 @@ struct paca_struct {
>>> #ifdef CONFIG_KVM_BOOK3S_HANDLER
>>> 	/* We use this to store guest state in */
>>> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
>>> +#ifdef CONFIG_KVM_BOOK3S_64_HV
>>> +	struct kvm_vcpu *kvm_vcpu;
>>> +	u64 dabr;
>>> +	u64 host_mmcr[3];
>>> +	u32 host_pmc[6];
>>> +	u64 host_purr;
>>> +	u64 host_spurr;
>>> +	u64 host_dscr;
>>> +	u64 dec_expires;
>> 
>> Hrm. I'd say either push those into shadow_vcpu for HV mode or get
>> rid of the shadow_vcpu reference. I'd probably prefer the former.
> 
> These are fields that are pieces of host state that we need to save
> and restore across execution of a guest; they don't apply to any
> specific guest or vcpu.  That's why I didn't put them in shadow_vcpu,
> which is specifically for one vcpu in one guest.  
> 
> Given that book3s_pr copies the shadow_vcpu into and out of the paca,
> I thought it best not to add fields to it that are only live while we
> are in the guest.  True, these fields only exist for book3s_hv, but if
> we later on make it possible to select book3s_pr vs. book3s_hv at
> runtime, we won't want to be copying these fields into and out of the
> paca when book3s_pr is active.
> 
> Maybe we need another struct, kvm_host_state or something like that,
> to save this sort of state.

Yeah, just put them into a different struct then. I don't want to clutter the PACA struct with kvm fields all over :).
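Something along these lines, presumably -- a rough sketch of the split being
discussed, with the field list copied from the patch above and the struct
name only a suggestion:

#include <linux/types.h>

/* The paca would then embed a single "struct kvmppc_host_state kvm_hstate;"
 * instead of the individual fields. */
struct kvmppc_host_state {
	u64 dabr;
	u64 host_mmcr[3];
	u32 host_pmc[6];
	u64 host_purr;
	u64 host_spurr;
	u64 host_dscr;
	u64 dec_expires;
};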

> 
>>> @@ -65,6 +98,7 @@ config KVM_440
>>> 	bool "KVM support for PowerPC 440 processors"
>>> 	depends on EXPERIMENTAL && 44x
>>> 	select KVM
>>> +	select KVM_MMIO
>> 
>> e500 should also select MMIO, no?
> 
> Good point, I'll fix that.
> 
>>> +long kvmppc_alloc_hpt(struct kvm *kvm)
>>> +{
>>> +	unsigned long hpt;
>>> +	unsigned long lpid;
>>> +
>>> +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
>>> +			       HPT_ORDER - PAGE_SHIFT);
>> 
>> This would end up failing quite often, no?
> 
> In practice it seems to be OK, possibly because the machines we're
> testing this on have plenty of memory.  Maybe we should get qemu to
> allocate the HPT using hugetlbfs so the memory will come from the
> reserved page pool.  It does need to be physically contiguous and
> aligned on a multiple of its size -- that's a hardware requirement.

Yes, I'd certainly prefer to see qemu allocate it. That'd also make things easier for migration later, as we still have access to the hpt. But then again - we can't really reuse the mappings there anyways, as they'll all be host mappings. Phew. Have you given that some thought yet? We can probably just ignore non-bolted entries - but the bolted ones need to be transferred.

Also, if we have qemu allocate the hpt memory, qemu can modify the mappings and thus break out. Bleks. It should definitely not be able to write to the hpt after it's actually being used by kvm.
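A minimal sketch of the hugetlbfs idea Paul mentions, purely for illustration
(path, size and how the region would eventually be handed to KVM are all
assumptions; a single 16MB huge page happens to be physically contiguous and
naturally aligned):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPT_SIZE (16UL << 20)	/* illustrative; must be a power of two, naturally aligned */

static void *alloc_hpt(void)
{
	int fd = open("/dev/hugepages/kvm-hpt", O_CREAT | O_RDWR, 0600);
	void *hpt;

	if (fd < 0)
		return NULL;
	if (ftruncate(fd, HPT_SIZE) < 0) {
		close(fd);
		return NULL;
	}
	hpt = mmap(NULL, HPT_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	return hpt == MAP_FAILED ? NULL : hpt;
}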

> 
>>> +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
>>> +	kvm->arch.lpid = lpid;
>>> +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
>>> +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
>>> +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
>> 
>> How do these correlate with the guest's hv mmu? I'd like to keep the
>> code clean enough so we can potentially use it for PR mode as well :). 
> 
> The host SDR1 and LPID are different from the guest's.  That is, the
> guest has its own HPT which is quite separate from the host's.  The
> host values could be saved in global variables, though; there's no
> real need for each VM to have its own copy, except that doing it this
> way simplifies the low-level assembly code a little.

Well, my point is that I tried to separate the MMU implementation for the PR KVM stuff. What I'd like to see at the end of the day would be a "guest" hv implementation file that I could plug into PR kvm and have the MMU rolling by using the exact same code as the HV code. Only the backend would be different. Maybe there's some valid technical reason to not do it, but I haven't come across any yet :).

> 
>>> +	/* First see what page size we have */
>>> +	psize = user_page_size(mem->userspace_addr);
>>> +	/* For now, only allow 16MB pages */
>> 
>> The reason to go for 16MB pages is because of the host mmu code, not
>> the guest hv mmu. So please at least #ifdef the code to HV so we
>> document that correlation. 
> 
> I'm not sure what you mean by that.  The reason for going for 16MB
> pages initially is for performance (this way the guest can use 16MB
> pages for its linear mapping) and to avoid having to deal with the
> pages being paged or swapped on the host side.  Could you explain the
> "because of the host mmu code" part of your comment further?

If you consider a split as I described above, an HV guest running on PR KVM doesn't have to be mapped linearly. But then again - it could. In fact, it might even make sense. Hrm. Very good point! We could also just flip SDR1 in PR mode and reuse the exact same code as the HV mode.

However, my point here was that when we don't flip SDR1 but go through a shadow hpt, we don't have to map it to 16MB pages.

> 
> What would adding #ifdef CONFIG_KVM_BOOK3S_64_HV achieve in a file
> whose name ends in _hv.c and which only gets compiled when
> CONFIG_KVM_BOOK3S_64_HV=y?
> 
>>> +int kvmppc_mmu_hv_init(void)
>>> +{
>>> +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
>>> +		return 0;
>> 
>> Return 0 for "it doesn't work" might not be the right exit code ;).
> 
> Good point, I'll make it -ENXIO or something.

Anything < 0 sounds good to me :). -EINVAL is probably the most sensible one here.

> 
>>> +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>>> +				struct kvmppc_pte *gpte, bool data)
>>> +{
>>> +	return -ENOENT;
>> 
>> Can't you just call the normal book3s_64 mmu code here? Without
>> xlate, doing ppc_ld doesn't work, which means that reading out the
>> faulting guest instruction breaks. We'd also need it for PR mode :). 
> 
> With book3s_hv we currently have no situations where we need to read
> out the faulting guest instruction.  
> 
> We could use the normal code here if we had the guest HPT mapped into
> the qemu address space, which it currently isn't -- but should be.  It
> hasn't been a priority to fix because with book3s_hv we currently have
> no situations where we need to read out the faulting guest
> instruction.

Makes sense. We might however need it in case we ever use this code in PR mode :). I don't fully remember if you shoved around that code, but in case fetching the last instruction fails (which IIRC it can for you too), a manual load through this function gets issued.

> 
>>> +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
>>> +{
>>> +	vcpu->arch.pvr = pvr;
>>> +	kvmppc_mmu_book3s_hv_init(vcpu);
>> 
>> No need for you to do it depending on pvr. You can just do the mmu
>> initialization on normal init :).
> 
> OK, I'll do that.
> 
>>> +	case BOOK3S_INTERRUPT_PROGRAM:
>>> +	{
>>> +		ulong flags;
>>> +		/*
>>> +		 * Normally program interrupts are delivered directly
>>> +		 * to the guest by the hardware, but we can get here
>>> +		 * as a result of a hypervisor emulation interrupt
>>> +		 * (e40) getting turned into a 700 by BML RTAS.
>> 
>> Not sure I fully understand what's going on there. Mind to explain? :)
> 
> Recent versions of the architecture don't actually deliver a 0x700
> interrupt to the OS on an illegal instruction; instead they generate
> an 0xe40 interrupt to the hypervisor in case the hypervisor wants to
> emulate the instruction.  If the hypervisor doesn't want to do that
> it's supposed to synthesize a 0x700 interrupt to the OS.
> 
> When we're running this stuff under our BML (Bare Metal Linux)
> framework in the lab, we use a small RTAS implementation that gets
> loaded by the BML loader, and one of the things that this RTAS does is
> to patch the 0xe40 vector to make the 0xe40 interrupt come in via the
> 0x700 vector, in case the kernel you're running under BML hasn't been
> updated to have an 0xe40 handler.
> 
> I could just remove that case, in fact.

Yeah, the main issue I see here is that there's no hardware that could run this code. Whatever the first hardware that can actually run it requires should be what we use here.

> 
>>> +	case BOOK3S_INTERRUPT_SYSCALL:
>>> +	{
>>> +		/* hcall - punt to userspace */
>>> +		int i;
>>> +
>> 
>> Do we really want to accept sc from pr=1? I'd just reject them straight away.
> 
> Good idea, I'll do that.
> 
>>> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
>>> +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
>>> +		vcpu->arch.dear = vcpu->arch.fault_dar;
>>> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
>>> +		r = RESUME_GUEST;
>>> +		break;
>>> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
>>> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
>>> +					0x08000000);
>> 
>> What do these do? I thought the guest gets DSI and ISI directly?
> 
> It does for accesses with relocation (IR/DR) on, but because we have
> enabled VRMA mode (Virtualized Real Mode Area) in the LPCR, we get
> these interrupts to the hypervisor if the guest does a bad real-mode
> access.  If we also turned on Virtualized Partition Memory (VPM) mode
> in the LPCR, then all ISI/DSI in the guest come through to the
> hypervisor using these vectors in case the hypervisor wants to do any
> paging/swapping of guest memory.  I plan to do that later when we
> support using 4k/64k pages for guest memory.

I see - please put that explanation in a comment there ;).
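Roughly this, based on the explanation above:

	/*
	 * With the VRMA (Virtualized Real Mode Area) enabled in the LPCR,
	 * bad guest real-mode accesses are reflected here as HDSI/HISI
	 * instead of going to the guest; relocation-on ISI/DSI still go
	 * straight to the guest.  If VPM were also enabled, all guest
	 * ISI/DSI would arrive via these vectors, which is the plan once
	 * 4k/64k pages can be used for guest memory.
	 */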

> 
>>> +	/* default to book3s_64 (power7) */
>>> +	vcpu->arch.pvr = 0x3f0200;
>> 
>> Wouldn't it make sense to simply default to the host pvr? Not sure -
>> maybe it's not :).
> 
> Sounds sensible, actually.  In fact the hypervisor can't spoof the PVR
> for the guest, that is, the guest can read the real PVR value and
> there's no way the hypervisor can stop it.

If it can't spoof it at all, then yes, use the host pvr.

In fact, thinking about this, how does userspace know which mode the kernel uses? It should be able to fail if we're trying to run a -M mac99 on this code ;).

> 
>>> +	flush_fp_to_thread(current);
>> 
>> Do those work fine with preemption enabled?
> 
> Yes.  If a preemption happens, it will flush the FP/VMX/VSX registers
> out to the thread_struct, and then any of these explicit calls that
> happen after the preemption will do nothing.

Ah, the flushes themselves disable preemption during the flush :). Then it makes sense.
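
For reference, the flush helpers follow roughly this shape (a paraphrase, not the exact kernel source):

	void flush_fp_to_thread(struct task_struct *tsk)
	{
		if (tsk->thread.regs) {
			/* preemption stays off while the FP state is
			 * written back into the thread_struct */
			preempt_disable();
			if (tsk->thread.regs->msr & MSR_FP)
				giveup_fpu(tsk);
			preempt_enable();
		}
	}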

> 
>>> +	/*
>>> +	 * Put whatever is in the decrementer into the
>>> +	 * hypervisor decrementer.
>>> +	 */
>>> +	mfspr	r8,SPRN_DEC
>>> +	mftb	r7
>>> +	mtspr	SPRN_HDEC,r8
>> 
>> Can't we just always use HDEC on the host? That'd save us from all
>> the swapping here.
> 
> The problem is that there is only one HDEC per core, so using it would
> become complicated when the host is running in SMT4 or SMT2 mode.

I see.

> 
>>> +	extsw	r8,r8
>>> +	add	r8,r8,r7
>>> +	std	r8,PACA_KVM_DECEXP(r13)
>> 
>> Why is dec+hdec on vcpu_run decexp? What exactly does this store?
> 
> R7 here is the current (or very recent, anyway) timebase value, so
> this stores the timebase value at which the host decrementer would get
> to zero.
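
In C terms the computation is roughly this (variable names are purely illustrative):

	/* DEC is a signed 32-bit down-counter and the timebase counts up at
	 * the same rate, so this is the timebase value at which the host
	 * decrementer would reach zero. */
	s64 host_dec = (s32)mfspr(SPRN_DEC);
	u64 dec_expires = mftb() + host_dec;	/* what gets stored at PACA_KVM_DECEXP */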
> 
>>> +	lwz	r3, PACA_HOST_PMC(r13)
>>> +	lwz	r4, PACA_HOST_PMC + 4(r13)
>>> +	lwz	r5, PACA_HOST_PMC + 4(r13)
>> 
>> copy&paste error?
> 
> Yes, thanks.
> 
>>> +.global kvmppc_handler_lowmem_trampoline
>>> +kvmppc_handler_lowmem_trampoline:
>>> +	cmpwi	r12,0x500
>>> +	beq	1f
>>> +	cmpwi	r12,0x980
>>> +	beq	1f
>> 
>> What?
>> 
>> 1) use the macros please
> 
> OK
> 
>> 2) why?
> 
> The external interrupt and hypervisor decrementer interrupt handlers
> expect the interrupted PC and MSR to be in HSRR0/1 rather than
> SRR0/1.  I could remove the case for 0x980 since we don't call the
> linux handler for HDEC interrupts any more.

Ah, makes sense. This also deserves a comment :)

> 
>>> +	/* Check if HDEC expires soon */
>>> +	mfspr	r3,SPRN_HDEC
>>> +	cmpwi	r3,10
>>> +	li	r12,0x980
>> 
>> define
> 
> OK.
> 
>>> +	mr	r9,r4
>>> +	blt	hdec_soon
>> 
>> Is it faster to do the check than to save the check and take the
>> odds? Also, maybe we should rather do the check in preemptible
>> code that could just directly pass the time slice on.
> 
> I do the check there because I was having problems where, if the HDEC
> goes negative before we do the partition switch, we would occasionally
> not get the HDEC interrupt at all until the next time HDEC went
> negative, ~ 8.4 seconds later.

Yikes - so HDEC is edge and doesn't even keep the interrupt line up? That sounds like a serious hardware limitation. What if you only use HDEC and it triggers while interrupts are off in a critical section? Is the hardware really that broken?

> 
>>> +	/* See if this is a leftover HDEC interrupt */
>>> +	cmpwi	r12,0x980
>> 
>> define
> 
> OK.
> 
>>> +	bne	2f
>>> +	mfspr	r3,SPRN_HDEC
>>> +	cmpwi	r3,0
>>> +	bge	ignore_hdec
>>> +2:
>>> +
>>> +	/* Check for mediated interrupts (could be done earlier really ...) */
>>> +	cmpwi	r12,0x500
>> 
>> define
> 
> OK.
> 
>>> +	bne+	1f
>>> +	ld	r5,VCPU_LPCR(r9)
>>> +	andi.	r0,r11,MSR_EE
>>> +	beq	1f
>>> +	andi.	r0,r5,LPCR_MER
>>> +	bne	bounce_ext_interrupt
>> 
>> So there's no need for the external check that directly goes into
>> the Linux handler code on full-fledged exits?
> 
> No, we still need that; ordinary external interrupts come to the
> hypervisor regardless of the guest's MSR.EE setting.
> 
> The MER bit (Mediated External Request) is a way for the hypervisor to
> know when the guest sets its MSR.EE bit.  If an event happens that
> means the host wants to give a guest vcpu an external interrupt, but
> the guest vcpu has MSR.EE = 0, then the host can't deliver the
> simulated external interrupt.  Instead it sets LPCR.MER, which has the
> effect that the hardware will deliver an external interrupt (to the
> hypervisor) when running in the guest and it has MSR.EE = 1.
> 
> So, when we get an external interrupt, LPCR.MER = 1 and MSR.EE = 1,
> we need to synthesize an external interrupt in the guest.  Doing it
> here means that we don't need to do the full partition switch out to
> the host and back again.

Ah, special functionality then. Please comment this in the code, so the unknowledgeable reader (me) knows what this is about.
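
Restated in C, the check in the exit path amounts to something like the helper below (the helper form is only for illustration; the real code stays in the assembly exit path):

	static void maybe_bounce_mediated_irq(struct kvm_vcpu *vcpu, int trap,
					      unsigned long guest_msr,
					      unsigned long lpcr)
	{
		/* LPCR.MER means the host has a virtual external interrupt
		 * queued and was waiting for the guest to set MSR.EE, so
		 * deliver it here without a full partition switch. */
		if (trap == BOOK3S_INTERRUPT_EXTERNAL &&
		    (guest_msr & MSR_EE) && (lpcr & LPCR_MER))
			kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_EXTERNAL, 0);
	}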

> 
>>> +	/* Read the guest SLB and save it away */
>>> +	li	r6,0
>>> +	addi	r7,r9,VCPU_SLB
>>> +	li	r5,0
>>> +1:	slbmfee	r8,r6
>>> +	andis.	r0,r8,SLB_ESID_V@h
>>> +	beq	2f
>>> +	add	r8,r8,r6		/* put index in */
>>> +	slbmfev	r3,r6
>>> +	std	r8,VCPU_SLB_E(r7)
>>> +	std	r3,VCPU_SLB_V(r7)
>>> +	addi	r7,r7,VCPU_SLB_SIZE
>>> +	addi	r5,r5,1
>>> +2:	addi	r6,r6,1
>>> +	cmpwi	r6,32
>> 
>> I don't like how the 32 is hardcoded here. Better create a define
>> for it and use the same in the init code.
> 
> Sure.  In fact I probably should use vcpu->arch.slb_nr here.

and bctr? yup :)
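
For anyone reading along, the loop in C-level terms (illustrative only; slb_nr and the orige/origv fields follow the kvmppc_slb structure used in this series):

	int i, n = 0;
	unsigned long e, v;

	for (i = 0; i < vcpu->arch.slb_nr; i++) {
		asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
		if (!(e & SLB_ESID_V))
			continue;		/* skip invalid entries */
		asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
		vcpu->arch.slb[n].orige = e | i;	/* put the index back in */
		vcpu->arch.slb[n].origv = v;
		n++;
	}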

> 
>>> +hdec_soon:
>>> +	/* Switch back to host partition */
>>> +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
>>> +	ld	r6,KVM_HOST_SDR1(r4)
>>> +	lwz	r7,KVM_HOST_LPID(r4)
>>> +	li	r8,0x3ff		/* switch to reserved LPID */
>> 
>> is it reserved by ISA? Either way, hard-coding the constant without
>> a name is not nice :).
> 
> Actually, it just has to be an LPID value that isn't ever used for
> running a real guest (or the host).  I'll make a name for it.
> 
>>> +	lis	r8,0x7fff
>> 
>> INT_MAX@h might be more readable.
> 
> OK.
> 
>>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
>>> {
>>> -	return !(v->arch.shared->msr & MSR_WE) ||
>>> +#ifndef CONFIG_KVM_BOOK3S_64_HV
>>> +	return !(kvmppc_get_msr(v) & MSR_WE) ||
>>> 	       !!(v->arch.pending_exceptions);
>>> +#else
>>> +	return 1;
>> 
>> So what happens if the guest sets MSR_WE? It just stays in guest
>> context idling? That'd be pretty bad for scheduling on the host.
> 
> The MSR_WE bit doesn't exist on POWER7 (or any of POWER[4567], in
> fact).  If the guest wants to idle it calls the H_CEDE hcall.

Ah, interesting. Didn't know :).

> 
>>> @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
>>> 	case KVM_CAP_PPC_IRQ_LEVEL:
>>> 	case KVM_CAP_ENABLE_CAP:
>>> 	case KVM_CAP_PPC_OSI:
>>> +#ifndef CONFIG_KVM_BOOK3S_64_HV
>> 
>> You also don't do OSI on HV :).
> 
> Good point, I'll fix that.
> 
>>> @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
>>> 	if (waitqueue_active(&vcpu->wq)) {
>>> 		wake_up_interruptible(&vcpu->wq);
>>> 		vcpu->stat.halt_wakeup++;
>>> +#ifdef CONFIG_KVM_BOOK3S_64_HV
>>> +	} else if (vcpu->cpu != -1) {
>>> +		smp_send_reschedule(vcpu->cpu);
>> 
>> Shouldn't this be done for non-HV too? The only reason we don't do
>> it yet is because we don't do SMP, no?
> 
> I didn't know why you didn't do it for non-HV, so I didn't change it
> for that case.  If you say it's OK, I'll change it (we'll need to set
> vcpu->cpu in the vcpu_load/unload code for book3s_pr too, then).

Sure, go ahead and set it.
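
A sketch of what that would mean for the book3s_pr vcpu_load/put hooks (only the vcpu->cpu lines are new, the rest is elided):

	void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
	{
		vcpu->cpu = cpu;	/* lets kvm_vcpu_ioctl_interrupt() kick a running vcpu */
		/* ... existing load code ... */
	}

	void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
	{
		/* ... existing put code ... */
		vcpu->cpu = -1;
	}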


Alex

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
@ 2011-05-17 10:17         ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17 10:17 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc


On 16.05.2011, at 07:58, Paul Mackerras wrote:

> On Sun, May 15, 2011 at 11:58:12PM +0200, Alexander Graf wrote:
>> 
>> On 11.05.2011, at 12:44, Paul Mackerras wrote:
> 
>>> +#ifdef CONFIG_KVM_BOOK3S_NONHV
>> 
>> I really liked how you called the .c file _pr - why call it NONHV now?
> 
> I agree, CONFIG_KVM_BOOK3S_PR would be better, I'll change it.
> 
>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>> index 7412676..8dba5f6 100644
>>> --- a/arch/powerpc/include/asm/paca.h
>>> +++ b/arch/powerpc/include/asm/paca.h
>>> @@ -149,6 +149,16 @@ struct paca_struct {
>>> #ifdef CONFIG_KVM_BOOK3S_HANDLER
>>> 	/* We use this to store guest state in */
>>> 	struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
>>> +#ifdef CONFIG_KVM_BOOK3S_64_HV
>>> +	struct kvm_vcpu *kvm_vcpu;
>>> +	u64 dabr;
>>> +	u64 host_mmcr[3];
>>> +	u32 host_pmc[6];
>>> +	u64 host_purr;
>>> +	u64 host_spurr;
>>> +	u64 host_dscr;
>>> +	u64 dec_expires;
>> 
>> Hrm. I'd say either push those into shadow_vcpu for HV mode or get
>> rid of the shadow_vcpu reference. I'd probably prefer the former.
> 
> These are fields that are pieces of host state that we need to save
> and restore across execution of a guest; they don't apply to any
> specific guest or vcpu.  That's why I didn't put them in shadow_vcpu,
> which is specifically for one vcpu in one guest.  
> 
> Given that book3s_pr copies the shadow_vcpu into and out of the paca,
> I thought it best not to add fields to it that are only live while we
> are in the guest.  True, these fields only exist for book3s_hv, but if
> we later on make it possible to select book3s_pr vs. book3s_hv at
> runtime, we won't want to be copying these fields into and out of the
> paca when book3s_pr is active.
> 
> Maybe we need another struct, kvm_host_state or something like that,
> to save this sort of state.

Yeah, just put them into a different struct then. I don't want to clutter the PACA struct with kvm fields all over :).
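
Something along these lines, just grouping the fields quoted above (the struct name and where it lives are only a guess):

	struct kvmppc_host_state {
		struct kvm_vcpu *kvm_vcpu;
		u64 dabr;
		u64 host_mmcr[3];
		u32 host_pmc[6];
		u64 host_purr;
		u64 host_spurr;
		u64 host_dscr;
		u64 dec_expires;
	};

	/* and in struct paca_struct, a single member instead of the list above: */
	struct kvmppc_host_state kvm_hstate;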

> 
>>> @@ -65,6 +98,7 @@ config KVM_440
>>> 	bool "KVM support for PowerPC 440 processors"
>>> 	depends on EXPERIMENTAL && 44x
>>> 	select KVM
>>> +	select KVM_MMIO
>> 
>> e500 should also select MMIO, no?
> 
> Good point, I'll fix that.
> 
>>> +long kvmppc_alloc_hpt(struct kvm *kvm)
>>> +{
>>> +	unsigned long hpt;
>>> +	unsigned long lpid;
>>> +
>>> +	hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN,
>>> +			       HPT_ORDER - PAGE_SHIFT);
>> 
>> This would end up failing quite often, no?
> 
> In practice it seems to be OK, possibly because the machines we're
> testing this on have plenty of memory.  Maybe we should get qemu to
> allocate the HPT using hugetlbfs so the memory will come from the
> reserved page pool.  It does need to be physically contiguous and
> aligned on a multiple of its size -- that's a hardware requirement.

Yes, I'd certainly prefer to see qemu allocate it. That'd also make things easier for migration later, as we still have access to the hpt. But then again - we can't really reuse the mappings there anyways, as they'll all be host mappings. Phew. Have you given that some thought yet? We can probably just ignore non-bolted entries - but the bolted ones need to be transferred.

Also, if we have qemu allocate the hpt memory, qemu can modify the mappings and thus break out. Bleks. It should definitely not be able to write to the hpt after it's actually being used by kvm.

> 
>>> +	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
>>> +	kvm->arch.lpid = lpid;
>>> +	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
>>> +	kvm->arch.host_lpid = mfspr(SPRN_LPID);
>>> +	kvm->arch.host_lpcr = mfspr(SPRN_LPCR);
>> 
>> How do these correlate with the guest's hv mmu? I'd like to keep the
>> code clean enough so we can potentially use it for PR mode as well :). 
> 
> The host SDR1 and LPID are different from the guest's.  That is, the
> guest has its own HPT which is quite separate from the host's.  The
> host values could be saved in global variables, though; there's no
> real need for each VM to have its own copy, except that doing it this
> way simplifies the low-level assembly code a little.

Well, my point is that I tried to separate the MMU implementation for the PR KVM stuff. What I'd like to see at the end of the day would be a "guest" hv implementation file that I could plug into PR kvm and have the MMU rolling by using the exact same code as the HV code. Only the backend would be different. Maybe there's some valid technical reason to not do it, but I haven't come across any yet :).

> 
>>> +	/* First see what page size we have */
>>> +	psize = user_page_size(mem->userspace_addr);
>>> +	/* For now, only allow 16MB pages */
>> 
>> The reason to go for 16MB pages is because of the host mmu code, not
>> the guest hv mmu. So please at least #ifdef the code to HV so we
>> document that correlation. 
> 
> I'm not sure what you mean by that.  The reason for going for 16MB
> pages initially is for performance (this way the guest can use 16MB
> pages for its linear mapping) and to avoid having to deal with the
> pages being paged or swapped on the host side.  Could you explain the
> "because of the host mmu code" part of your comment further?

If you consider a split as I described above, an HV guest running on PR KVM doesn't have to be mapped linearly. But then again - it could. In fact, it might even make sense. Hrm. Very good point! We could also just flip SDR1 in PR mode and reuse the exact same code as the HV mode.

However, my point here was that when we don't flip SDR1 but go through a shadow hpt, we don't have to map it to 16MB pages.

> 
> What would adding #ifdef CONFIG_KVM_BOOK3S_64_HV achieve in a file
> whose name ends in _hv.c and which only gets compiled when
> CONFIG_KVM_BOOK3S_64_HV=y?
> 
>>> +int kvmppc_mmu_hv_init(void)
>>> +{
>>> +	if (!cpu_has_feature(CPU_FTR_HVMODE_206))
>>> +		return 0;
>> 
>> Return 0 for "it doesn't work" might not be the right exit code ;).
> 
> Good point, I'll make it -ENXIO or something.

Anything < 0 sounds good to me :). -EINVAL is probably the most sensible one here.
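
So the agreed change is just (sketch):

	int kvmppc_mmu_hv_init(void)
	{
		if (!cpu_has_feature(CPU_FTR_HVMODE_206))
			return -EINVAL;		/* no usable hypervisor mode */
		/* ... rest of the initialisation unchanged ... */
		return 0;
	}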

> 
>>> +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
>>> +				struct kvmppc_pte *gpte, bool data)
>>> +{
>>> +	return -ENOENT;
>> 
>> Can't you just call the normal book3s_64 mmu code here? Without
>> xlate, doing ppc_ld doesn't work, which means that reading out the
>> faulting guest instruction breaks. We'd also need it for PR mode :). 
> 
> With book3s_hv we currently have no situations where we need to read
> out the faulting guest instruction.  
> 
> We could use the normal code here if we had the guest HPT mapped into
> the qemu address space, which it currently isn't -- but should be.  It
> hasn't been a priority to fix because with book3s_hv we currently have
> no situations where we need to read out the faulting guest
> instruction.

Makes sense. We might however need it in case we ever use this code in PR mode :). I don't fully remember if you shoved around that code, but in case fetching the last instruction fails (which IIRC it can for you too), a manual load through this function gets issued.

> 
>>> +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
>>> +{
>>> +	vcpu->arch.pvr = pvr;
>>> +	kvmppc_mmu_book3s_hv_init(vcpu);
>> 
>> No need for you to do it depending on pvr. You can just do the mmu
>> initialization on normal init :).
> 
> OK, I'll do that.
> 
>>> +	case BOOK3S_INTERRUPT_PROGRAM:
>>> +	{
>>> +		ulong flags;
>>> +		/*
>>> +		 * Normally program interrupts are delivered directly
>>> +		 * to the guest by the hardware, but we can get here
>>> +		 * as a result of a hypervisor emulation interrupt
>>> +		 * (e40) getting turned into a 700 by BML RTAS.
>> 
>> Not sure I fully understand what's going on there. Mind to explain? :)
> 
> Recent versions of the architecture don't actually deliver a 0x700
> interrupt to the OS on an illegal instruction; instead they generate
> an 0xe40 interrupt to the hypervisor in case the hypervisor wants to
> emulate the instruction.  If the hypervisor doesn't want to do that
> it's supposed to synthesize a 0x700 interrupt to the OS.
> 
> When we're running this stuff under our BML (Bare Metal Linux)
> framework in the lab, we use a small RTAS implementation that gets
> loaded by the BML loader, and one of the things that this RTAS does is
> to patch the 0xe40 vector to make the 0xe40 interrupt come in via the
> 0x700 vector, in case the kernel you're running under BML hasn't been
> updated to have an 0xe40 handler.
> 
> I could just remove that case, in fact.

Yeah, the main issue I see here is that there's no hardware that could run this code. Whatever the first hardware that will be able to run it requires should be used here.

> 
>>> +	case BOOK3S_INTERRUPT_SYSCALL:
>>> +	{
>>> +		/* hcall - punt to userspace */
>>> +		int i;
>>> +
>> 
>> Do we really want to accept sc from pr=1? I'd just reject them straight away.
> 
> Good idea, I'll do that.
> 
>>> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
>>> +		vcpu->arch.dsisr = vcpu->arch.fault_dsisr;
>>> +		vcpu->arch.dear = vcpu->arch.fault_dar;
>>> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0);
>>> +		r = RESUME_GUEST;
>>> +		break;
>>> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
>>> +		kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE,
>>> +					0x08000000);
>> 
>> What do these do? I thought the guest gets DSI and ISI directly?
> 
> It does for accesses with relocation (IR/DR) on, but because we have
> enabled VRMA mode (Virtualized Real Mode Area) in the LPCR, we get
> these interrupts to the hypervisor if the guest does a bad real-mode
> access.  If we also turned on Virtualized Partition Memory (VPM) mode
> in the LPCR, then all ISI/DSI in the guest come through to the
> hypervisor using these vectors in case the hypervisor wants to do any
> paging/swapping of guest memory.  I plan to do that later when we
> support using 4k/64k pages for guest memory.

I see - please put that explanation in a comment there ;).

> 
>>> +	/* default to book3s_64 (power7) */
>>> +	vcpu->arch.pvr = 0x3f0200;
>> 
>> Wouldn't it make sense to simply default to the host pvr? Not sure -
>> maybe it's not :).
> 
> Sounds sensible, actually.  In fact the hypervisor can't spoof the PVR
> for the guest, that is, the guest can read the real PVR value and
> there's no way the hypervisor can stop it.

If it can't spoof it at all, then yes, use the host pvr.

In fact, thinking about this, how does userspace know which mode the kernel uses? It should be able to fail if we're trying to run a -M mac99 on this code ;).

> 
>>> +	flush_fp_to_thread(current);
>> 
>> Do those work fine with preemption enabled?
> 
> Yes.  If a preemption happens, it will flush the FP/VMX/VSX registers
> out to the thread_struct, and then any of these explicit calls that
> happen after the preemption will do nothing.

Ah, the flushes themselves disable preemption during the flush :). Then it makes sense.

> 
>>> +	/*
>>> +	 * Put whatever is in the decrementer into the
>>> +	 * hypervisor decrementer.
>>> +	 */
>>> +	mfspr	r8,SPRN_DEC
>>> +	mftb	r7
>>> +	mtspr	SPRN_HDEC,r8
>> 
>> Can't we just always use HDEC on the host? That'd save us from all
>> the swapping here.
> 
> The problem is that there is only one HDEC per core, so using it would
> become complicated when the host is running in SMT4 or SMT2 mode.

I see.

> 
>>> +	extsw	r8,r8
>>> +	add	r8,r8,r7
>>> +	std	r8,PACA_KVM_DECEXP(r13)
>> 
>> Why is dec+hdec on vcpu_run decexp? What exactly does this store?
> 
> R7 here is the current (or very recent, anyway) timebase value, so
> this stores the timebase value at which the host decrementer would get
> to zero.
> 
>>> +	lwz	r3, PACA_HOST_PMC(r13)
>>> +	lwz	r4, PACA_HOST_PMC + 4(r13)
>>> +	lwz	r5, PACA_HOST_PMC + 4(r13)
>> 
>> copy&paste error?
> 
> Yes, thanks.
> 
>>> +.global kvmppc_handler_lowmem_trampoline
>>> +kvmppc_handler_lowmem_trampoline:
>>> +	cmpwi	r12,0x500
>>> +	beq	1f
>>> +	cmpwi	r12,0x980
>>> +	beq	1f
>> 
>> What?
>> 
>> 1) use the macros please
> 
> OK
> 
>> 2) why?
> 
> The external interrupt and hypervisor decrementer interrupt handlers
> expect the interrupted PC and MSR to be in HSRR0/1 rather than
> SRR0/1.  I could remove the case for 0x980 since we don't call the
> linux handler for HDEC interrupts any more.

Ah, makes sense. This also deserves a comment :)

> 
>>> +	/* Check if HDEC expires soon */
>>> +	mfspr	r3,SPRN_HDEC
>>> +	cmpwi	r3,10
>>> +	li	r12,0x980
>> 
>> define
> 
> OK.
> 
>>> +	mr	r9,r4
>>> +	blt	hdec_soon
>> 
>> Is it faster to do the check than to save the check and take the
>> odds? Also, maybe we should rather do the check in preemptible
>> code that could just directly pass the time slice on.
> 
> I do the check there because I was having problems where, if the HDEC
> goes negative before we do the partition switch, we would occasionally
> not get the HDEC interrupt at all until the next time HDEC went
> negative, ~ 8.4 seconds later.

Yikes - so HDEC is edge and doesn't even keep the interrupt line up? That sounds like a serious hardware limitation. What if you only use HDEC and it triggers while interrupts are off in a critical section? Is the hardware really that broken?

> 
>>> +	/* See if this is a leftover HDEC interrupt */
>>> +	cmpwi	r12,0x980
>> 
>> define
> 
> OK.
> 
>>> +	bne	2f
>>> +	mfspr	r3,SPRN_HDEC
>>> +	cmpwi	r3,0
>>> +	bge	ignore_hdec
>>> +2:
>>> +
>>> +	/* Check for mediated interrupts (could be done earlier really ...) */
>>> +	cmpwi	r12,0x500
>> 
>> define
> 
> OK.
> 
>>> +	bne+	1f
>>> +	ld	r5,VCPU_LPCR(r9)
>>> +	andi.	r0,r11,MSR_EE
>>> +	beq	1f
>>> +	andi.	r0,r5,LPCR_MER
>>> +	bne	bounce_ext_interrupt
>> 
>> So there's no need for the external check that directly goes into
>> the Linux handler code on full-fledged exits?
> 
> No, we still need that; ordinary external interrupts come to the
> hypervisor regardless of the guest's MSR.EE setting.
> 
> The MER bit (Mediated External Request) is a way for the hypervisor to
> know when the guest sets its MSR.EE bit.  If an event happens that
> means the host wants to give a guest vcpu an external interrupt, but
> the guest vcpu has MSR.EE = 0, then the host can't deliver the
> simulated external interrupt.  Instead it sets LPCR.MER, which has the
> effect that the hardware will deliver an external interrupt (to the
> hypervisor) when running in the guest and it has MSR.EE = 1.
> 
> So, when we get an external interrupt, LPCR.MER = 1 and MSR.EE = 1,
> we need to synthesize an external interrupt in the guest.  Doing it
> here means that we don't need to do the full partition switch out to
> the host and back again.

Ah, special functionality then. Please comment this in the code, so the unknowledgeable reader (me) knows what this is about.

> 
>>> +	/* Read the guest SLB and save it away */
>>> +	li	r6,0
>>> +	addi	r7,r9,VCPU_SLB
>>> +	li	r5,0
>>> +1:	slbmfee	r8,r6
>>> +	andis.	r0,r8,SLB_ESID_V@h
>>> +	beq	2f
>>> +	add	r8,r8,r6		/* put index in */
>>> +	slbmfev	r3,r6
>>> +	std	r8,VCPU_SLB_E(r7)
>>> +	std	r3,VCPU_SLB_V(r7)
>>> +	addi	r7,r7,VCPU_SLB_SIZE
>>> +	addi	r5,r5,1
>>> +2:	addi	r6,r6,1
>>> +	cmpwi	r6,32
>> 
>> I don't like how the 32 is hardcoded here. Better create a define
>> for it and use the same in the init code.
> 
> Sure.  In fact I probably should use vcpu->arch.slb_nr here.

and bctr? yup :)

> 
>>> +hdec_soon:
>>> +	/* Switch back to host partition */
>>> +	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
>>> +	ld	r6,KVM_HOST_SDR1(r4)
>>> +	lwz	r7,KVM_HOST_LPID(r4)
>>> +	li	r8,0x3ff		/* switch to reserved LPID */
>> 
>> is it reserved by ISA? Either way, hard-coding the constant without
>> a name is not nice :).
> 
> Actually, it just has to be an LPID value that isn't ever used for
> running a real guest (or the host).  I'll make a name for it.
> 
>>> +	lis	r8,0x7fff
>> 
>> INT_MAX@h might be more readable.
> 
> OK.
> 
>>> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
>>> {
>>> -	return !(v->arch.shared->msr & MSR_WE) ||
>>> +#ifndef CONFIG_KVM_BOOK3S_64_HV
>>> +	return !(kvmppc_get_msr(v) & MSR_WE) ||
>>> 	       !!(v->arch.pending_exceptions);
>>> +#else
>>> +	return 1;
>> 
>> So what happens if the guest sets MSR_WE? It just stays in guest
>> context idling? That'd be pretty bad for scheduling on the host.
> 
> The MSR_WE bit doesn't exist on POWER7 (or any of POWER[4567], in
> fact).  If the guest wants to idle it calls the H_CEDE hcall.

Ah, interesting. Didn't know :).

> 
>>> @@ -184,12 +188,14 @@ int kvm_dev_ioctl_check_extension(long ext)
>>> 	case KVM_CAP_PPC_IRQ_LEVEL:
>>> 	case KVM_CAP_ENABLE_CAP:
>>> 	case KVM_CAP_PPC_OSI:
>>> +#ifndef CONFIG_KVM_BOOK3S_64_HV
>> 
>> You also don't do OSI on HV :).
> 
> Good point, I'll fix that.
> 
>>> @@ -496,8 +510,11 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
>>> 	if (waitqueue_active(&vcpu->wq)) {
>>> 		wake_up_interruptible(&vcpu->wq);
>>> 		vcpu->stat.halt_wakeup++;
>>> +#ifdef CONFIG_KVM_BOOK3S_64_HV
>>> +	} else if (vcpu->cpu != -1) {
>>> +		smp_send_reschedule(vcpu->cpu);
>> 
>> Shouldn't this be done for non-HV too? The only reason we don't do
>> it yet is because we don't do SMP, no?
> 
> I didn't know why you didn't do it for non-HV, so I didn't change it
> for that case.  If you say it's OK, I'll change it (we'll need to set
> vcpu->cpu in the vcpu_load/unload code for book3s_pr too, then).

Sure, go ahead and set it.


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel
  2011-05-17  7:54     ` Alexander Graf
  (?)
@ 2011-05-17 10:28     ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-17 10:28 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm

On Tue, May 17, 2011 at 09:54:35AM +0200, Alexander Graf wrote:

> Not sure I like the name - when is it used? :)

When the real-mode hcall handler decides it can't handle the hcall and
wants to pass it up.

> Also, if it's not in the PAPR, the guest should never receive it, right?

Right.  It's purely an internal value.
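
In other words, the real-mode handlers use it roughly like this (the handler name and the condition are made up for illustration):

	long kvmppc_rm_hcall(struct kvm_vcpu *vcpu, unsigned long req)
	{
		/* H_TOO_HARD never goes back to the guest; it just tells the
		 * exit path to fall back to the virtual-mode/userspace
		 * hcall handling. */
		if (!can_handle_in_real_mode(vcpu, req))
			return H_TOO_HARD;

		/* ... handle the easy cases entirely in real mode ... */
		return H_SUCCESS;
	}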

> > +	/* See if this is something we can handle in real mode */
> > +	cmpwi	r12,0xc00
> 
> use the define please

OK

> > +	beq	hcall_real_mode
> 
> This is simply a hcall helper, as the name suggests. So the comment
> is slightly misleading - it should rather read like "Try to handle
> hypercalls in real mode".

Hmmm, OK, that's what it was trying to indicate.  I could add "try" to
the name if you like.

> > +	cmpdi	r3,H_TOO_HARD
> > +	beq	hcall_real_fallback
> 
> Ah, very good. Please mark the constant as "for internal use only"
> then, as that's certainly fine :).

Cool, will do.

> > +	li	r12,0xc00
> 
> use the define please :)

OK.

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-17  8:21     ` Alexander Graf
  (?)
@ 2011-05-17 10:44     ` Paul Mackerras
  2011-05-17 11:36         ` Alexander Graf
  -1 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-17 10:44 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm

On Tue, May 17, 2011 at 10:21:56AM +0200, Alexander Graf wrote:
> 
> On 11.05.2011, at 12:46, Paul Mackerras wrote:
> 
> > -#define KVM_MAX_VCPUS 1
> > +#define KVM_MAX_VCPUS		NR_CPUS
> > +#define KVM_THREADS_PER_CORE	4
> 
> So what if POWER8 (or whatever it will be called) comes along with 8
> threads per core? Would that change the userspace interface?

The idea is that userspace queries the KVM_CAP_PPC_SMT capability and
the value it gets back is the number of vcpus per vcore.  It then
allocates vcpu numbers based on that.

If a CPU came along with more than 4 threads per core then we'd have
to change that define in the kernel, but that won't affect the
userspace API.
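
On the userspace side that boils down to something like the following (raw-ioctl sketch; the capability is the one added by this series):

	/* ask how many vcpus make up one virtual core, then lay out vcpu
	 * ids so the threads of a vcore are contiguous */
	int smt = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
	if (smt <= 0)
		smt = 1;	/* capability absent: one thread per core */
	int vcpu_id = core_idx * smt + thread_idx;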

> > +	/* wait for secondary threads to get back to nap mode */
> > +	spin_lock(&vc->lock);
> > +	if (vc->nap_count < vc->n_woken)
> > +		kvmppc_wait_for_nap(vc);
> 
> So you're taking the vcore wide lock and wait for other CPUs to set
> themselves to nap? Not sure I fully understand this. Why would
> another thread want to go to nap mode when it's 100% busy?

It's more about waiting for the other hardware threads to have
finished writing their vcpu state to memory.  Currently those threads
then go to nap mode, but they could in fact poll instead for a bit,
so that name is possibly a bit misleading, I agree.

> > +	cmpwi	r12,0x980
> > +	beq	40f
> > +	cmpwi	r3,0x100
> 
> good old use define comment :)

Yep, OK. :)

> Maybe I also missed the point here, but how does this correlate with
> Linux threads? Is each vcpu running in its own Linux thread? How
> does the scheduling happen? IIUC the host only sees a single thread
> per core and then distributes the vcpus to the respective host
> threads.

Each vcpu has its own Linux thread, but while the vcore is running,
all but one of them are sleeping.  The thing is that since the host is
running with each core single-threaded, one Linux thread is enough to
run 4 vcpus.  So when we decide we can run the vcore, the vcpu thread
that discovered that we can now run the vcore takes the responsibility
to run it.  That involves sending an IPI to the other hardware threads
to wake them up and get them to each run a vcpu.  Then the vcpu thread
that is running the vcore dives into the guest switch code itself.  It
synchronizes with the other threads and does the partition switch, and
then they all enter the guest.

We thought about various schemes to cope with the hardware restriction
that all hardware threads in a core have to be in the same partition
(at least whenever the MMU is on).  This is the least messy scheme we
could come up with.  I'd be happy to discuss the alternatives if you
like.

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-17  9:46   ` Alexander Graf
  (?)
@ 2011-05-17 11:15   ` Paul Mackerras
  2011-05-17 11:38       ` Alexander Graf
  -1 siblings, 1 reply; 103+ messages in thread
From: Paul Mackerras @ 2011-05-17 11:15 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm

On Tue, May 17, 2011 at 11:46:34AM +0200, Alexander Graf wrote:

> Very nice patches indeed :). Is there any way I can test them? I
> don't like pulling code that I couldn't run anywhere yet.

I can understand that, but unfortunately there are no machines
available outside of IBM at this stage that can run this stuff.

Do you think that the earlier patches in the series could go in for
2.6.40?  Patches 1 to 8 don't require hypervisor mode and can be
tested on a G5 or whatever.

What would be the path for these patches to get upstream?  Would this
stuff normally go through Avi's tree?  There is a bit of a
complication in that they are based on Ben's next branch.  Would Avi
pull Ben's next branch, or would they go in via Ben's tree?

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-17 10:44     ` Paul Mackerras
@ 2011-05-17 11:36         ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17 11:36 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm



Am 17.05.2011 um 12:44 schrieb Paul Mackerras <paulus@samba.org>:

> On Tue, May 17, 2011 at 10:21:56AM +0200, Alexander Graf wrote:
>> 
>> On 11.05.2011, at 12:46, Paul Mackerras wrote:
>> 
>>> -#define KVM_MAX_VCPUS 1
>>> +#define KVM_MAX_VCPUS        NR_CPUS
>>> +#define KVM_THREADS_PER_CORE    4
>> 
>> So what if POWER8 (or whatever it will be called) comes along with 8
>> threads per core? Would that change the userspace interface?
> 
> The idea is that userspace queries the KVM_CAP_PPC_SMT capability and
> the value it gets back is the number of vcpus per vcore.  It then
> allocates vcpu numbers based on that.
> 
> If a CPU came along with more than 4 threads per core then we'd have
> to change that define in the kernel, but that won't affect the
> userspace API.

Ah, I see :). That's exactly why documentation is so important - with proper documentation you wouldn't need to explain those things to me :)

> 
>>> +    /* wait for secondary threads to get back to nap mode */
>>> +    spin_lock(&vc->lock);
>>> +    if (vc->nap_count < vc->n_woken)
>>> +        kvmppc_wait_for_nap(vc);
>> 
>> So you're taking the vcore wide lock and wait for other CPUs to set
>> themselves to nap? Not sure I fully understand this. Why would
>> another thread want to go to nap mode when it's 100% busy?
> 
> It's more about waiting for the other hardware threads to have
> finished writing their vcpu state to memory.  Currently those threads
> then go to nap mode, but they could in fact poll instead for a bit,
> so that name is possibly a bit misleading, I agree.

Just so I understand the scheme: one vcpu needs to go to MMU mode in KVM; it then sends IPIs to stop the other threads, and finally we return from this wait here?

> 
>>> +    cmpwi    r12,0x980
>>> +    beq    40f
>>> +    cmpwi    r3,0x100
>> 
>> good old use define comment :)
> 
> Yep, OK. :)
> 
>> Maybe I also missed the point here, but how does this correlate with
>> Linux threads? Is each vcpu running in its own Linux thread? How
>> does the scheduling happen? IIUC the host only sees a single thread
>> per core and then distributes the vcpus to the respective host
>> threads.
> 
> Each vcpu has its own Linux thread, but while the vcore is running,
> all but one of them are sleeping.  The thing is that since the host is
> running with each core single-threaded, one Linux thread is enough to
> run 4 vcpus.  So when we decide we can run the vcore, the vcpu thread
> that discovered that we can now run the vcore takes the responsibility
> to run it.  That involves sending an IPI to the other hardware threads
> to wake them up and get them to each run a vcpu.  Then the vcpu thread
> that is running the vcore dives into the guest switch code itself.  It
> synchronizes with the other threads and does the partition switch, and
> then they all enter the guest.
> 
> We thought about various schemes to cope with the hardware restriction
> that all hardware threads in a core have to be in the same partition
> (at least whenever the MMU is on).  This is the least messy scheme we
> could come up with.  I'd be happy to discuss the alternatives if you
> like.

Oh, I'm certainly fine with the scheme :). I would just like to understand it and see it documented somewhere, as it's slightly unintuitive.

Also, this scheme might confuse the host scheduler for a bit, as it might migrate threads to other host CPUs while it would prove beneficial for cache usage to keep them local. But since the scheduler doesn't know about the correlation between the threads, it can't be clever about it.

Alex

> 

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-17 11:15   ` Paul Mackerras
@ 2011-05-17 11:38       ` Alexander Graf
  0 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-17 11:38 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm



Am 17.05.2011 um 13:15 schrieb Paul Mackerras <paulus@samba.org>:

> On Tue, May 17, 2011 at 11:46:34AM +0200, Alexander Graf wrote:
> 
>> Very nice patches indeed :). Is there any way I can test them? I
>> don't like pulling code that I couldn't run anywhere yet.
> 
> I can understand that, but unfortunately there are no machines
> available outside of IBM at this stage that can run this stuff.
> 
> Do you think that the earlier patches in the series could go in for
> 2.6.40?  Patches 1 to 8 don't require hypervisor mode and can be
> tested on a G5 or whatever.

Oh, I can and will certainly test whether the code breaks PR mode, but that doesn't mean it will do the right thing on HV machines :o.

> 
> What would be the path for these patches to get upstream?  Would this
> stuff normally go through Avi's tree?  There is a bit of a
> complication in that they are based on Ben's next branch.  Would Avi
> pull Ben's next branch, or would they go in via Ben's tree?

Usually the ppc tree gets merged into Avi's tree and goes on from there. When we have interdependencies, we can certainly do it differently though. We can also shove them through Ben's tree this time around, as there are more dependencies on ppc code than KVM code.

> 
> Thanks,
> Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-17 11:38       ` Alexander Graf
@ 2011-05-17 11:42         ` Avi Kivity
  -1 siblings, 0 replies; 103+ messages in thread
From: Avi Kivity @ 2011-05-17 11:42 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Paul Mackerras, linuxppc-dev, kvm

On 05/17/2011 02:38 PM, Alexander Graf wrote:
> >
> >  What would be the path for these patches to get upstream?  Would this
> >  stuff normally go through Avi's tree?  There is a bit of a
> >  complication in that they are based on Ben's next branch.  Would Avi
> >  pull Ben's next branch, or would they go in via Ben's tree?
>
> Usually the ppc tree gets merged into Avi's tree and goes on from there. When we have interdependencies, we can certainly do it differently though. We can also shove them through Ben's tree this time around, as there are more dependencies on ppc code than KVM code.
>

Yes, both options are fine.  If it goes through kvm.git I can merge 
Ben's tree (provided it is append-only) and apply the kvm-ppc patches on 
top.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code
  2011-05-11 10:43 ` [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code Paul Mackerras
@ 2011-05-17 18:05     ` Marcelo Tosatti
  0 siblings, 0 replies; 103+ messages in thread
From: Marcelo Tosatti @ 2011-05-17 18:05 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm, Alexander Graf

On Wed, May 11, 2011 at 08:43:31PM +1000, Paul Mackerras wrote:
> From 964ee93b2d728e4fb16ae66eaceb6e912bf114ad Mon Sep 17 00:00:00 2001
> From: Paul Mackerras <paulus@samba.org>
> Date: Tue, 10 May 2011 22:23:18 +1000
> Subject: [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into
>  subarch-specific code
> 
> Instead of doing the kvm_guest_enter/exit() and local_irq_dis/enable()
> calls in powerpc.c, this moves them down into the subarch-specific
> book3s_pr.c and booke.c.  This eliminates an extra local_irq_enable()
> call in book3s_pr.c, and will be needed for when we do SMT4 guest
> support in the book3s hypervisor mode code.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
>  arch/powerpc/include/asm/kvm_ppc.h   |    1 +
>  arch/powerpc/kvm/book3s_interrupts.S |    2 +-
>  arch/powerpc/kvm/book3s_pr.c         |   12 ++++++------
>  arch/powerpc/kvm/booke.c             |   13 +++++++++++++
>  arch/powerpc/kvm/powerpc.c           |    6 +-----
>  5 files changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index f3c218a..3210911 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -42,6 +42,7 @@ enum emulation_result {
>  	EMULATE_AGAIN,        /* something went wrong. go again */
>  };
>  
> +extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
>  extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
>  extern char kvmppc_handlers_start[];
>  extern unsigned long kvmppc_handler_len;
> diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
> index 2f0bc92..8c5e0e1 100644
> --- a/arch/powerpc/kvm/book3s_interrupts.S
> +++ b/arch/powerpc/kvm/book3s_interrupts.S
> @@ -85,7 +85,7 @@
>   *  r3: kvm_run pointer
>   *  r4: vcpu pointer
>   */
> -_GLOBAL(__kvmppc_vcpu_entry)
> +_GLOBAL(__kvmppc_vcpu_run)
>  
>  kvm_start_entry:
>  	/* Write correct stack frame */
> diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
> index 08cedf0..f769915 100644
> --- a/arch/powerpc/kvm/book3s_pr.c
> +++ b/arch/powerpc/kvm/book3s_pr.c
> @@ -891,8 +891,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>  	vfree(vcpu_book3s);
>  }
>  
> -extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
> -int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  {
>  	int ret;
>  	double fpr[32][TS_FPRWIDTH];
> @@ -944,14 +943,15 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  	/* Remember the MSR with disabled extensions */
>  	ext_msr = current->thread.regs->msr;
>  
> -	/* XXX we get called with irq disabled - change that! */
> -	local_irq_enable();
> -
>  	/* Preload FPU if it's enabled */
>  	if (vcpu->arch.shared->msr & MSR_FP)
>  		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
>  
> -	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
> +	kvm_guest_enter();


kvm_guest_enter should run with interrupts disabled.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code
  2011-05-17 18:05     ` Marcelo Tosatti
@ 2011-05-17 18:10       ` Marcelo Tosatti
  -1 siblings, 0 replies; 103+ messages in thread
From: Marcelo Tosatti @ 2011-05-17 18:10 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm, Alexander Graf

On Tue, May 17, 2011 at 03:05:12PM -0300, Marcelo Tosatti wrote:
> >  
> > -	ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
> > +	kvm_guest_enter();
> 
> 
> kvm_guest_enter should run with interrupts disabled.

It's fine, please ignore the message.


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-17 11:42         ` Avi Kivity
@ 2011-05-19  5:22           ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-19  5:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Alexander Graf, linuxppc-dev, kvm

On Tue, May 17, 2011 at 02:42:08PM +0300, Avi Kivity wrote:
> On 05/17/2011 02:38 PM, Alexander Graf wrote:
> >>
> >>  What would be the path for these patches to get upstream?  Would this
> >>  stuff normally go through Avi's tree?  There is a bit of a
> >>  complication in that they are based on Ben's next branch.  Would Avi
> >>  pull Ben's next branch, or would they go in via Ben's tree?
> >
> >Usually the ppc tree gets merged into Avi's tree and goes on from
> >there. When we have interdependencies, we can certainly do it
> >differently though. We can also shove them through Ben's tree this
> >time around, as there are more dependencies on ppc code than KVM
> >code.
> >
> 
> Yes, both options are fine.  If it goes through kvm.git I can merge
> Ben's tree (provided it is append-only) and apply the kvm-ppc
> patches on top.

OK, the easiest thing is for them to go via Ben's tree, I think, since
they depend so much on other stuff in Ben's tree.

Alex, could you give Ben an acked-by for patches 1-8 of the series?
There haven't been any changes requested for them.

Thanks,
Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-19  5:22           ` Paul Mackerras
  (?)
@ 2011-05-19  6:01             ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-19  6:01 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Avi Kivity, Linuxppc-dev, KVM list, kvm-ppc


On 19.05.2011, at 07:22, Paul Mackerras wrote:

> On Tue, May 17, 2011 at 02:42:08PM +0300, Avi Kivity wrote:
>> On 05/17/2011 02:38 PM, Alexander Graf wrote:
>>>> 
>>>> What would be the path for these patches to get upstream?  Would this
>>>> stuff normally go through Avi's tree?  There is a bit of a
>>>> complication in that they are based on Ben's next branch.  Would Avi
>>>> pull Ben's next branch, or would they go in via Ben's tree?
>>> 
>>> Usually the ppc tree gets merged into Avi's tree and goes on from
>>> there. When we have interdependencies, we can certainly do it
>>> differently though. We can also shove them through Ben's tree this
>>> time around, as there are more dependencies on ppc code than KVM
>>> code.
>>> 
>> 
>> Yes, both options are fine.  If it goes through kvm.git I can merge
>> Ben's tree (provided it is append-only) and apply the kvm-ppc
>> patches on top.
> 
> OK, the easiest thing is for them to go via Ben's tree, I think, since
> they depend so much on other stuff in Ben's tree.
> 
> Alex, could you give Ben an acked-by for patches 1-8 of the series?
> There haven't been any changes requested for them.

Let me give them a spin on a G5, so I can at least verify nothing breaks ;). I'll hopefully get to this before next week.

Alex

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes
  2011-05-17 11:36         ` Alexander Graf
  (?)
@ 2011-05-19  6:06         ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-19  6:06 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm

On Tue, May 17, 2011 at 01:36:26PM +0200, Alexander Graf wrote:
>
> Just so I understand the scheme: One vcpu needs to go to MMU mode in
> KVM, it then sends IPIs to stop the other threads and finally we
> return from this wait here?

Actually, if one thread needs to get the other threads out of the
guest, it sets the HDEC to 0.  Since it's a shared register and
interrupts all threads on a 0 to -1 transition, setting it to 0 makes
all threads come out of the guest.

The IPI is for when we're going into the guest.  When we're in the
host, all the secondary threads are in nap and only the primary thread
is running.  (Offlining a cpu in the host results in the cpu/thread
going to nap mode.)  Sending an IPI to a napping thread wakes it up
and it resumes at the system reset vector with some bits set in SRR1
to say that it was previously in nap mode.

> Oh, I'm certainly fine with the scheme :). I would just like to
> understand it and see it documented somewhere, as it's slightly
> unintuitive.

It took some thought to work it out, so you're right, I should
definitely document it.

> Also, this scheme might confuse the host scheduler for a bit, as it
> might migrate threads to other host CPUs while it would prove
> beneficial for cache usage to keep them local. But since the
> scheduler doesn't know about the correlation between the threads, it
> can't be clever about it.

Well, it's not going to migrate a sleeping thread.  The accounting
gets slightly strange in that all the CPU time for running the 4 vcpus
in the vcore gets accounted to one of the vcpu threads (which one can
change over time).  However, the total across all qemu threads should
be correct.

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-19  5:22           ` Paul Mackerras
@ 2011-05-21 16:41             ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-21 16:41 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Avi Kivity, linuxppc-dev, kvm


On 19.05.2011, at 07:22, Paul Mackerras wrote:

> On Tue, May 17, 2011 at 02:42:08PM +0300, Avi Kivity wrote:
>> On 05/17/2011 02:38 PM, Alexander Graf wrote:
>>>> 
>>>> What would be the path for these patches to get upstream?  Would this
>>>> stuff normally go through Avi's tree?  There is a bit of a
>>>> complication in that they are based on Ben's next branch.  Would Avi
>>>> pull Ben's next branch, or would they go in via Ben's tree?
>>> 
>>> Usually the ppc tree gets merged into Avi's tree and goes on from
>>> there. When we have interdependencies, we can certainly do it
>>> differently though. We can also shove them through Ben's tree this
>>> time around, as there are more dependencies on ppc code than KVM
>>> code.
>>> 
>> 
>> Yes, both options are fine.  If it goes through kvm.git I can merge
>> Ben's tree (provided it is append-only) and apply the kvm-ppc
>> patches on top.
> 
> OK, the easiest thing is for them to go via Ben's tree, I think, since
> they depend so much on other stuff in Ben's tree.
> 
> Alex, could you give Ben an acked-by for patches 1-8 of the series?
> There haven't been any changes requested for them.

With ben's tree merged into avi's tree and your patches applied on top, MOL breaks. I'll have to track down why exactly though.


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-21 16:41             ` Alexander Graf
@ 2011-05-21 17:00               ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-21 17:00 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Paul Mackerras, Avi Kivity, linuxppc-dev, kvm


On 21.05.2011, at 18:41, Alexander Graf wrote:

> 
> On 19.05.2011, at 07:22, Paul Mackerras wrote:
> 
>> On Tue, May 17, 2011 at 02:42:08PM +0300, Avi Kivity wrote:
>>> On 05/17/2011 02:38 PM, Alexander Graf wrote:
>>>>> 
>>>>> What would be the path for these patches to get upstream?  Would this
>>>>> stuff normally go through Avi's tree?  There is a bit of a
>>>>> complication in that they are based on Ben's next branch.  Would Avi
>>>>> pull Ben's next branch, or would they go in via Ben's tree?
>>>> 
>>>> Usually the ppc tree gets merged into Avi's tree and goes on from
>>>> there. When we have interdependencies, we can certainly do it
>>>> differently though. We can also shove them through Ben's tree this
>>>> time around, as there are more dependencies on ppc code than KVM
>>>> code.
>>>> 
>>> 
>>> Yes, both options are fine.  If it goes through kvm.git I can merge
>>> Ben's tree (provided it is append-only) and apply the kvm-ppc
>>> patches on top.
>> 
>> OK, the easiest thing is for them to go via Ben's tree, I think, since
>> they depend so much on other stuff in Ben's tree.
>> 
>> Alex, could you give Ben an acked-by for patches 1-8 of the series?
>> There haven't been any changes requested for them.
> 
> With ben's tree merged into avi's tree and your patches applied on top, MOL breaks. I'll have to track down why exactly though.

In fact, qemu-system-ppc64 -enable-kvm also breaks.


Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 0/13] Hypervisor-mode KVM on POWER7
  2011-05-21 17:00               ` Alexander Graf
@ 2011-05-21 18:15                 ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-21 18:15 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, Paul Mackerras, Avi Kivity, kvm


On 21.05.2011, at 19:00, Alexander Graf wrote:

> 
> On 21.05.2011, at 18:41, Alexander Graf wrote:
> 
>> 
>> On 19.05.2011, at 07:22, Paul Mackerras wrote:
>> 
>>> On Tue, May 17, 2011 at 02:42:08PM +0300, Avi Kivity wrote:
>>>> On 05/17/2011 02:38 PM, Alexander Graf wrote:
>>>>>> 
>>>>>> What would be the path for these patches to get upstream?  Would this
>>>>>> stuff normally go through Avi's tree?  There is a bit of a
>>>>>> complication in that they are based on Ben's next branch.  Would Avi
>>>>>> pull Ben's next branch, or would they go in via Ben's tree?
>>>>> 
>>>>> Usually the ppc tree gets merged into Avi's tree and goes on from
>>>>> there. When we have interdependencies, we can certainly do it
>>>>> differently though. We can also shove them through Ben's tree this
>>>>> time around, as there are more dependencies on ppc code than KVM
>>>>> code.
>>>>> 
>>>> 
>>>> Yes, both options are fine.  If it goes through kvm.git I can merge
>>>> Ben's tree (provided it is append-only) and apply the kvm-ppc
>>>> patches on top.
>>> 
>>> OK, the easiest thing is for them to go via Ben's tree, I think, since
>>> they depend so much on other stuff in Ben's tree.
>>> 
>>> Alex, could you give Ben an acked-by for patches 1-8 of the series?
>>> There haven't been any changes requested for them.
>> 
>> With ben's tree merged into avi's tree and your patches applied on top, MOL breaks. I'll have to track down why exactly though.
> 
> In fact, qemu-system-ppc64 -enable-kvm also breaks.

Only ben's tree merged into avi's tree works fine, so it's definitely one of your patches from 1-8 (except for 2 and 3, which are already in Ben's tree).


Alex

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-17 10:17         ` Alexander Graf
  (?)
@ 2011-05-27 10:33           ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-05-27 10:33 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Linuxppc-dev, KVM list, kvm-ppc

On Tue, May 17, 2011 at 12:17:50PM +0200, Alexander Graf wrote:
> 
> On 16.05.2011, at 07:58, Paul Mackerras wrote:
> 
> > I do the check there because I was having problems where, if the HDEC
> > goes negative before we do the partition switch, we would occasionally
> > not get the HDEC interrupt at all until the next time HDEC went
> > negative, ~ 8.4 seconds later.
> 
> Yikes - so HDEC is edge and doesn't even keep the interrupt line up?
> That sounds like a serious hardware limitation. What if you only use
> HDEC and it triggers while interrupts are off in a critical section?
> Is the hardware really that broken?

If HDEC expires when interrupts are off, the HDEC interrupt stays
pending until interrupts get re-enabled.  I'm not sure exactly what
the conditions are that cause an HDEC interrupt to get lost, but they
seem to involve at least a partition switch.

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-27 10:33           ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
  (?)
@ 2011-05-27 10:43             ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-27 10:43 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc


On 27.05.2011, at 12:33, Paul Mackerras wrote:

> On Tue, May 17, 2011 at 12:17:50PM +0200, Alexander Graf wrote:
>> 
>> On 16.05.2011, at 07:58, Paul Mackerras wrote:
>> 
>>> I do the check there because I was having problems where, if the HDEC
>>> goes negative before we do the partition switch, we would occasionally
>>> not get the HDEC interrupt at all until the next time HDEC went
>>> negative, ~ 8.4 seconds later.
>> 
>> Yikes - so HDEC is edge and doesn't even keep the interrupt line up?
>> That sounds like a serious hardware limitation. What if you only use
>> HDEC and it triggers while interrupts are off in a critical section?
>> Is the hardware really that broken?
> 
> If HDEC expires when interrupts are off, the HDEC interrupt stays
> pending until interrupts get re-enabled.  I'm not sure exactly what
> the conditions are that cause an HDEC interrupt to get lost, but they
> seem to involve at least a partition switch.

Please try to contact some of your hardware designers and figure out what exactly the conditions are. Maybe we don't need this hack.

Alex


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-27 10:33           ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
  (?)
@ 2011-05-27 20:59             ` Segher Boessenkool
  -1 siblings, 0 replies; 103+ messages in thread
From: Segher Boessenkool @ 2011-05-27 20:59 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: Linuxppc-dev, KVM list, kvm-ppc, Alexander Graf

>>> I do the check there because I was having problems where, if the HDEC
>>> goes negative before we do the partition switch, we would 
>>> occasionally
>>> not get the HDEC interrupt at all until the next time HDEC went
>>> negative, ~ 8.4 seconds later.
>>
>> Yikes - so HDEC is edge and doesn't even keep the interrupt line up?
>> That sounds like a serious hardware limitation. What if you only use
>> HDEC and it triggers while interrupts are off in a critical section?
>> Is the hardware really that broken?
>
> If HDEC expires when interrupts are off, the HDEC interrupt stays
> pending until interrupts get re-enabled.  I'm not sure exactly what
> the conditions are that cause an HDEC interrupt to get lost, but they
> seem to involve at least a partition switch.

On some CPUs, if the top bit of the decrementer is 0 again when you
re-enable the interrupt, the interrupt is lost (so it is actually
level-triggered).  The arch books talk a bit about this AFAIR.


Segher

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-27 20:59             ` Segher Boessenkool
  (?)
@ 2011-05-27 23:19               ` Alexander Graf
  -1 siblings, 0 replies; 103+ messages in thread
From: Alexander Graf @ 2011-05-27 23:19 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Linuxppc-dev, Paul Mackerras, kvm-ppc, KVM list


On 27.05.2011, at 22:59, Segher Boessenkool wrote:

>>>> I do the check there because I was having problems where, if the HDEC
>>>> goes negative before we do the partition switch, we would occasionally
>>>> not get the HDEC interrupt at all until the next time HDEC went
>>>> negative, ~ 8.4 seconds later.
>>> 
>>> Yikes - so HDEC is edge and doesn't even keep the interrupt line up?
>>> That sounds like a serious hardware limitation. What if you only use
>>> HDEC and it triggers while interrupts are off in a critical section?
>>> Is the hardware really that broken?
>> 
>> If HDEC expires when interrupts are off, the HDEC interrupt stays
>> pending until interrupts get re-enabled.  I'm not sure exactly what
>> the conditions are that cause an HDEC interrupt to get lost, but they
>> seem to involve at least a partition switch.
> 
> On some CPUs, if the top bit of the decrementer is 0 again when you re-enable
> the interrupt, the interrupt is lost (so it is actually level-triggered).
> The arch books talk a bit about this AFAIR.

Sure, but that shouldn't happen with HDEC during the odd 50 instructions that it takes to enter the guest :)

Alex

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-27 23:19               ` Alexander Graf
  (?)
@ 2011-05-28  1:07                 ` Segher Boessenkool
  -1 siblings, 0 replies; 103+ messages in thread
From: Segher Boessenkool @ 2011-05-28  1:07 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Linuxppc-dev, Paul Mackerras, KVM list, kvm-ppc

>>> If HDEC expires when interrupts are off, the HDEC interrupt stays
>>> pending until interrupts get re-enabled.  I'm not sure exactly what
>>> the conditions are that cause an HDEC interrupt to get lost, but they
>>> seem to involve at least a partition switch.
>>
>> On some CPUs, if the top bit of the decrementer is 0 again when you
>> re-enable the interrupt, the interrupt is lost (so it is actually
>> level-triggered).
>> The arch books talk a bit about this AFAIR.
>
> Sure, but that shouldn't happen with HDEC during the odd 50 
> instructions that it takes to enter the guest :)

It's more like 500 insns, including some ptesync, so lots of cycles too.
Can another hardware thread be running at the same time?


Segher


^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-28  1:07                 ` Segher Boessenkool
  (?)
@ 2011-05-31 20:26                   ` Jimi Xenidis
  -1 siblings, 0 replies; 103+ messages in thread
From: Jimi Xenidis @ 2011-05-31 20:26 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Alexander Graf, Linuxppc-dev, Paul Mackerras, KVM list, kvm-ppc


On May 27, 2011, at 9:07 PM, Segher Boessenkool wrote:

>>>> If HDEC expires when interrupts are off, the HDEC interrupt stays
>>>> pending until interrupts get re-enabled.  I'm not sure exactly what
>>>> the conditions are that cause an HDEC interrupt to get lost, but they
>>>> seem to involve at least a partition switch.
>>> 
>>> On some CPUs, if the top bit of the decrementer is 0 again when you re-enable
>>> the interrupt, the interrupt is lost (so it is actually level-triggered).
>>> The arch books talk a bit about this AFAIR.
>> 
>> Sure, but that shouldn't happen with HDEC during the odd 50 instructions that it takes to enter the guest :)
> 
> It's more like 500 insns, including some ptesync, so lots of cycles too.

I don't think it's actually that bad.

IIRC the problem is mostly that another interrupt of a higher priority, one that sets MSR[HV], is pending.
This could also be a synchronous instruction on or near the HSRR0 (like a hypercall).
Since almost everything _is_ of a higher priority (externals, VRMA-ish, emulation), that will occur first (or at the same time).
This extends the window where the HDEC could go +ve.

Another way around this is to check, on HV switch, if the HDEC is ever bigger than it should _ever_ be, but what paulus has in his code is actually best, although the value (10?) may be too small.
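
[ For illustration only, a standalone sketch of the kind of check being
  discussed: refuse to enter the guest if HDEC has already expired or
  is about to, so the host takes the interrupt rather than losing it.
  The threshold and function names are made up; this is not the code
  from the patch series. ]

#include <stdio.h>

#define HDEC_THRESHOLD 10   /* ticks; the "10?" value questioned above */

/* hdec would come from mfspr(SPRN_HDEC) on real hardware. */
static int safe_to_enter_guest(long hdec)
{
    /* Negative: already expired.  Small positive: likely to expire
     * during the few hundred instructions of the entry path. */
    return hdec > HDEC_THRESHOLD;
}

int main(void)
{
    long samples[] = { 100000, 5, -1 };
    int i;

    for (i = 0; i < 3; i++)
        printf("HDEC=%ld -> %s\n", samples[i],
               safe_to_enter_guest(samples[i]) ?
               "enter guest" : "stay in host and take the interrupt");
    return 0;
}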

> Can another hardware thread be running at the same time?

I'll leave this question to someone else.
-JX

> 
> 
> Segher
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-31 20:26                   ` Jimi Xenidis
  (?)
@ 2011-05-31 22:34                     ` Segher Boessenkool
  -1 siblings, 0 replies; 103+ messages in thread
From: Segher Boessenkool @ 2011-05-31 22:34 UTC (permalink / raw)
  To: Jimi Xenidis
  Cc: Paul Mackerras, KVM list, Alexander Graf, Linuxppc-dev, kvm-ppc

>>> Sure, but that shouldn't happen with HDEC during the odd 50 
>>> instructions that it takes to enter the guest :)
>>
>> It's more like 500 insns, including some ptesync, so lots of cycles 
>> too.
>
> I don't think its actually that bad.

There's a loop of 128 iterations of 3 insns.

I'm not saying it is actually bad, just that that 50 is slightly off ;-)
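
For reference, a hedged C reconstruction of the kind of flush loop being
described (not quoted from the patch; the starting RB value and the
0x1000 stride are assumptions):

	/*
	 * Flush all 128 TLB congruence classes with tlbiel, then ptesync.
	 * The hand-written assembly equivalent is where the "128 iterations
	 * of 3 insns" (tlbiel, addi, bdnz) come from.
	 */
	static inline void flush_guest_tlb_sketch(void)
	{
		unsigned long rb = 0x800;	/* assumed IS/set encoding */
		int i;

		for (i = 0; i < 128; i++) {
			asm volatile("tlbiel %0" : : "r" (rb) : "memory");
			rb += 0x1000;		/* step to the next congruence class */
		}
		asm volatile("ptesync" : : : "memory");
	}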


Segher

^ permalink raw reply	[flat|nested] 103+ messages in thread

* Re: [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode
  2011-05-31 22:34                     ` Segher Boessenkool
  (?)
@ 2011-06-01  5:11                       ` Paul Mackerras
  -1 siblings, 0 replies; 103+ messages in thread
From: Paul Mackerras @ 2011-06-01  5:11 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Jimi Xenidis, KVM list, Alexander Graf, Linuxppc-dev, kvm-ppc

On Wed, Jun 01, 2011 at 12:34:43AM +0200, Segher Boessenkool wrote:

> There's a loop of 128 iterations of 3 insns.
> 
> I'm not saying it is actually bad, just that that 50 is slightly off ;-)

That would be the TLB invalidation.  On POWER7 we only need to do that
if the virtual cpu last ran on a different physical cpu, or if this
physical cpu last ran a different virtual cpu in the same partition.
So hopefully we don't have to do it very often.

The reason we have to do it in that case is to allow the optimization
where we use tlbiel for TLB invalidations if the guest claims that
the translation being invalidated was only ever used on this virtual
cpu.  That means that we have to guard against stale TLB entries left
behind when a virtual cpu moves from one physical cpu to another.
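
In pseudo-C the guard is roughly the following; the field names
(last_cpu, last_vcpu[]) are illustrative only, not lifted from the
patch:

	/*
	 * Flush the TLB on guest entry only if stale translations from this
	 * guest could still be in it: the vcpu last ran on another physical
	 * cpu, or this physical cpu last ran a different vcpu of the same
	 * partition.
	 */
	if (vcpu->arch.last_cpu != pcpu ||
	    kvm->arch.last_vcpu[pcpu] != vcpu->vcpu_id) {
		flush_guest_tlb_sketch();	/* the 128 x tlbiel loop */
		vcpu->arch.last_cpu = pcpu;
		kvm->arch.last_vcpu[pcpu] = vcpu->vcpu_id;
	}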

Paul.

^ permalink raw reply	[flat|nested] 103+ messages in thread

end of thread, other threads:[~2011-06-01  5:11 UTC | newest]

Thread overview: 103+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-11 10:34 [PATCH 0/13] Hypervisor-mode KVM on POWER7 Paul Mackerras
2011-05-11 10:36 ` [PATCH 01/13] kvm/powerpc: Move fields between struct kvm_vcpu_arch and kvmppc_vcpu_book3s Paul Mackerras
2011-05-11 10:38 ` [PATCH 02/13] kvm/powerpc: Fix kvmppc_core_pending_dec Paul Mackerras
2011-05-11 10:39 ` [PATCH 03/13] kvm/powerpc: Fix the build for 32-bit Book 3S (classic) processors Paul Mackerras
2011-05-12  9:33   ` Alexander Graf
2011-05-12  9:33     ` Alexander Graf
2011-05-12 11:15     ` Paul Mackerras
2011-05-12 11:15       ` Paul Mackerras
2011-05-12 11:16     ` Benjamin Herrenschmidt
2011-05-12 11:16       ` Benjamin Herrenschmidt
2011-05-12 11:57       ` Alexander Graf
2011-05-12 11:57         ` Alexander Graf
2011-05-11 10:40 ` [PATCH 04/13] kvm/powerpc: Split out code from book3s.c into book3s_pr.c Paul Mackerras
2011-05-11 10:41 ` [PATCH 05/13] powerpc, kvm: Rework KVM checks in first-level interrupt handlers Paul Mackerras
2011-05-11 10:42 ` [PATCH 06/13] kvm/powerpc: Deliver program interrupts right away instead of queueing them Paul Mackerras
2011-05-11 10:42 ` [PATCH 07/13] kvm/powerpc: Pass init/destroy vm and prepare/commit memory region ops down Paul Mackerras
2011-05-11 10:43 ` [PATCH 08/13] kvm/powerpc: Move guest enter/exit down into subarch-specific code Paul Mackerras
2011-05-17 18:05   ` Marcelo Tosatti
2011-05-17 18:05     ` Marcelo Tosatti
2011-05-17 18:10     ` Marcelo Tosatti
2011-05-17 18:10       ` Marcelo Tosatti
2011-05-11 10:44 ` [PATCH 09/13] powerpc: Set up LPCR for running guest partitions Paul Mackerras
2011-05-11 10:44 ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
2011-05-12  9:07   ` Avi Kivity
2011-05-12  9:07     ` Avi Kivity
2011-05-16  1:07     ` Paul Mackerras
2011-05-16  1:07       ` Paul Mackerras
2011-05-15 21:58   ` Alexander Graf
2011-05-15 21:58     ` Alexander Graf
2011-05-15 21:58     ` Alexander Graf
2011-05-16  5:58     ` Paul Mackerras
2011-05-16  5:58       ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in Paul Mackerras
2011-05-16  5:58       ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
2011-05-17 10:17       ` Alexander Graf
2011-05-17 10:17         ` Alexander Graf
2011-05-17 10:17         ` Alexander Graf
2011-05-27 10:33         ` Paul Mackerras
2011-05-27 10:33           ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in Paul Mackerras
2011-05-27 10:33           ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
2011-05-27 10:43           ` Alexander Graf
2011-05-27 10:43             ` Alexander Graf
2011-05-27 10:43             ` Alexander Graf
2011-05-27 20:59           ` Segher Boessenkool
2011-05-27 20:59             ` Segher Boessenkool
2011-05-27 20:59             ` Segher Boessenkool
2011-05-27 23:19             ` Alexander Graf
2011-05-27 23:19               ` Alexander Graf
2011-05-27 23:19               ` Alexander Graf
2011-05-28  1:07               ` Segher Boessenkool
2011-05-28  1:07                 ` Segher Boessenkool
2011-05-28  1:07                 ` Segher Boessenkool
2011-05-31 20:26                 ` Jimi Xenidis
2011-05-31 20:26                   ` Jimi Xenidis
2011-05-31 20:26                   ` Jimi Xenidis
2011-05-31 22:34                   ` Segher Boessenkool
2011-05-31 22:34                     ` Segher Boessenkool
2011-05-31 22:34                     ` Segher Boessenkool
2011-06-01  5:11                     ` Paul Mackerras
2011-06-01  5:11                       ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in Paul Mackerras
2011-06-01  5:11                       ` [PATCH 10/13] kvm/powerpc: Add support for Book3S processors in hypervisor mode Paul Mackerras
2011-05-11 10:45 ` [PATCH 11/13] kvm/powerpc: Handle some PAPR hcalls in the kernel Paul Mackerras
2011-05-17  7:54   ` Alexander Graf
2011-05-17  7:54     ` Alexander Graf
2011-05-17 10:28     ` Paul Mackerras
2011-05-11 10:46 ` [PATCH 12/13] kvm/powerpc: Accelerate H_PUT_TCE by implementing it in real mode Paul Mackerras
2011-05-17  8:01   ` Alexander Graf
2011-05-17  8:01     ` Alexander Graf
2011-05-17  9:11     ` Benjamin Herrenschmidt
2011-05-17  9:11       ` Benjamin Herrenschmidt
2011-05-17  9:31       ` Alexander Graf
2011-05-17  9:31         ` Alexander Graf
2011-05-17  9:35         ` Benjamin Herrenschmidt
2011-05-17  9:35           ` Benjamin Herrenschmidt
2011-05-17  9:39           ` Alexander Graf
2011-05-17  9:39             ` Alexander Graf
2011-05-11 10:46 ` [PATCH 13/13] kvm/powerpc: Allow book3s_hv guests to use SMT processor modes Paul Mackerras
2011-05-11 13:44   ` Christoph Hellwig
2011-05-11 21:17     ` Paul Mackerras
2011-05-11 21:17       ` Paul Mackerras
2011-05-17  8:21   ` Alexander Graf
2011-05-17  8:21     ` Alexander Graf
2011-05-17 10:44     ` Paul Mackerras
2011-05-17 11:36       ` Alexander Graf
2011-05-17 11:36         ` Alexander Graf
2011-05-19  6:06         ` Paul Mackerras
2011-05-17  9:46 ` [PATCH 0/13] Hypervisor-mode KVM on POWER7 Alexander Graf
2011-05-17  9:46   ` Alexander Graf
2011-05-17 11:15   ` Paul Mackerras
2011-05-17 11:38     ` Alexander Graf
2011-05-17 11:38       ` Alexander Graf
2011-05-17 11:42       ` Avi Kivity
2011-05-17 11:42         ` Avi Kivity
2011-05-19  5:22         ` Paul Mackerras
2011-05-19  5:22           ` Paul Mackerras
2011-05-19  6:01           ` Alexander Graf
2011-05-19  6:01             ` Alexander Graf
2011-05-19  6:01             ` Alexander Graf
2011-05-21 16:41           ` Alexander Graf
2011-05-21 16:41             ` Alexander Graf
2011-05-21 17:00             ` Alexander Graf
2011-05-21 17:00               ` Alexander Graf
2011-05-21 18:15               ` Alexander Graf
2011-05-21 18:15                 ` Alexander Graf
