From: Paul Mackerras <paulus@samba.org>
To: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@ozlabs.org, Alexander Graf <agraf@suse.de>
Subject: [RFC PATCH 06/11] KVM: PPC: Use Linux page tables in h_enter and map_vrma
Date: Thu, 17 Nov 2011 09:59:48 +1100
Message-ID: <20111116225948.GG26985@bloggs.ozlabs.ibm.com>
In-Reply-To: <20111116225055.GA26985@bloggs.ozlabs.ibm.com>

This changes kvmppc_h_enter() and kvmppc_map_vrma() to get the real
page numbers that they put into the guest HPT from the Linux page
tables for our userspace, as an alternative to getting them from the
slot_pfns arrays.  In future this will enable us to avoid pinning all
of guest memory on POWER7, but we will still have to pin all guest
memory on PPC970 since it doesn't support virtual partition memory.

This also exports find_linux_pte_or_hugepte(), since we need it when
KVM is built as a module.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
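For review convenience, the core of the new translation path, used by
both kvmppc_map_vrma() and kvmppc_h_enter(), boils down to the sketch
below.  This is a paraphrase for reviewers, not the patch itself:
error handling and the slot_pfns fallback are omitted, and on the
real-mode H_ENTER path interrupts are already disabled, so the
explicit local_irq_disable()/local_irq_enable() pair is only needed
on the kvmppc_map_vrma() side.

	/*
	 * Sketch only: translate a guest frame to a host real address
	 * via the Linux page tables, without assuming the page is
	 * pinned.
	 */
	unsigned long hva, pa = 0;
	unsigned int shift;
	pte_t *ptep;

	hva = gfn_to_hva_memslot(memslot, gfn);

	local_irq_disable();	/* keep the page tables from being
				 * freed under us while we walk them */
	ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
	if (ptep) {
		unsigned long pte_size = shift ? (1ul << shift) : PAGE_SIZE;

		/* the backing page must cover the HPTE page size */
		if (pte_size >= psize)
			pa = kvmppc_read_update_linux_pte(ptep) << PAGE_SHIFT;
	}
	local_irq_enable();
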
 arch/powerpc/include/asm/kvm_book3s_64.h |   31 +++++++
 arch/powerpc/include/asm/kvm_host.h      |    2 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |   26 +++++-
 arch/powerpc/kvm/book3s_hv.c             |    1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  127 ++++++++++++++++--------------
 arch/powerpc/mm/hugetlbpage.c            |    2 +
 6 files changed, 125 insertions(+), 64 deletions(-)
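
For anyone who would rather not read ppc64 inline assembly, the
ldarx/stdcx. loop in the new kvmppc_read_update_linux_pte() is roughly
the following C, sketched here with a generic cmpxchg() standing in
for the raw reservation instructions the patch actually uses:

	/*
	 * Rough C equivalent of the new helper; _PAGE_BUSY acts as a
	 * per-PTE lock against concurrent updaters.
	 */
	static unsigned long read_update_linux_pte_sketch(pte_t *p)
	{
		unsigned long old, pfn = 0;
		pte_t pte;

		/* spin until _PAGE_BUSY is clear, then set it atomically */
		do {
			old = pte_val(*p);
		} while ((old & _PAGE_BUSY) ||
			 cmpxchg((unsigned long *)p, old,
				 old | _PAGE_BUSY) != old);

		pte = __pte(old);
		if (pte_present(pte) && pte_write(pte)) {
			pfn = pte_pfn(pte);
			pte = pte_mkdirty(pte_mkyoung(pte));
		}

		/* the value stored has _PAGE_BUSY clear, so this
		 * store also unlocks the PTE */
		*p = pte;
		return pfn;
	}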

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9243f35..307e649 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -121,4 +121,35 @@ static inline unsigned long *kvmppc_pfn_entry(struct kvm *kvm,
 }
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 
+/*
+ * Lock and read a Linux PTE.  If it is present and writable, atomically
+ * set the dirty and referenced bits and return the PFN; otherwise return 0.
+ */
+static inline unsigned long kvmppc_read_update_linux_pte(pte_t *p)
+{
+	pte_t pte, tmp;
+	unsigned long pfn = 0;
+
+	/* wait until _PAGE_BUSY is clear then set it atomically */
+	__asm__ __volatile__ (
+		"1:	ldarx	%0,0,%3\n"
+		"	andi.	%1,%0,%4\n"
+		"	bne-	1b\n"
+		"	ori	%1,%0,%4\n"
+		"	stdcx.	%1,0,%3\n"
+		"	bne-	1b"
+		: "=&r" (pte), "=&r" (tmp), "=m" (*p)
+		: "r" (p), "i" (_PAGE_BUSY)
+		: "cc");
+
+	if (pte_present(pte) && pte_write(pte)) {
+		pfn = pte_pfn(pte);
+		pte = pte_mkdirty(pte_mkyoung(pte));
+	}
+
+	*p = pte;	/* clears _PAGE_BUSY */
+
+	return pfn;
+}
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 93b7e04..f211643 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -32,6 +32,7 @@
 #include <linux/atomic.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
+#include <asm/page.h>
 
 #define KVM_MAX_VCPUS		NR_CPUS
 #define KVM_MAX_VCORES		NR_CPUS
@@ -432,6 +433,7 @@ struct kvm_vcpu_arch {
 	struct list_head run_list;
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
+	pgd_t *pgdir;
 #endif
 };
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 4d558c4..99187db 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -111,13 +111,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 	unsigned long npages;
 	unsigned long pfn;
 	unsigned long *hpte;
-	unsigned long addr, hash;
+	unsigned long addr, hash, hva;
 	unsigned long psize;
 	int porder;
 	struct revmap_entry *rev;
 	struct kvm_memory_slot *memslot;
 	unsigned long hp0, hp1;
 	unsigned long *pfns;
+	pte_t *p;
+	unsigned int shift;
 
 	memslot = &kvm->memslots->memslots[mem->slot];
 	pfns = kvm->arch.slot_pfns[mem->slot];
@@ -138,10 +140,26 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
 		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
 	for (i = 0; i < npages; ++i) {
-		pfn = pfns[i];
-		if (!pfn)
-			continue;
 		addr = i << porder;
+		if (pfns) {
+			pfn = pfns[i];
+		} else {
+			pfn = 0;
+			local_irq_disable();
+			hva = addr + mem->userspace_addr;
+			p = find_linux_pte_or_hugepte(current->mm->pgd, hva,
+						      &shift);
+			if (p && (psize == PAGE_SIZE || shift == porder))
+				pfn = kvmppc_read_update_linux_pte(p);
+			local_irq_enable();
+		}
+
+		if (!pfn) {
+			pr_err("KVM: Couldn't find page for VRMA at %lx\n",
+			       addr);
+			break;
+		}
+
 		/* can't use hpt_hash since va > 64 bits */
 		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 		/*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7434258..cb21845 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -868,6 +868,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	flush_altivec_to_thread(current);
 	flush_vsx_to_thread(current);
 	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+	vcpu->arch.pgdir = current->mm->pgd;
 
 	do {
 		r = kvmppc_run_vcpu(run, vcpu);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5438442..1778091 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -59,37 +59,27 @@ static struct kvm_memory_slot *builtin_gfn_to_memslot(struct kvm *kvm,
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 		    long pte_index, unsigned long pteh, unsigned long ptel)
 {
-	unsigned long porder;
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long i, pa, gpa, gfn, psize;
+	unsigned long slot_fn, hva;
 	unsigned long *hpte;
 	struct revmap_entry *rev;
 	unsigned long g_ptel = ptel;
 	struct kvm_memory_slot *memslot;
 	unsigned long *pfnp, pte_size;
+	unsigned long is_io;
+	pte_t *ptep;
+	unsigned int shift;
 
-	/* only handle 4k, 64k and 16M pages for now */
-	porder = 12;
-	if (pteh & HPTE_V_LARGE) {
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (ptel & 0xf000) == 0x1000) {
-			/* 64k page */
-			porder = 16;
-		} else if ((ptel & 0xff000) == 0) {
-			/* 16M page */
-			porder = 24;
-			/* lowest AVA bit must be 0 for 16M pages */
-			if (pteh & 0x80)
-				return H_PARAMETER;
-		} else
-			return H_PARAMETER;
-	}
-	psize = (1ul << porder);
+	psize = hpte_page_size(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
 
-	/* We do not allow the guest to set key 31 which is reserved
-	 * for MMIO emulation. We don't want to allow MMIO emulation
-	 * to be used to access RAM due to possible races between
-	 * emulation and TLB invalidations.
+	/*
+	 * We do not allow the guest to set key 31 which is reserved
+	 * for MMIO emulation and non-present RAM pages.  We don't want
+	 * to allow MMIO emulation to be used to access RAM due to possible
+	 * races between emulation and TLB invalidations.
 	 *
 	 * Emulated accesses are emulated by looking at the hash for
 	 * translation once, then performing the access later. The
@@ -106,66 +96,79 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
 		return H_PARAMETER;
 
-	/* Figure out the type of page and handle accordingly,
-	 * first check for RAM pages
-	 */
+	/* Find the memslot (if any) for this address */
 	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 	gfn = gpa >> PAGE_SHIFT;
 	memslot = builtin_gfn_to_memslot(kvm, gfn);
+	pa = 0;
+	is_io = 1;
 	if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
-		unsigned long egfn = (gpa + psize) >> PAGE_SHIFT;
-
 		/* Check if the requested page fits entirely in the memslot. */
-		if ((egfn - memslot->base_gfn) > memslot->npages)
+		slot_fn = gfn - memslot->base_gfn;
+		if (slot_fn + (psize >> PAGE_SHIFT) > memslot->npages)
 			return H_PARAMETER;
+		is_io = memslot->flags & KVM_MEMSLOT_IO;
 
-		/* Check for MMIO pass-through */
-		if (memslot->flags & KVM_MEMSLOT_IO) {
-			/* Check WIMG */
-			if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
-			    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
-				return H_PARAMETER;		
-		} else {
-			/* System RAM */
-			if (porder > kvm->arch.slot_page_order[memslot->id])
+		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
+		if (pfnp) {
+			pte_size = 1ul << kvm->arch.slot_page_order[memslot->id];
+			if (!is_io && psize > pte_size)
 				return H_PARAMETER;
-
-			/* Check WIMG */
-			if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
-			    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+			pfnp = real_vmalloc_addr(pfnp);
+			pa = *pfnp << PAGE_SHIFT;
+			if (!pa)
 				return H_PARAMETER;
+		} else {
+			/* Translate to host virtual address */
+			hva = gfn_to_hva_memslot(memslot, gfn);
+
+			/* Look up the Linux PTE for the backing page */
+			ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva,
+							 &shift);
+			if (ptep) {
+				if (shift)
+					pte_size = 1ul << shift;
+				else
+					pte_size = PAGE_SIZE;
+				if (pte_size < psize)
+					return H_PARAMETER;
+				pa = kvmppc_read_update_linux_pte(ptep);
+				pa <<= PAGE_SHIFT;
+			}
 		}
-		pfnp = kvmppc_pfn_entry(kvm, memslot, gfn);
-		if (!pfnp)
-			return H_PARAMETER;
-		pfnp = real_vmalloc_addr(pfnp);
-		pa = *pfnp << PAGE_SHIFT;
-		if (!pa)
-			return H_PARAMETER;
-		pte_size = 1ul << kvm->arch.slot_page_order[memslot->id];
-		pa |= gpa & (pte_size - 1);
+		if (pa && pte_size > psize)
+			pa |= gpa & (pte_size - 1);
 
 		/* check if the start pfn has page size alignment */
 		if (pa & (psize - 1))
 			return H_PARAMETER;
 		ptel &= ~(HPTE_R_PP0 - psize);
 		ptel |= pa;
-
+	}
+	pteh &= ~0x60UL;
+
+	/* Check WIMG */
+	if (is_io) {
+		if ((ptel & HPTE_R_WIMG) != (HPTE_R_I) &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_I | HPTE_R_G))
+			return H_PARAMETER;
 	} else {
-		/* Else check for MMIO emulation */
-		if (!cpu_has_feature(CPU_FTR_ARCH_206))
+		if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+		    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
 			return H_PARAMETER;
+	}
 
-		/* Leave RPN intact */
-		/* We force no-execute and set key to 1 to cause
-		 * faults on access.
-		 * XXX Should we instead just return H_PARAMETER if
-		 * N isn't already set ?
+	if (!pa) {
+		/*
+		 * If this is a non-present page for any reason
+		 * and this is a POWER7, set the key to 31 and set N.
+		 * On 970 we have to have all pages present.
 		 */
+		if (!cpu_has_feature(CPU_FTR_ARCH_206))
+			return H_PARAMETER;
 		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
 	}
-	pteh &= ~0x60UL;
-	
+
 	if (pte_index >= HPT_NPTE)
 		return H_PARAMETER;
 	if (likely((flags & H_EXACT) == 0)) {
@@ -190,10 +193,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	if (rev)
 		rev->guest_rpte = g_ptel;
+
 	hpte[1] = ptel;
+
+	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
 	hpte[0] = pteh;
 	asm volatile("ptesync" : : : "memory");
+
 	vcpu->arch.gpr[4] = pte_index;
 	return H_SUCCESS;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1..701e920 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -11,6 +11,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/module.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -105,6 +106,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 		*shift = hugepd_shift(*hpdp);
 	return hugepte_offset(hpdp, ea, pdshift);
 }
+EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
 
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-- 
1.7.7.2
