kvm.vger.kernel.org archive mirror
* [PATCH] KVM: x86: Fix recording of guest steal time / preempted status
@ 2021-11-01 14:09 David Woodhouse
  2021-11-02 16:38 ` [PATCH v2] " David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-01 14:09 UTC (permalink / raw)
  To: kvm
  Cc: Boris Ostrovsky, Joao Martins, Paolo Bonzini, jmattson,
	wanpengli, seanjc, vkuznets, mtosatti, joro, karahmed


From: David Woodhouse <dwmw@amazon.co.uk>

In commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is
not missed") we switched to using a gfn_to_pfn_cache for accessing the
guest steal time structure in order to allow for an atomic xchg of the
preempted field. This has a couple of problems.

Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the
atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a
guest vCPU using an IOMEM page for its steal time would never have its
preempted field set.

Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it
should have been. There are two stages to the GFN → PFN conversion;
first the GFN is converted to a userspace HVA, and then that HVA is
looked up in the process page tables to find the underlying host PFN.
Correct invalidation of the latter would require being hooked up to the
MMU notifiers, but that doesn't happen — so it just keeps mapping and
unmapping the *wrong* PFN after the userspace page tables change.

In the !IOMEM case at least the stale page *is* pinned all the time it's
cached, so it won't be freed and reused by anyone else while still
receiving the steal time updates. (This kind of makes a mockery of this
repeated map/unmap dance which I thought was supposed to avoid pinning
the page. AFAICT we might as well have just kept a kernel mapping of it
all the time).

But there's no point in a kernel mapping of it anyway, when in all cases
we care about, we have a perfectly serviceable userspace HVA for it. We
just need to implement the atomic xchg on the userspace address with
appropriate exception handling, which is fairly trivial.
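
Purely as illustration (an assumed standalone wrapper around existing
helpers, not code touched by this patch), the two-stage lookup is
roughly:

static int gfn_to_pfn_two_stage(struct kvm *kvm, gfn_t gfn, kvm_pfn_t *pfn)
{
	unsigned long hva;
	struct page *page;

	/* Stage 1: GFN -> HVA via the memslots. This is what the
	 * generation check on a gfn_to_hva_cache covers. */
	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	/* Stage 2: HVA -> PFN via the userspace page tables. Nothing
	 * revalidates this once cached, so it goes stale as soon as
	 * userspace remaps the HVA. (The reference taken here is also
	 * why the cached page stays pinned.) */
	if (get_user_pages_unlocked(hva, 1, &page, FOLL_WRITE) != 1)
		return -EFAULT;

	*pfn = page_to_pfn(page);
	return 0;
}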

Cc: stable@vger.kernel.org
Fixes: b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/x86.c              | 109 +++++++++++++++++++++++---------
 2 files changed, 79 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 63d70fa34d3a..02ec330dbb4a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -752,7 +752,7 @@ struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
-		struct gfn_to_pfn_cache cache;
+		struct gfn_to_hva_cache cache;
 	} st;
 
 	u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a116999f601..14c44e1c1bc7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3195,8 +3195,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	u64 steal;
+	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
 		kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3209,87 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	/* -EAGAIN is returned in atomic context so we can just return. */
-	if (kvm_map_gfn(vcpu->kvm, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-			&map, &vcpu->arch.st.cache, false))
+	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
+
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+		gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+		/* We rely on the fact that it fits in a single page. */
+		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+			return;
+	}
+
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	if (!user_access_begin(st, sizeof(*st)))
+		return;
 
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
-	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-		u8 st_preempted = xchg(&st->preempted, 0);
+	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+		int err;
+		u8 st_preempted = 0;
+
+		asm volatile("1:\t" LOCK_PREFIX "xchgb %0, %1\n"
+			     "\txor %2, %2\n"
+			     "2:\n"
+			     "\t.section .fixup,\"ax\"\n"
+			     "3:\tmovl %3, %2\n"
+			     "\tjmp\t2b\n"
+			     "\t.previous\n"
+			     _ASM_EXTABLE_UA(1b, 3b)
+			     : "=r" (st_preempted)
+			     : "m" (st->preempted),
+			       "r" (err),
+			       "i" (-EFAULT),
+			       "0" (st_preempted));
+		if (err)
+			goto out;
+
+		user_access_end();
+
+		vcpu->arch.st.preempted = 0;
 
 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 				       st_preempted & KVM_VCPU_FLUSH_TLB);
 		if (st_preempted & KVM_VCPU_FLUSH_TLB)
 			kvm_vcpu_flush_tlb_guest(vcpu);
+
+		if (!user_access_begin(st, sizeof(*st)))
+			return;
 	} else {
-		st->preempted = 0;
+		unsafe_put_user(0, &st->preempted, out);
+		vcpu->arch.st.preempted = 0;
 	}
 
-	vcpu->arch.st.preempted = 0;
-
-	if (st->version & 1)
-		st->version += 1;  /* first time write, random junk */
+	unsafe_get_user(version, &st->version, out);
+	if (version & 1)
+		version += 1;  /* first time write, random junk */
 
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
 	smp_wmb();
 
-	st->steal += current->sched_info.run_delay -
+	unsafe_get_user(steal, &st->steal, out);
+	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	unsafe_put_user(steal, &st->steal, out);
 
-	smp_wmb();
-
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
-	kvm_unmap_gfn(vcpu->kvm, &map, &vcpu->arch.st.cache, true, false);
+ out:
+	user_access_end();
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4286,8 +4329,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	static const u8 preempted = KVM_VCPU_PREEMPTED;
 
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
@@ -4295,16 +4340,21 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.st.preempted)
 		return;
 
-	if (kvm_map_gfn(vcpu->kvm, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-			&vcpu->arch.st.cache, true))
+	/* This happens on process exit */
+	if (unlikely(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
 
-	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+		return;
 
-	kvm_unmap_gfn(vcpu->kvm, &map, &vcpu->arch.st.cache, true, true);
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -10818,11 +10868,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
 	int idx;
 
-	kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
 	kvmclock_reset(vcpu);
 
 	static_call(kvm_x86_vcpu_free)(vcpu);
-- 
2.31.1



* [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-01 14:09 [PATCH] KVM: x86: Fix recording of guest steal time / preempted status David Woodhouse
@ 2021-11-02 16:38 ` David Woodhouse
  2021-11-02 17:01   ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-02 16:38 UTC (permalink / raw)
  To: kvm
  Cc: Boris Ostrovsky, Joao Martins, Paolo Bonzini, jmattson,
	wanpengli, seanjc, vkuznets, mtosatti, joro, karahmed


From: David Woodhouse <dwmw@amazon.co.uk>

In commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is
not missed") we switched to using a gfn_to_pfn_cache for accessing the
guest steal time structure in order to allow for an atomic xchg of the
preempted field. This has a couple of problems.

Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the
atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a
guest vCPU using an IOMEM page for its steal time would never have its
preempted field set.

Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it
should have been. There are two stages to the GFN → PFN conversion;
first the GFN is converted to a userspace HVA, and then that HVA is
looked up in the process page tables to find the underlying host PFN.
Correct invalidation of the latter would require being hooked up to the
MMU notifiers, but that doesn't happen — so it just keeps mapping and
unmapping the *wrong* PFN after the userspace page tables change.

In the !IOMEM case at least the stale page *is* pinned all the time it's
cached, so it won't be freed and reused by anyone else while still
receiving the steal time updates. (This kind of makes a mockery of this
repeated map/unmap dance which I thought was supposed to avoid pinning
the page. AFAICT we might as well have just kept a kernel mapping of it
all the time).

But there's no point in a kernel mapping of it anyway, when in all cases
we care about, we have a perfectly serviceable userspace HVA for it. We
just need to implement the atomic xchg on the userspace address with
appropriate exception handling, which is fairly trivial.

Cc: stable@vger.kernel.org
Fixes: b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
v2: Fix asm constraints (err is an output). Rebase so that it applies cleanly
    before the Xen series (which changes the argument to kvm_map_gfn() that
    is removed in this patch anyway.)

 
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/x86.c              | 107 +++++++++++++++++++++++---------
 2 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 13f64654dfff..750f74da9793 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -751,7 +751,7 @@ struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
-		struct gfn_to_pfn_cache cache;
+		struct gfn_to_hva_cache cache;
 	} st;
 
 	u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfe0de3008a6..e6905a1068ad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3195,8 +3195,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	u64 steal;
+	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
 		kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3209,87 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	/* -EAGAIN is returned in atomic context so we can just return. */
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-			&map, &vcpu->arch.st.cache, false))
+	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
+
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+		gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+		/* We rely on the fact that it fits in a single page. */
+		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+			return;
+	}
+
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	if (!user_access_begin(st, sizeof(*st)))
+		return;
 
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
 	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-		u8 st_preempted = xchg(&st->preempted, 0);
+		u8 st_preempted = 0;
+		int err;
+
+		asm volatile("1:\t" LOCK_PREFIX "xchgb %0, %2\n"
+			     "\txor %1, %1\n"
+			     "2:\n"
+			     "\t.section .fixup,\"ax\"\n"
+			     "3:\tmovl %3, %1\n"
+			     "\tjmp\t2b\n"
+			     "\t.previous\n"
+			     _ASM_EXTABLE_UA(1b, 3b)
+			     : "=r" (st_preempted),
+			       "=r" (err)
+			     : "m" (st->preempted),
+			       "i" (-EFAULT),
+			       "0" (st_preempted));
+		if (err)
+			goto out;
+
+		user_access_end();
+
+		vcpu->arch.st.preempted = 0;
 
 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 				       st_preempted & KVM_VCPU_FLUSH_TLB);
 		if (st_preempted & KVM_VCPU_FLUSH_TLB)
 			kvm_vcpu_flush_tlb_guest(vcpu);
+
+		if (!user_access_begin(st, sizeof(*st)))
+			return;
 	} else {
-		st->preempted = 0;
+		unsafe_put_user(0, &st->preempted, out);
+		vcpu->arch.st.preempted = 0;
 	}
 
-	vcpu->arch.st.preempted = 0;
-
-	if (st->version & 1)
-		st->version += 1;  /* first time write, random junk */
+	unsafe_get_user(version, &st->version, out);
+	if (version & 1)
+		version += 1;  /* first time write, random junk */
 
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
 	smp_wmb();
 
-	st->steal += current->sched_info.run_delay -
+	unsafe_get_user(steal, &st->steal, out);
+	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	unsafe_put_user(steal, &st->steal, out);
 
-	smp_wmb();
-
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+	user_access_end();
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4285,8 +4328,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	static const u8 preempted = KVM_VCPU_PREEMPTED;
 
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
@@ -4294,16 +4339,21 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.st.preempted)
 		return;
 
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-			&vcpu->arch.st.cache, true))
+	/* This happens on process exit */
+	if (unlikely(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
 
-	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+		return;
 
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -10817,11 +10867,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
 	int idx;
 
-	kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
 	kvmclock_reset(vcpu);
 
 	static_call(kvm_x86_vcpu_free)(vcpu);
-- 
2.25.1



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 16:38 ` [PATCH v2] " David Woodhouse
@ 2021-11-02 17:01   ` Paolo Bonzini
  2021-11-02 17:11     ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-02 17:01 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 02/11/21 17:38, David Woodhouse wrote:
> This kind of makes a mockery of this
> repeated map/unmap dance which I thought was supposed to avoid pinning
> the page

The map/unmap dance is supposed to catch the moment where you'd look at 
a stale cache, by giving the non-atomic code a chance to update the 
gfn->pfn mapping.

The unmap is also the moment where you can mark the page as dirty.

Paolo



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:01   ` Paolo Bonzini
@ 2021-11-02 17:11     ` David Woodhouse
  2021-11-02 17:19       ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-02 17:11 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed


On Tue, 2021-11-02 at 18:01 +0100, Paolo Bonzini wrote:
> On 02/11/21 17:38, David Woodhouse wrote:
> > This kind of makes a mockery of this
> > repeated map/unmap dance which I thought was supposed to avoid pinning
> > the page
> 
> The map/unmap dance is supposed to catch the moment where you'd look at 
> a stale cache, by giving the non-atomic code a chance to update the 
> gfn->pfn mapping.
> 

It might have a *chance* to do so, but it doesn't actually do it.

As noted, a GFN→PFN mapping is really a GFN→HVA→PFN mapping. And the
non-atomic code *does* update the GFN→HVA part of that, correctly
looking at the memslots generation etc.. 

But it pays absolutely no attention to the *second* part, and assumes
that the HVA→PFN mapping in the userspace page tables will never
change.

Which isn't necessarily true, even if the underlying physical page *is*
pinned to avoid most cases (ksm, swap, etc.) of the *kernel* changing
it. Userspace still can.

> The unmap is also the moment where you can mark the page as dirty.

Sure, but it's the wrong page :)

It's not necessarily the page that is at that userspace HVA, and hence
in the guest's EPT at that GFN any more.

In my Xen event channel series, I added a 'mmap a page from /dev/zero
over the shared_info page after it's active' torture test to
demonstrate this and check it was fixed. I suppose we could do the same
in the steal_time test...?
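
In rough outline (a sketch which assumes the test already knows the
userspace HVA backing the registered GPA; not the actual selftest
code), that torture test boils down to:

#include <assert.h>
#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

/* Replace the backing of 'hva' after the guest has registered the GPA,
 * so the host-side HVA->PFN translation changes under a stale cache. */
static void remap_backing_page(void *hva, size_t page_size)
{
	int fd = open("/dev/zero", O_RDWR);
	void *p;

	assert(fd >= 0);
	/* MAP_FIXED atomically replaces the existing mapping at 'hva'. */
	p = mmap(hva, page_size, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_FIXED, fd, 0);
	assert(p == hva);
	close(fd);
}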


* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:11     ` David Woodhouse
@ 2021-11-02 17:19       ` Paolo Bonzini
  2021-11-02 17:26         ` David Woodhouse
                           ` (2 more replies)
  0 siblings, 3 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-02 17:19 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 02/11/21 18:11, David Woodhouse wrote:
> On Tue, 2021-11-02 at 18:01 +0100, Paolo Bonzini wrote:
>> On 02/11/21 17:38, David Woodhouse wrote:
>>> This kind of makes a mockery of this
>>> repeated map/unmap dance which I thought was supposed to avoid pinning
>>> the page
>>
>> The map/unmap dance is supposed to catch the moment where you'd look at
>> a stale cache, by giving the non-atomic code a chance to update the
>> gfn->pfn mapping.
>>
> 
> It might have *chance* to do so, but it doesn't actually do it.
> 
> As noted, a GFN→PFN mapping is really a GFN→HVA→PFN mapping. And the
> non-atomic code *does* update the GFN→HVA part of that, correctly
> looking at the memslots generation etc..
> 
> But it pays absolutely no attention to the *second* part, and assumes
> that the HVA→PFN mapping in the userspace page tables will never
> change.
> 
> Which isn't necessarily true, even if the underlying physical page *is*
> pinned to avoid most cases (ksm, swap, etc.) of the *kernel* changing
> it. Userspace still can.

Yes, I agree.  What I am saying is that:

- the map/unmap dance is not (entirely) about whether to pin the page

- the map/unmap API is not a bad API, just an incomplete implementation

And I think the above comment confuses both points above.

>> The unmap is also the moment where you can mark the page as dirty.
> 
> Sure, but it's the wrong page :)

The GFN _also_ has to be marked dirty.

Paolo



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:19       ` Paolo Bonzini
@ 2021-11-02 17:26         ` David Woodhouse
  2021-11-02 17:36         ` [PATCH v3] " David Woodhouse
  2021-11-03  9:47         ` [PATCH v2] " David Woodhouse
  2 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-02 17:26 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed


On Tue, 2021-11-02 at 18:19 +0100, Paolo Bonzini wrote:
> Yes, I agree.  What I am saying is that:
> 
> - the map/unmap dance is not (entirely) about whether to pin the page
> - the map/unmap API is not a bad API, just an incomplete implementation
> 

Yep, fair enough. The Xen evtchn series contains what I believe is
necessary to make it a complete implementation. But in *this* case it's
fairly gratuitous since, as noted, we already *have* a perfectly
serviceable mapping.

> The GFN _also_ has to be marked dirty.

Argh, yes. I forgot to do that. Will fix in v3. Thanks.


* [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:19       ` Paolo Bonzini
  2021-11-02 17:26         ` David Woodhouse
@ 2021-11-02 17:36         ` David Woodhouse
  2021-11-11 13:23           ` Paolo Bonzini
  2021-11-03  9:47         ` [PATCH v2] " David Woodhouse
  2 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-02 17:36 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed


From 187eaf32966670d11965e2e692de2ba8fdc037f4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 1 Nov 2021 11:55:07 +0000
Subject: [PATCH 1/7] KVM: x86: Fix recording of guest steal time / preempted
 status
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is
not missed") we switched to using a gfn_to_pfn_cache for accessing the
guest steal time structure in order to allow for an atomic xchg of the
preempted field. This has a couple of problems.

Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the
atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a
guest vCPU using an IOMEM page for its steal time would never have its
preempted field set.

Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it
should have been. There are two stages to the GFN → PFN conversion;
first the GFN is converted to a userspace HVA, and then that HVA is
looked up in the process page tables to find the underlying host PFN.
Correct invalidation of the latter would require being hooked up to the
MMU notifiers, but that doesn't happen — so it just keeps mapping and
unmapping the *wrong* PFN after the userspace page tables change.

In the !IOMEM case at least the stale page *is* pinned all the time it's
cached, so it won't be freed and reused by anyone else while still
receiving the steal time updates. (This kind of makes a mockery of this
repeated map/unmap dance which I thought was supposed to avoid pinning
the page. AFAICT we might as well have just kept a kernel mapping of it
all the time).

But there's no point in a kernel mapping of it anyway, when in all cases
we care about, we have a perfectly serviceable userspace HVA for it. We
just need to implement the atomic xchg on the userspace address with
appropriate exception handling, which is fairly trivial.

Cc: stable@vger.kernel.org
Fixes: b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
v2: Fix asm constraints (err is an output). Rebase so that it applies cleanly
    before the Xen series (which changes the argument to kvm_map_gfn() that
    is removed in this patch anyway.)
v3: Mark the GFN dirty after writing it.

 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/x86.c              | 111 +++++++++++++++++++++++---------
 2 files changed, 82 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 13f64654dfff..750f74da9793 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -751,7 +751,7 @@ struct kvm_vcpu_arch {
 		u8 preempted;
 		u64 msr_val;
 		u64 last_steal;
-		struct gfn_to_pfn_cache cache;
+		struct gfn_to_hva_cache cache;
 	} st;
 
 	u64 l1_tsc_offset;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bfe0de3008a6..b49ab3188942 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3195,8 +3195,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	u64 steal;
+	u32 version;
 
 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
 		kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3209,89 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	/* -EAGAIN is returned in atomic context so we can just return. */
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-			&map, &vcpu->arch.st.cache, false))
+	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
+
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+		gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+		/* We rely on the fact that it fits in a single page. */
+		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+			return;
+	}
+
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	if (!user_access_begin(st, sizeof(*st)))
+		return;
 
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
 	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-		u8 st_preempted = xchg(&st->preempted, 0);
+		u8 st_preempted = 0;
+		int err;
+
+		asm volatile("1:\t" LOCK_PREFIX "xchgb %0, %2\n"
+			     "\txor %1, %1\n"
+			     "2:\n"
+			     "\t.section .fixup,\"ax\"\n"
+			     "3:\tmovl %3, %1\n"
+			     "\tjmp\t2b\n"
+			     "\t.previous\n"
+			     _ASM_EXTABLE_UA(1b, 3b)
+			     : "=r" (st_preempted),
+			       "=r" (err)
+			     : "m" (st->preempted),
+			       "i" (-EFAULT),
+			       "0" (st_preempted));
+		if (err)
+			goto out;
+
+		user_access_end();
+
+		vcpu->arch.st.preempted = 0;
 
 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 				       st_preempted & KVM_VCPU_FLUSH_TLB);
 		if (st_preempted & KVM_VCPU_FLUSH_TLB)
 			kvm_vcpu_flush_tlb_guest(vcpu);
+
+		if (!user_access_begin(st, sizeof(*st)))
+			goto dirty;
 	} else {
-		st->preempted = 0;
+		unsafe_put_user(0, &st->preempted, out);
+		vcpu->arch.st.preempted = 0;
 	}
 
-	vcpu->arch.st.preempted = 0;
-
-	if (st->version & 1)
-		st->version += 1;  /* first time write, random junk */
+	unsafe_get_user(version, &st->version, out);
+	if (version & 1)
+		version += 1;  /* first time write, random junk */
 
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
 	smp_wmb();
 
-	st->steal += current->sched_info.run_delay -
+	unsafe_get_user(steal, &st->steal, out);
+	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	unsafe_put_user(steal, &st->steal, out);
 
-	smp_wmb();
-
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+	user_access_end();
+ dirty:
+	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4285,8 +4330,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	static const u8 preempted = KVM_VCPU_PREEMPTED;
 
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
@@ -4294,16 +4341,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.st.preempted)
 		return;
 
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-			&vcpu->arch.st.cache, true))
+	/* This happens on process exit */
+	if (unlikely(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
+
+	if (unlikely(slots->generation != ghc->generation ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+		return;
 
-	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -10817,11 +10871,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
 	int idx;
 
-	kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
 	kvmclock_reset(vcpu);
 
 	static_call(kvm_x86_vcpu_free)(vcpu);
-- 
2.25.1



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:19       ` Paolo Bonzini
  2021-11-02 17:26         ` David Woodhouse
  2021-11-02 17:36         ` [PATCH v3] " David Woodhouse
@ 2021-11-03  9:47         ` David Woodhouse
  2021-11-03 12:35           ` Paolo Bonzini
  2 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-03  9:47 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed



On 2 November 2021 17:19:34 GMT, Paolo Bonzini <pbonzini@redhat.com> wrote:
>On 02/11/21 18:11, David Woodhouse wrote:
>> On Tue, 2021-11-02 at 18:01 +0100, Paolo Bonzini wrote:
>>> On 02/11/21 17:38, David Woodhouse wrote:
>>>> This kind of makes a mockery of this
>>>> repeated map/unmap dance which I thought was supposed to avoid pinning
>>>> the page
>>>
>>> The map/unmap dance is supposed to catch the moment where you'd look at
>>> a stale cache, by giving the non-atomic code a chance to update the
>>> gfn->pfn mapping.
>>>
>> 
>> It might have *chance* to do so, but it doesn't actually do it.
>> 
>> As noted, a GFN→PFN mapping is really a GFN→HVA→PFN mapping. And the
>> non-atomic code *does* update the GFN→HVA part of that, correctly
>> looking at the memslots generation etc..
>> 
>> But it pays absolutely no attention to the *second* part, and assumes
>> that the HVA→PFN mapping in the userspace page tables will never
>> change.
>> 
>> Which isn't necessarily true, even if the underlying physical page *is*
>> pinned to avoid most cases (ksm, swap, etc.) of the *kernel* changing
>> it. Userspace still can.
>
>Yes, I agree.  What I am saying is that:
>
>- the map/unmap dance is not (entirely) about whether to pin the page
>
>- the map/unmap API is not a bad API, just an incomplete implementation
>
>And I think the above comment confuses both points above.


Sorry, it took me a while to realise that by "above comment" you mean the original commit comment (which you want me to reword) instead of just what I'd said in my previous email. How about this version? If it's OK like this then I can resubmit later today when I get back to a proper keyboard.


In commit b043138246a4 ("x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed") we switched to using a gfn_to_pfn_cache for accessing the guest steal time structure in order to allow for an atomic xchg of the preempted field. This has a couple of problems.

Firstly, kvm_map_gfn() doesn't work at all for IOMEM pages when the atomic flag is set, which it is in kvm_steal_time_set_preempted(). So a guest vCPU using an IOMEM page for its steal time would never have its preempted field set.

Secondly, the gfn_to_pfn_cache is not invalidated in all cases where it should have been. There are two stages to the GFN → PFN conversion; first the GFN is converted to a userspace HVA, and then that HVA is looked up in the process page tables to find the underlying host PFN. Correct invalidation of the latter would require being hooked up to the MMU notifiers, but that doesn't happen — so it just keeps mapping and unmapping the *wrong* PFN after the userspace page tables change.

In the !IOMEM case at least the stale page *is* pinned all the time it's cached, so it won't be freed and reused by anyone else while still receiving the steal time updates.

To support Xen event channel delivery I will be fixing this up and using the MMU notifiers to mark the mapping invalid at appropriate times — giving us a way to use kvm_map_gfn() safely with an atomic fast path via the kernel mapping, and a slow fallback path for when the mapping needs to be refreshed.

But for steal time reporting there's no point in a kernel mapping of it anyway, when in all cases we care about, we have a perfectly serviceable (and tautologically not stale) userspace HVA for it. We just need to implement the atomic xchg on the userspace address with appropriate exception handling, which is fairly trivial.

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-03  9:47         ` [PATCH v2] " David Woodhouse
@ 2021-11-03 12:35           ` Paolo Bonzini
  2021-11-03 12:56             ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-03 12:35 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/3/21 10:47, David Woodhouse wrote:
> Sorry, it took me a while to realise that by "above comment" you mean
> the original commit comment (which you want me to reword) instead of
> just what I'd said in my previous email. How about this version? If
> it's OK like this then I can resubmit later today when I get back to
> a proper keyboard.

No need to resubmit, thanks!  I'll review the code later and decide 
whether to include this in 5.16 or go for the "good" solution in 5.16 
and submit this one for 5.15 only.

Paolo



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-03 12:35           ` Paolo Bonzini
@ 2021-11-03 12:56             ` David Woodhouse
  2021-11-03 13:05               ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-03 12:56 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed



On 3 November 2021 12:35:11 GMT, Paolo Bonzini <pbonzini@redhat.com> wrote:
>On 11/3/21 10:47, David Woodhouse wrote:
>> Sorry, it took me a while to realise that by "above comment" you mean
>> the original commit comment (which you want me to reword) instead of
>> just what I'd said in my previous email. How about this version? If
>> it's OK like this then I can resubmit later today when I get back to
>> a proper keyboard.
>
>No need to resubmit, thanks!  I'll review the code later and decide 
>whether to include this in 5.16 or go for the "good" solution in 5.16 
>and submit this one for 5.15 only.

I would call this the good solution for steal time. We really do always have a userspace HVA for that when it matters, and we should use it.

For Xen event channel delivery we have to do it from hardware interrupts under arbitrary current->mm and we need a kernel mapping, and we need the MMU notifiers and all that stuff. But for every mapping we do that way, we need extra checks in the MMU notifiers.

For steal time there's just no need.

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-03 12:56             ` David Woodhouse
@ 2021-11-03 13:05               ` Paolo Bonzini
  2021-11-03 13:23                 ` David Woodhouse
  2021-11-03 13:34                 ` David Woodhouse
  0 siblings, 2 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-03 13:05 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/3/21 13:56, David Woodhouse wrote:
>> No need to resubmit, thanks!  I'll review the code later and
>> decide whether to include this in 5.16 or go for the "good"
>> solution in 5.16 and submit this one for 5.15 only.
> I would call this the good solution for steal time. We really do
> always have a userspace HVA for that when it matters, and we should
> use it.
> 
> For Xen event channel delivery we have to do it from hardware
> interrupts under arbitrary current->mm and we need a kernel mapping,
> and we need the MMU notifiers and all that stuff. But for every
> mapping we do that way, we need extra checks in the MMU notifiers.
> 
> For steal time there's just no need.

Yes, but doing things by hand like this is slightly harder to get right, 
between the asm and the manual user_access_{begin,end}.

The good solution would be to handle the remapping of _all_ gfn-to-pfn 
caches from the MMU notifiers, so that you can still do map/unmap, keep 
the code simple, and get for free the KVM-specific details such as 
marking the gfn as dirty.

When I was working on it before, I got stuck with wanting to do it not 
just good but perfect, including the eVMCS page in it.  But that makes 
no sense because really all that needs to be fixed is the _current_ 
users of the gfn-to-pfn cache.

Paolo



* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-03 13:05               ` Paolo Bonzini
@ 2021-11-03 13:23                 ` David Woodhouse
  2021-11-03 13:34                 ` David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-03 13:23 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed



On 3 November 2021 13:05:11 GMT, Paolo Bonzini <pbonzini@redhat.com> wrote:
>On 11/3/21 13:56, David Woodhouse wrote:
>>> No need to resubmit, thanks!  I'll review the code later and
>>> decide whether to include this in 5.16 or go for the "good"
>>> solution in 5.16 and submit this one for 5.15 only.
>> I would call this the good solution for steal time. We really do
>> always have a userspace HVA for that when it matters, and we should
>> use it.
>> 
>> For Xen event channel delivery we have to do it from hardware
>> interrupts under arbitrary current->mm and we need a kernel mapping,
>> and we need the MMU notifiers and all that stuff. But for every
>> mapping we do that way, we need extra checks in the MMU notifiers.
>> 
>> For steal time there's just no need.
>
>Yes, but doing things by hand that it is slightly harder to get right, 
>between the asm and the manual user_access_{begin,end}.

Yes. Before I embarked on this I did have a fantasy that I could just use the futex asm helpers which already do much of that, but it didn't turn out that way. But once that part is done it shouldn't need to be touched again. It's only for the *locked* accesses like bit set and xchg; for anything else the normal user access works.
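
Something like this is all the non-locked case needs (a minimal sketch
with an assumed function name, not the exact hunk from the patch):

#include <linux/uaccess.h>
#include <asm/kvm_para.h>	/* struct kvm_steal_time */

static int sketch_clear_preempted(struct kvm_steal_time __user *st)
{
	if (!user_access_begin(st, sizeof(*st)))
		return -EFAULT;

	/* Plain store; no hand-rolled asm required. */
	unsafe_put_user(0, &st->preempted, err);
	user_access_end();
	return 0;

 err:
	user_access_end();
	return -EFAULT;
}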

>The good solution would be to handle the remapping of _all_ gfn-to-pfn 
>caches from the MMU notifiers, so that you can still do map/unmap, keep 
>the code simple, and get for free the KVM-specific details such as 
>marking the gfn as dirty.
>
>When I was working on it before, I got stuck with wanting to do it not 
>just good but perfect, including the eVMCS page in it.  But that makes 
>no sense because really all that needs to be fixed is the _current_ 
>users of the gfn-to-pfn cache.

Yeah. Well, let's take a look at the Xen evtchn stuff and heckle the new rwlock I used, and the way we have to hold that lock *while* doing the access. Then we can ponder whether we want to offer that as a "generic" thing for providing a kernel mapping, and have the MMU notifiers walk a list of them to check for invalidation. Or whether we can actually use an HVA after all (and in at least some cases we really can).

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


* Re: [PATCH v2] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-03 13:05               ` Paolo Bonzini
  2021-11-03 13:23                 ` David Woodhouse
@ 2021-11-03 13:34                 ` David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-03 13:34 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed



On 3 November 2021 13:05:11 GMT, Paolo Bonzini <pbonzini@redhat.com> wrote:
>On 11/3/21 13:56, David Woodhouse wrote:
>>> No need to resubmit, thanks!  I'll review the code later and
>>> decide whether to include this in 5.16 or go for the "good"
>>> solution in 5.16 and submit this one for 5.15 only.
>> I would call this the good solution for steal time. We really do
>> always have a userspace HVA for that when it matters, and we should
>> use it.
>> 
>> For Xen event channel delivery we have to do it from hardware
>> interrupts under arbitrary current->mm and we need a kernel mapping,
>> and we need the MMU notifiers and all that stuff. But for every
>> mapping we do that way, we need extra checks in the MMU notifiers.
>> 
>> For steal time there's just no need.
>
>Yes, but doing things by hand that it is slightly harder to get right, 
>between the asm and the manual user_access_{begin,end}.

Right. When I embarked on this I had a fantasy that I could just use the futex asm helpers which do most of it for us. But it didn't turn out that way. On the other hand, it's only needed for the *atomic* accesses (xchg, bit set) and most of the time we can just use normal uaccess stuff (and remember to mark the gfn dirty!)

>The good solution would be to handle the remapping of _all_ gfn-to-pfn 
>caches from the MMU notifiers, so that you can still do map/unmap, keep 
>the code simple, and get for free the KVM-specific details such as 
>marking the gfn as dirty.
>
>When I was working on it before, I got stuck with wanting to do it not 
>just good but perfect, including the eVMCS page in it.  But that makes 
>no sense because really all that needs to be fixed is the _current_ 
>users of the gfn-to-pfn cache.

Yeah.

Well, let's take a look at how I've done it for the Xen event channel delivery, in particular the rwlock that has to be held *while* doing the access via the mapped kernel address. Then we can ponder whether we want to offer something along those lines as a generic facility.
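
For reference, the shape of it is roughly this (assumed structure and
function names, not the actual evtchn code): the fast path does the
access under the read lock and bails out if the mapping is invalid,
while the MMU notifier takes the lock for write to invalidate.

#include <linux/bitops.h>
#include <linux/spinlock.h>

struct pfncache_sketch {
	rwlock_t lock;
	void *khva;	/* kernel mapping of the cached page */
	bool valid;
};

/* Fast path, callable from delivery context (simplified: hard IRQ
 * context would want the trylock / irq-safe variants). */
static bool sketch_try_deliver(struct pfncache_sketch *pc, unsigned int port)
{
	bool ok = false;

	read_lock(&pc->lock);
	if (pc->valid) {
		set_bit(port, (unsigned long *)pc->khva);
		ok = true;
	}
	read_unlock(&pc->lock);
	return ok;	/* on failure, fall back to a sleepable refresh */
}

/* Called from the MMU notifier when the backing page may change. */
static void sketch_invalidate(struct pfncache_sketch *pc)
{
	write_lock(&pc->lock);
	pc->valid = false;
	write_unlock(&pc->lock);
}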

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.


* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-02 17:36         ` [PATCH v3] " David Woodhouse
@ 2021-11-11 13:23           ` Paolo Bonzini
  2021-11-12  8:28             ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-11 13:23 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/2/21 18:36, David Woodhouse wrote:

> +		asm volatile("1:\t" LOCK_PREFIX "xchgb %0, %2\n"
> +			     "\txor %1, %1\n"
> +			     "2:\n"
> +			     "\t.section .fixup,\"ax\"\n"
> +			     "3:\tmovl %3, %1\n"
> +			     "\tjmp\t2b\n"
> +			     "\t.previous\n"
> +			     _ASM_EXTABLE_UA(1b, 3b)
> +			     : "=r" (st_preempted),
> +			       "=r" (err)
> +			     : "m" (st->preempted),
> +			       "i" (-EFAULT),
> +			       "0" (st_preempted));

Since Peter is removing custom fixups, I'm going for code that is
slightly suboptimal (though just by one extra instruction) but doesn't
interfere with him.

Also, xchg doesn't need a lock prefix.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3301,21 +3301,15 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
  	 */
  	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
  		u8 st_preempted = 0;
-		int err;
+		int err = -EFAULT;
  
-		asm volatile("1:\t" LOCK_PREFIX "xchgb %0, %2\n"
-			     "\txor %1, %1\n"
+		asm volatile("1: xchgb %0, %2\n"
+			     "xor %1, %1\n"
  			     "2:\n"
-			     "\t.section .fixup,\"ax\"\n"
-			     "3:\tmovl %3, %1\n"
-			     "\tjmp\t2b\n"
-			     "\t.previous\n"
-			     _ASM_EXTABLE_UA(1b, 3b)
-			     : "=r" (st_preempted),
-			       "=r" (err)
-			     : "m" (st->preempted),
-			       "i" (-EFAULT),
-			       "0" (st_preempted));
+			     _ASM_EXTABLE_UA(1b, 2b)
+			     : "+r" (st_preempted),
+			       "+&r" (err)
+			     : "m" (st->preempted));
  		if (err)
  			goto out;
  

Queued with these changes.

Paolo



* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-11 13:23           ` Paolo Bonzini
@ 2021-11-12  8:28             ` David Woodhouse
  2021-11-12  9:31               ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-12  8:28 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed


On Thu, 2021-11-11 at 14:23 +0100, Paolo Bonzini wrote:
> Queued with these changes.

Thanks. I note...

    [I didn't entirely agree with David's assessment of the
     usefulness of the gfn_to_pfn cache, and integrated the outcome
     of the discussion in the above commit message. - Paolo]

 ... the key change being the 'Until' in:

    Until the gfn_to_pfn cache handles the remapping automatically by
    integrating with the MMU notifiers, we might as well not get a
    kernel mapping of it...

I do not recall that we'd actually reached a conclusion that we *will*
make the gfn_to_pfn cache generally usable in that fashion. The latest
I knew of that discussion was my message at 
https://lore.kernel.org/kvm/55a5d4e3fbd29dd55e276b97eeaefd0411b3290b.camel@infradead.org/
in which I said I'd be a whole lot happier with that if we could do it
with RCU instead of an rwlock — but I don't think we can because we'd
need to call synchronize_srcu() in the MMU notifier callback that might
not be permitted to sleep?

I'm also slightly less comfortable with having the MMU notifier work
through an arbitrary *list* of gfn_to_pfn caches that it potentially
needs to invalidate, but that is very much a minor concern compared
with the first.

I started looking through the nested code which is the big user of this
facility. The important part of the gfn_to_pfn mapping as I've used it
for Xen event channel delivery is the fast path in the common case,
falling back to a slow path that needs to sleep, to revalidate the
mapping. That fast vs. slow path (with a workqueue) already existed for
irqfd delivery and I just needed to hook into it in the right places.

I didn't see anything in nested code that would benefit from that same
setup, and AFAICT it should all be running with current->mm == kvm->mm
so surely it ought to be able to just access things using the userspace
HVA and sleep if necessary?

(There's an *entirely* gratuitous one in nested_cache_shadow_vmcs12()
which does a map/memcpy/unmap that really ought to be kvm_read_guest().
I'll send a patch for that shortly)
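
Something like this, that is (a sketch with assumed argument names,
not the actual patch):

#include <linux/kvm_host.h>

/* Read the shadow VMCS12 straight through the guest's memslots via the
 * userspace HVA (which may sleep), instead of map/memcpy/unmap. */
static int sketch_copy_shadow_vmcs12(struct kvm_vcpu *vcpu, gpa_t vmcs_gpa,
				     void *dst, unsigned long size)
{
	return kvm_read_guest(vcpu->kvm, vmcs_gpa, dst, size);
}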




* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12  8:28             ` David Woodhouse
@ 2021-11-12  9:31               ` Paolo Bonzini
  2021-11-12  9:54                 ` David Woodhouse
  2021-11-12 19:44                 ` [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status David Woodhouse
  0 siblings, 2 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-12  9:31 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/12/21 09:28, David Woodhouse wrote:
> I do not recall that we'd actually reached a conclusion that we *will*
> make the gfn_to_pfn cache generally usable in that fashion. The latest
> I knew of that discussion was my message at
> https://lore.kernel.org/kvm/55a5d4e3fbd29dd55e276b97eeaefd0411b3290b.camel@infradead.org/
> in which I said I'd be a whole lot happier with that if we could do it
> with RCU instead of an rwlock — but I don't think we can because we'd
> need to call synchronize_srcu() in the MMU notifier callback that might
> not be permitted to sleep?

Why do you have a problem with the rwlock?  If it's per-cache, and it's 
mostly taken within vCPU context (with the exception of Xen), contention 
should be nonexistent.

> I'm also slightly less comfortable with having the MMU notifier work
> through an arbitrary *list* of gfn_to_pfn caches that it potentially
> needs to invalidate, but that is very much a minor concern compared
> with the first.
> 
> I started looking through the nested code which is the big user of this
> facility.

Yes, that's also where I got stuck in my first attempt a few months ago. 
  I agree that it can be changed to use gfn-to-hva caches, except for 
the vmcs12->posted_intr_desc_addr and vmcs12->virtual_apic_page_addr.

Paolo

> The important part of the gfn_to_pfn mapping as I've used it
> for Xen event channel delivery is the fast path in the common case,
> falling back to a slow path that needs to sleep, to revalidate the
> mapping. That fast vs. slow path (with a workqueue) already existed for
> irqfd delivery and I just needed to hook into it in the right places.
> 
> I didn't see anything in nested code that would benefit from that same
> setup, and AFAICT it should all be running with current->mm == kvm->mm
> so surely it ought to be able to just access things using the userspace
> HVA and sleep if necessary?
> 
> (There's an *entirely* gratuitous one in nested_cache_shadow_vmcs12()
> which does a map/memcpy/unmap that really ought to be kvm_read_guest().
> I'll send a patch for that shortly)
> 
> 



* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12  9:31               ` Paolo Bonzini
@ 2021-11-12  9:54                 ` David Woodhouse
  2021-11-12 10:49                   ` Paolo Bonzini
  2021-11-12 19:44                 ` [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status David Woodhouse
  1 sibling, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-12  9:54 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed


On Fri, 2021-11-12 at 10:31 +0100, Paolo Bonzini wrote:
> On 11/12/21 09:28, David Woodhouse wrote:
> > I do not recall that we'd actually reached a conclusion that we *will*
> > make the gfn_to_pfn cache generally usable in that fashion. The latest
> > I knew of that discussion was my message at
> > https://lore.kernel.org/kvm/55a5d4e3fbd29dd55e276b97eeaefd0411b3290b.camel@infradead.org/
> > 
> > in which I said I'd be a whole lot happier with that if we could do it
> > with RCU instead of an rwlock — but I don't think we can because we'd
> > need to call synchronize_srcu() in the MMU notifier callback that might
> > not be permitted to sleep?
> 
> Why do you have a problem with the rwlock?  If it's per-cache, and it's 
> mostly taken within vCPU context (with the exception of Xen), contention 
> should be nonexistent.

My problem with the using the rwlock instead of RCU is not the
contention, it's...

> > I'm also slightly less comfortable with having the MMU notifier work
> > through an arbitrary *list* of gfn_to_pfn caches that it potentially
> > needs to invalidate, but that is very much a minor concern compared
> > with the first.
> > 
> > I started looking through the nested code which is the big user of this
> > facility.
> 
> Yes, that's also where I got stuck in my first attempt a few months ago. 
>   I agree that it can be changed to use gfn-to-hva caches, except for 
> the vmcs12->posted_intr_desc_addr and vmcs12->virtual_apic_page_addr.
> 

... that anything accessing these will *still* need to do so in atomic
context. There's an atomic access which might fail, and then you fall
back to a context in which you can sleep to refresh the mapping, and
you *still* need to perform the actual access with the spinlock held to
protect against concurrent invalidation.


So let's take a look... for posted_intr_desc_addr, that host physical
address is actually written to the VMCS02, isn't it? 

Thinking about the case where the target page is being invalidated
while the vCPU is running... surely in that case the only 'correct'
solution is that the vCPU needs to be kicked out of non-root mode
before the invalidate_range() notifier completes?

That would have worked nicely if the MMU notifier could call
synchronize_srcu() on invalidation. Can it kick the vCPU and wait for
it to exit though?

Or maybe there's a variant where we only have to ensure that no posted
interrupts will actually be *delivered* after the invalidation?

Don't get me wrong, a big part of me *loves* the idea that the hairiest
part of my Xen event channel delivery is actually a bug fix that we
need in the kernel anyway, and then the rest of it is simple and
uncontentious.

I'm just not quite sure I see how to provide a generic mechanism that
actually *fixes* the bugs that already exist elsewhere — at least not
without them having their own special cases for invalidation anyway.


(ISTR the virtual apic page is a bit different because it's only an
*address* and it doesn't even have to be backed by real memory at the
corresponding HPA? Otherwise it's basically the same issue?)



^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12  9:54                 ` David Woodhouse
@ 2021-11-12 10:49                   ` Paolo Bonzini
  2021-11-12 11:29                     ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-12 10:49 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/12/21 10:54, David Woodhouse wrote:
>>> I'm also slightly less comfortable with having the MMU notifier work
>>> through an arbitrary *list* of gfn_to_pfn caches that it potentially
>>> needs to invalidate, but that is very much a minor concern compared
>>> with the first.
>>>
>>> I started looking through the nested code which is the big user of this
>>> facility.
>>
>> Yes, that's also where I got stuck in my first attempt a few months ago.
>>    I agree that it can be changed to use gfn-to-hva caches, except for
>> the vmcs12->posted_intr_desc_addr and vmcs12->virtual_apic_page_addr.
> 
> ... that anything accessing these will *still* need to do so in atomic
> context. There's an atomic access which might fail, and then you fall
> back to a context in which you can sleep to refresh the mapping. and
> you *still* need to perform the actual access with the spinlock held to
> protect against concurrent invalidation.
> 
> So let's take a look... for posted_intr_desc_addr, that host physical
> address is actually written to the VMCS02, isn't it?
> 
> Thinking about the case where the target page is being invalidated
> while the vCPU is running... surely in that case the only 'correct'
> solution is that the vCPU needs to be kicked out of non-root mode
> before the invalidate_range() notifier completes?

Yes.

> That would have worked nicely if the MMU notifier could call
> synchronize_srcu() on invalidation. Can it kick the vCPU and wait for
> it to exit though?

Yes, there's kvm_make_all_cpus_request (see 
kvm_arch_mmu_notifier_invalidate_range).  It can sleep, which is 
theoretically wrong---but in practice non-blockable invalidations only 
occur from the OOM reaper, so no CPU can be running.  If we care, we can 
return early from kvm_arch_mmu_notifier_invalidate_range for 
non-blockable invalidations.

> Don't get me wrong, a big part of me *loves* the idea that the hairiest
> part of my Xen event channel delivery is actually a bug fix that we
> need in the kernel anyway, and then the rest of it is simple and
> uncontentious.
> 
> (ISTR the virtual apic page is a bit different because it's only an
> *address* and it doesn't even have to be backed by real memory at the
> corresponding HPA? Otherwise it's basically the same issue?)

We do back it by real memory anyway, so it's the same.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12 10:49                   ` Paolo Bonzini
@ 2021-11-12 11:29                     ` David Woodhouse
  2021-11-12 12:27                       ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-12 11:29 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 1849 bytes --]

On Fri, 2021-11-12 at 11:49 +0100, Paolo Bonzini wrote:
> > That would have worked nicely if the MMU notifier could call
> > synchronize_srcu() on invalidation. Can it kick the vCPU and wait for
> > it to exit though?
> 
> Yes, there's kvm_make_all_cpus_request (see 
> kvm_arch_mmu_notifier_invalidate_range).  It can sleep, which is 
> theoretically wrong---but in practice non-blockable invalidations only 
> occur from the OOM reaper, so no CPU can be running.  If we care, we can 
> return early from kvm_arch_mmu_notifier_invalidate_range for 
> non-blockable invalidations.

OK, so these don't actually want any of that stuff with the rwlock and
the invalidation setting the pointer to KVM_UNMAPPED_PAGE that I did in
https://lore.kernel.org/kvm/20211101190314.17954-6-dwmw2@infradead.org/
for Xen event channels.

It looks like they want their own way of handling it; if the GPA being
invalidated matches posted_intr_desc_addr or virtual_apic_page_addr
then the MMU notifier just needs to call kvm_make_all_cpus_request()
with some suitable checking/WARN magic around the "we will never need
to sleep when we shouldn't" assertion that you made above.

(And a little bit more thinking about ordering for the case of
concurrent invalidation occurring while we are entering the L2 guest,
but I think that works out OK.)
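
Concretely I'm picturing the notifier side ending up vaguely like this
(sketch only: nested_caches_in_range() is a made-up helper for "does this
range overlap the cached posted interrupt descriptor or virtual APIC page
of any vCPU", and KVM_REQ_GPC_INVALIDATE is just a placeholder name for
the new request):

static void nested_invalidate_guest_mapped_pages(struct kvm *kvm,
						 unsigned long start,
						 unsigned long end)
{
	if (!nested_caches_in_range(kvm, start, end))
		return;

	/*
	 * KVM_REQUEST_WAIT means this can sleep; as you say above, that
	 * should only ever matter for the OOM reaper case, where no vCPU
	 * can be running anyway. A WARN for that assumption goes here.
	 */
	kvm_make_all_cpus_request(kvm, KVM_REQ_GPC_INVALIDATE);
}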



We *could* use the rwlock thing for steal time reporting, but I still
don't really think it's worth doing so. Again, if it was truly going to
be a generic mechanism that would solve lots of other problems, I'd be
up for it. But if steal time would be the *only* other user of a
generic version of the rwlock thing, that just seems like
overengineering. I'm still mostly inclined to stand by my original
observation that it has a perfectly serviceable HVA that it can use
instead.




^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12 11:29                     ` David Woodhouse
@ 2021-11-12 12:27                       ` Paolo Bonzini
  2021-11-12 13:28                         ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-12 12:27 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/12/21 12:29, David Woodhouse wrote:
> We *could* use the rwlock thing for steal time reporting, but I still
> don't really think it's worth doing so. Again, if it was truly going to
> be a generic mechanism that would solve lots of other problems, I'd be
> up for it. But if steal time would be the *only* other user of a
> generic version of the rwlock thing, that just seems like
> overengineering. I'm still mostly inclined to stand by my original
> observation that it has a perfectly serviceable HVA that it can use
> instead.

Well yeah, I'd have to see the code to decide.  But maybe this is where
we disagree, what I want from generic KVM is a nice and easy-to-use API.
I only care to a certain extent how messy it is inside, because a nice
generic API means not reinventing the wheel across architectures.

That said, I think that your patch is much more complicated than it
should be, because it hooks in the wrong place.  There are two cases:

1) for kmap/kunmap, the invalidate_range() notifier is the right place
to remove any references taken after invalidate_range_start().  For the
APIC access page it needs a kvm_make_all_cpus_request, but for the
shinfo page it can be a simple back-to-back write_lock/write_unlock
(a super-poor-man RCU, if you wish).  And it can be extended to a
per-cache rwlock.

2) for memremap/memunmap, all you really care about is reacting to
changes in the memslots, so the MMU notifier integration has nothing
to do.  You still need to call the same hook as
kvm_mmu_notifier_invalidate_range() when memslots change, so that
the update is done outside atomic context.

So as far as short-term uses of the cache are concerned, all it
takes (if I'm right...) is a list_for_each_entry in
kvm_mmu_notifier_invalidate_range, visiting the list of
gfn-to-pfn caches and lock-unlock each cache's rwlock.  Plus
a way to inform the code of memslot changes before any atomic
code tries to use an invalid cache.
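
In pseudo-C the walk would be something like this (every name here is a
placeholder, not existing code):

static void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
					unsigned long end)
{
	struct gfn_to_pfn_cache *gpc;

	spin_lock(&kvm->gpc_lock);
	list_for_each_entry(gpc, &kvm->gpc_list, list) {
		if (gpc->uhva < start || gpc->uhva >= end)
			continue;

		/*
		 * Back-to-back lock/unlock: any short-term user doing its
		 * access under read_lock has finished by the time we return
		 * from the notifier.  (The kvm_make_all_cpus_request case
		 * is the extra flag I describe below.)
		 */
		write_lock(&gpc->lock);
		write_unlock(&gpc->lock);
	}
	spin_unlock(&kvm->gpc_lock);
}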

> It looks like they want their own way of handling it; if the GPA being
> invalidated matches posted_intr_desc_addr or virtual_apic_page_addr
> then the MMU notifier just needs to call kvm_make_all_cpus_request()
> with some suitable checking/WARN magic around the "we will never need
> to sleep when we shouldn't" assertion that you made above.

I was thinking of an extra flag to decide whether (in addition
to the write_lock/write_unlock) the MMU notifier also needs to do
the kvm_make_all_cpus_request:

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b213ca966d41..f134db24b973 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -309,8 +309,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
  		kvm_release_page_clean(vmx->nested.apic_access_page);
  		vmx->nested.apic_access_page = NULL;
  	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+	vmx->nested.virtual_apic_map.guest_uses_pa = false;
+	vmx->nested.pi_desc_map.guest_uses_pa = false;
  	vmx->nested.pi_desc = NULL;
  
  	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -3183,6 +3184,10 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
  			 */
  			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
  		}
+		// on invalidation, this causes kvm_make_all_cpus_request
+		// and also dirties the page
+		map->guest_uses_pa = true;
+		kvm_vcpu_unmap(vcpu, map);
  	}
  
  	if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -3204,6 +3207,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
  			vmx->nested.pi_desc = NULL;
  			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
  		}
+		map->guest_uses_pa = true;
+		kvm_vcpu_unmap(vcpu, map);
  	}
  	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
  		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
@@ -4559,8 +4564,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
  		kvm_release_page_clean(vmx->nested.apic_access_page);
  		vmx->nested.apic_access_page = NULL;
  	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
-	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+	vmx->nested.virtual_apic_map.guest_uses_pa = false;
+	vmx->nested.pi_desc_map.guest_uses_pa = false;
  	vmx->nested.pi_desc = NULL;
  
  	if (vmx->nested.reload_vmcs01_apic_access_page) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3b09ac93c86e..342f12321df7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6185,6 +6185,8 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
  
  	/* Defer reload until vmcs01 is the current VMCS. */
  	if (is_guest_mode(vcpu)) {
+		// TODO...
+		nested_vmx_update_vmcs02_phys_addrs(vcpu);
  		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
  		return;
  	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91f723f37b22..6d0b7d2f1465 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9670,9 +9670,6 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
  
  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
  {
-	if (!lapic_in_kernel(vcpu))
-		return;
-
  	if (!kvm_x86_ops.set_apic_access_page_addr)
  		return;
  
With this infrastructure the APIC access page can be changed to a
gfn_to_pfn cache along the same lines (and whose guest_uses_pa is
always true).

Paolo


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12 12:27                       ` Paolo Bonzini
@ 2021-11-12 13:28                         ` David Woodhouse
  2021-11-12 14:56                           ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-12 13:28 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 5057 bytes --]

On Fri, 2021-11-12 at 13:27 +0100, Paolo Bonzini wrote:
> On 11/12/21 12:29, David Woodhouse wrote:
> > We *could* use the rwlock thing for steal time reporting, but I still
> > don't really think it's worth doing so. Again, if it was truly going to
> > be a generic mechanism that would solve lots of other problems, I'd be
> > up for it. But if steal time would be the *only* other user of a
> > generic version of the rwlock thing, that just seems like
> > overengineering. I'm still mostly inclined to stand by my original
> > observation that it has a perfectly serviceable HVA that it can use
> > instead.
> 
> Well yeah, I'd have to see the code to decide.  But maybe this is where
> we disagree, what I want from generic KVM is a nice and easy-to-use API.
> I only care to a certain extent how messy it is inside, because a nice
> generic API means not reinventing the wheel across architectures.

I'm happy enough with that if I understand how to make it make sense :)

> That said, I think that your patch is much more complicated than it
> should be, because it hooks in the wrong place.  There are two cases:
> 
> 1) for kmap/kunmap, the invalidate_range() notifier is the right place
> to remove any references taken after invalidate_range_start().

Right, I added it in invalidate_range_start() because it was *simpler*
that way (I get given a GFN not an HVA) but that's wrong; it really
does need to be in invalidate_range(), as you say.

>   For the
> APIC access page it needs a kvm_make_all_cpus_request, but for the
> shinfo page it can be a simple back-to-back write_lock/write_unlock
> (a super-poor-man RCU, if you wish).  And it can be extended to a
> per-cache rwlock.

A back-to-back write_lock/write_unlock *without* setting the address to
KVM_UNMAPPED_PAGE? I'm not sure I see how that protects the IRQ
delivery from accessing the (now stale) physical page after the MMU
notifier has completed? Not unless it's going to call hva_to_pfn again
for itself under the read_lock, every time it delivers an IRQ?

My version marked it the PFN dirty and invalidated the hva pointer so
that nothing will touch it again — but left it actually *mapped*
because we can't necessarily sleep to unmap. But that's all it did in
the notifier:

+	if (static_branch_unlikely(&kvm_xen_enabled.key)) {
+		write_lock(&kvm->arch.xen.shinfo_lock);
+
+		if (kvm->arch.xen.shared_info &&
+		    kvm->arch.xen.shinfo_gfn >= range->start &&
+		    kvm->arch.xen.shinfo_cache.gfn < range->end) {
+			/*
+			 * If kvm_xen_shared_info_init() had *finished* mapping the
+			 * page and assigned the pointer for real, then mark the page
+			 * dirty now instead of via the eventual cache teardown.
+			 */
+			if (kvm->arch.xen.shared_info != KVM_UNMAPPED_PAGE) {
+				kvm_set_pfn_dirty(kvm->arch.xen.shinfo_cache.pfn);
+				kvm->arch.xen.shinfo_cache.dirty = false;
+			}
+
+			kvm->arch.xen.shared_info = NULL;
+		}
+
+		write_unlock(&kvm->arch.xen.shinfo_lock);
+	}
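
For completeness, the fast-path delivery side pairs with that roughly as
follows (paraphrased from memory rather than the exact code in the branch,
with the 2level port handling boiled down to a single set_bit):

static int evtchn_fast_deliver(struct kvm *kvm, int port)
{
	struct shared_info *shinfo;
	int ret = 0;

	read_lock(&kvm->arch.xen.shinfo_lock);

	shinfo = kvm->arch.xen.shared_info;
	if (!shinfo || shinfo == KVM_UNMAPPED_PAGE) {
		/* Invalidated, or not yet mapped: take the slow path. */
		ret = -EWOULDBLOCK;
		goto out;
	}

	/*
	 * The write is done with the read lock held, so the notifier
	 * above can't complete while we're still touching the page.
	 */
	set_bit(port, (unsigned long *)shinfo->evtchn_pending);

out:
	read_unlock(&kvm->arch.xen.shinfo_lock);
	return ret;
}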



> 2) for memremap/memunmap, all you really care about is reacting to
> changes in the memslots, so the MMU notifier integration has nothing
> to do.  You still need to call the same hook as
> kvm_mmu_notifier_invalidate_range() when memslots change, so that
> the update is done outside atomic context.

Hm, we definitely *do* care about reacting to MMU notifiers in this
case too. Userspace can do memory overcommit / ballooning etc.
*without* changing the memslots, and only mmap/munmap/userfault_fd on
the corresponding HVA ranges.

> So as far as short-term uses of the cache are concerned, all it
> takes (if I'm right...) is a list_for_each_entry in
> kvm_mmu_notifier_invalidate_range, visiting the list of
> gfn-to-pfn caches and lock-unlock each cache's rwlock.  Plus
> a way to inform the code of memslot changes before any atomic
> code tries to use an invalid cache.

I do the memslot part already. It basically ends up being 

	if (!map->hva || map->hva == KVM_UNMAPPED_PAGE ||
	    slots->generation != ghc->generation ||
	    kvm_is_error_hva(ghc->hva) || !ghc->memslot) {
		ret = -EWOULDBLOCK; /* Try again when you can sleep */
		goto out_unlock;
	}


> > It looks like they want their own way of handling it; if the GPA being
> > invalidated matches posted_intr_desc_addr or virtual_apic_page_addr
> > then the MMU notifier just needs to call kvm_make_all_cpus_request()
> > with some suitable checking/WARN magic around the "we will never need
> > to sleep when we shouldn't" assertion that you made above.
> 
> I was thinking of an extra flag to decide whether (in addition
> to the write_lock/write_unlock) the MMU notifier also needs to do
> the kvm_make_all_cpus_request:

Yeah, OK.

I think we want to kill the struct kvm_host_map completely, merge its
extra 'hva' and 'page' fields into the (possibly renamed)
gfn_to_pfn_cache along with your 'guest_uses_pa' flag, and take it from
there.

I'll go knock up something that we can heckle further...
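
Roughly along these lines, as a strawman (every field here is up for
heckling, nothing is final):

struct gfn_to_pfn_cache {
	u64 generation;		/* memslot generation at last refresh */
	gpa_t gpa;
	unsigned long uhva;	/* userspace HVA for the gpa */
	kvm_pfn_t pfn;
	void *khva;		/* kernel mapping (was kvm_host_map.hva) */
	struct page *page;	/* was kvm_host_map.page; NULL for IOMEM */
	struct kvm_vcpu *vcpu;	/* vCPU to kick if guest_uses_pa */
	struct list_head list;	/* for the MMU notifier to walk */
	rwlock_t lock;
	bool guest_uses_pa;	/* PFN is handed to the guest, e.g. VMCS02 */
	bool dirty;
	bool valid;
};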



^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12 13:28                         ` David Woodhouse
@ 2021-11-12 14:56                           ` Paolo Bonzini
  2021-11-12 15:27                             ` David Woodhouse
  2021-11-15 16:47                             ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
  0 siblings, 2 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-12 14:56 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/12/21 14:28, David Woodhouse wrote:
> A back-to-back write_lock/write_unlock *without* setting the address to
> KVM_UNMAPPED_PAGE? I'm not sure I see how that protects the IRQ
> delivery from accessing the (now stale) physical page after the MMU
> notifier has completed? Not unless it's going to call hva_to_pfn again
> for itself under the read_lock, every time it delivers an IRQ?

Yeah, you're right, it still has to invalidate it somehow.  So
KVM_UNMAPPED_PAGE would go in the hva field of the gfn_to_pfn cache
(merged with kvm_host_map).  Or maybe one can use an invalid generation,
too.

I was under the mistaken impression that with MMU notifiers one could
make atomic kvm_vcpu_map never fail, but now I think that makes no sense;
it could always encounter stale memslots.

>> 2) for memremap/memunmap, all you really care about is reacting to
>> changes in the memslots, so the MMU notifier integration has nothing
>> to do.  You still need to call the same hook as
>> kvm_mmu_notifier_invalidate_range() when memslots change, so that
>> the update is done outside atomic context.
> 
> Hm, we definitely *do* care about reacting to MMU notifiers in this
> case too. Userspace can do memory overcommit / ballooning etc.
> *without* changing the memslots, and only mmap/munmap/userfault_fd on
> the corresponding HVA ranges.

Can it do so for VM_IO/VM_PFNMAP memory?

> I think we want to kill the struct kvm_host_map completely, merge its
> extra 'hva' and 'page' fields into the (possibly renamed)
> gfn_to_pfn_cache along with your 'guest_uses_pa' flag, and take it from
> there.

Yes, that makes sense.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12 14:56                           ` Paolo Bonzini
@ 2021-11-12 15:27                             ` David Woodhouse
  2021-11-15 16:47                             ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-12 15:27 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 1722 bytes --]

On Fri, 2021-11-12 at 15:56 +0100, Paolo Bonzini wrote:
> On 11/12/21 14:28, David Woodhouse wrote:
> > A back-to-back write_lock/write_unlock *without* setting the address to
> > KVM_UNMAPPED_PAGE? I'm not sure I see how that protects the IRQ
> > delivery from accessing the (now stale) physical page after the MMU
> > notifier has completed? Not unless it's going to call hva_to_pfn again
> > for itself under the read_lock, every time it delivers an IRQ?
> 
> Yeah, you're right, it still has to invalidate it somehow.  So
> KVM_UNMAPPED_PAGE would go in the hva field of the gfn_to_pfn cache
> (merged with kvm_host_map).  Or maybe one can use an invalid generation,
> too.

Right. Or now you have me adding *flags* anyway, so I might just use
one of those to mark it invalid.

We do need to keep the original hva pointer around to unmap it anyway,
even if it's become invalid. That's why my existing code has a separate
kvm->arch.xen.shared_info pointer in *addition* to the map->hva
pointer; the latter of which *isn't* wiped on invalidation.
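
Schematically (not the literal declarations in my tree):

	struct {
		/* What delivery dereferences; NULLed by the MMU notifier. */
		struct shared_info *shared_info;

		/* map.hva survives invalidation so teardown can still unmap. */
		struct kvm_host_map shinfo_map;

		rwlock_t shinfo_lock;
	} xen;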

> > > 2) for memremap/memunmap, all you really care about is reacting to
> > > changes in the memslots, so the MMU notifier integration has nothing
> > > to do.  You still need to call the same hook as
> > > kvm_mmu_notifier_invalidate_range() when memslots change, so that
> > > the update is done outside atomic context.
> > 
> > Hm, we definitely *do* care about reacting to MMU notifiers in this
> > case too. Userspace can do memory overcommit / ballooning etc.
> > *without* changing the memslots, and only mmap/munmap/userfault_fd on
> > the corresponding HVA ranges.
> 
> Can it do so for VM_IO/VM_PFNMAP memory?

It can, yes.



^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status
  2021-11-12  9:31               ` Paolo Bonzini
  2021-11-12  9:54                 ` David Woodhouse
@ 2021-11-12 19:44                 ` David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-12 19:44 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 3549 bytes --]

On Fri, 2021-11-12 at 10:31 +0100, Paolo Bonzini wrote:
> Yes, that's also where I got stuck in my first attempt a few months ago. 
>   I agree that it can be changed to use gfn-to-hva caches, except for 
> the vmcs12->posted_intr_desc_addr and vmcs12->virtual_apic_page_addr.

Let's start with the low-hanging fruit... what are the recommended
tests for this kibnd of thing? I don't think we have kernel self-tests
that exercise this, do we?

As before, this is at
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn


From: David Woodhouse <dwmw@amazon.co.uk>
Subject: [PATCH] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12

Using kvm_vcpu_map() for reading from the guest is entirely gratuitous,
when all we do is a single memcpy and unmap it again. Fix it up to use
kvm_read_guest()... but in fact I couldn't bring myself to do that
without also making it use a gfn_to_hva_cache for both that *and* the
copy in the other direction.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/vmx/nested.c | 24 +++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |  5 +++++
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b213ca966d41..7e2a99f435b6 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -670,33 +670,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 				       struct vmcs12 *vmcs12)
 {
-	struct kvm_host_map map;
-	struct vmcs12 *shadow;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
 
 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 	    vmcs12->vmcs_link_pointer == INVALID_GPA)
 		return;
 
-	shadow = get_shadow_vmcs12(vcpu);
-
-	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
 		return;
 
-	memcpy(shadow, map.hva, VMCS12_SIZE);
-	kvm_vcpu_unmap(vcpu, &map, false);
+	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+			      VMCS12_SIZE);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 					      struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
 
 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 	    vmcs12->vmcs_link_pointer == INVALID_GPA)
 		return;
 
-	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
-			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+		return;
+
+	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+			       VMCS12_SIZE);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a4ead6023133..cdadbd5dc0ca 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -141,6 +141,11 @@ struct nested_vmx {
 	 */
 	struct vmcs12 *cached_shadow_vmcs12;
 
+	/*
+	 * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+	 */
+	struct gfn_to_hva_cache shadow_vmcs12_cache;
+
 	/*
 	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
 	 * with the data held by struct vmcs12.
-- 
2.31.1




^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-12 14:56                           ` Paolo Bonzini
  2021-11-12 15:27                             ` David Woodhouse
@ 2021-11-15 16:47                             ` David Woodhouse
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
  2021-11-15 18:50                               ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache Paolo Bonzini
  1 sibling, 2 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:47 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 4926 bytes --]

On Fri, 2021-11-12 at 15:56 +0100, Paolo Bonzini wrote:
> > I think we want to kill the struct kvm_host_map completely, merge its
> > extra 'hva' and 'page' fields into the (possibly renamed)
> > gfn_to_pfn_cache along with your 'guest_uses_pa' flag, and take it from
> > there.
> 
> Yes, that makes sense.

OK... here's what I have so far.

I ended up killing the gfn_to_pfn_cache completely in a preliminary
patch, just so I could cleanly untangle it from the implementation of
kvm_vcpu_map()/kvm_vcpu_unmap(). Those want to die too, but they can
die *after* we have converted all their users to something else.

I then reinstate a newly-invented gfn_to_pfn_cache in another commit,
working only for the rwlock case so far because I have questions... :)

So the idea that we discussed is that the user of the gfn_to_pfn_cache
may set the guest_uses_pa flag to indicate that the cached PFN is used
by (e.g.) the VMCS02.

So... a user of this must check the validity after setting its mode to
IN_GUEST_MODE, and the invalidation must make a request and wake any
vCPU(s) which might be using it. As an optimisation, since these are
likely to be single-vCPU only, I added a 'vcpu' field to the cache.

The wakeup (in #if 0 so far) looks a bit like this...

	unsigned int req = KVM_REQ_GPC_INVALIDATE;

	/*
	 * If the OOM reaper is active, then all vCPUs should have been stopped
	 * already, so perform the request without KVM_REQUEST_WAIT and be sad
	 * if anything needed to be woken.
	 */
	if (!may_block)
		req &= ~KVM_REQUEST_WAIT;

	if (wake_all_vcpus) {
		called = kvm_make_all_cpus_request(kvm, req);
	} else if (wake_vcpus) {
		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
	}
	WARN_ON_ONCE(called && !may_block);
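
And the vCPU side of the contract would be something like this (equally
sketchy; the gpc field names are whatever they end up being):

	vcpu->mode = IN_GUEST_MODE;
	smp_mb();	/* vs. the barrier in the request/kick path */

	read_lock(&gpc->lock);
	valid = gpc->valid;
	read_unlock(&gpc->lock);

	if (!valid) {
		vcpu->mode = OUTSIDE_GUEST_MODE;
		/* Go round again and refresh from somewhere we can sleep. */
		return -EAGAIN;
	}

	/*
	 * From here on we rely on the invalidation side using
	 * KVM_REQUEST_WAIT: it can't complete until we've been kicked
	 * back out of guest mode, so gpc->pfn stays good for this entry.
	 */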

I moved the invalidation to the invalidate_range MMU notifier, as
discussed. But that's where the plan falls down a little bit because
IIUC, that one can't sleep at all. I need to move it *back* to
invalidate_range_start() where I had it before, if I want to let it
wait for vCPUs to exit. Which means... that the cache 'refresh' call
must wait until the mmu_notifier_count reaches zero? Am I allowed to do
that, and make the "There can be only one waiter" comment in
kvm_mmu_notifier_invalidate_range_end() no longer true? Or is there a
better way?

I was also pondering whether to introduce a new arch-independent
KVM_REQ_GPC_INVALIDATE, or let it be arch-dependent and make it a field
of the cache, so that users can raise whatever requests they like?

Anyway, this much works for Xen event channel delivery (again) and
looks like it's *most* of the way there for fixing the nested stuff.

The first four or maybe even eight (modulo testing) patches are
probably ready to be merged anyway. The "Maintain valid mapping of Xen
shared_info page" patch is utterly trivial now and eventually I'll
probably fold it into the subsequent patch, but it's left separate for
illustration, for now.

David Woodhouse (11):
      KVM: x86: Fix steal time asm constraints in 32-bit mode
      KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO
      KVM: selftests: Add event channel upcall support to xen_shinfo_test
      KVM: x86/xen: Use sizeof_field() instead of open-coding it
      KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12
      KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check
      KVM: nVMX: Use a gfn_to_hva_cache for vmptrld
      KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache
      KVM: Reinstate gfn_to_pfn_cache with invalidation support
      KVM: x86/xen: Maintain valid mapping of Xen shared_info page
      KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery

 Documentation/virt/kvm/api.rst                       |  21 ++++++
 arch/x86/include/asm/kvm_host.h                      |   3 +-
 arch/x86/kvm/irq_comm.c                              |  12 ++++
 arch/x86/kvm/vmx/nested.c                            |  76 +++++++++++++---------
 arch/x86/kvm/vmx/vmx.h                               |  10 +++
 arch/x86/kvm/x86.c                                   |   5 +-
 arch/x86/kvm/xen.c                                   | 305 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 arch/x86/kvm/xen.h                                   |   9 +++
 include/linux/kvm_host.h                             |  27 ++++++--
 include/linux/kvm_types.h                            |  13 +++-
 include/uapi/linux/kvm.h                             |  11 ++++
 tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++---
 virt/kvm/kvm_main.c                                  | 340 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
 13 files changed, 862 insertions(+), 157 deletions(-)


^ permalink raw reply	[flat|nested] 70+ messages in thread

* [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode
  2021-11-15 16:47                             ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
@ 2021-11-15 16:50                               ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 02/11] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO David Woodhouse
                                                   ` (9 more replies)
  2021-11-15 18:50                               ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache Paolo Bonzini
  1 sibling, 10 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

In 64-bit mode, x86 instruction encoding allows us to use the low 8 bits
of any GPR as an 8-bit operand. In 32-bit mode, however, we can only use
the [abcd] registers. For which, GCC has the "q" constraint instead of
the less restrictive "r".

Fixes: 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dc7eb5fddfd3..54452269a4ff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3307,7 +3307,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 			     "xor %1, %1\n"
 			     "2:\n"
 			     _ASM_EXTABLE_UA(1b, 2b)
-			     : "+r" (st_preempted),
+			     : "+q" (st_preempted),
 			       "+&r" (err)
 			     : "m" (st->preempted));
 		if (err)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 02/11] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 03/11] KVM: selftests: Add event channel upcall support to xen_shinfo_test David Woodhouse
                                                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

In commit 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache") we
stopped storing this in-kernel as a GPA, and started storing it as a GFN.
Which means we probably should have stopped calling gpa_to_gfn() on it
when userspace asks for it back.

Cc: stable@vger.kernel.org
Fixes: 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8f62baebd028..6dd3d687cf04 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -299,7 +299,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
+		data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn;
 		r = 0;
 		break;
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 03/11] KVM: selftests: Add event channel upcall support to xen_shinfo_test
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 02/11] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 04/11] KVM: x86/xen: Use sizeof_field() instead of open-coding it David Woodhouse
                                                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

When I first looked at this, there was no support for guest exception
handling in the KVM selftests. In fact it was merged into 5.10 before
the Xen support got merged in 5.11, and I could have used it from the
start.

Hook it up now, to exercise the Xen upcall delivery. I'm about to make
things a bit more interesting by handling the full 2level event channel
stuff in-kernel on top of the basic vector injection that we already
have, and I'll want to build more tests on top.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 .../selftests/kvm/x86_64/xen_shinfo_test.c    | 75 ++++++++++++++++---
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index eda0d2a51224..a0699f00b3d6 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -24,8 +24,12 @@
 
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
 #define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
+#define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
 
 #define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
+#define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
+
+#define EVTCHN_VECTOR	0x10
 
 static struct kvm_vm *vm;
 
@@ -56,15 +60,44 @@ struct vcpu_runstate_info {
     uint64_t time[4];
 };
 
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+struct vcpu_info {
+        uint8_t evtchn_upcall_pending;
+        uint8_t evtchn_upcall_mask;
+        unsigned long evtchn_pending_sel;
+        struct arch_vcpu_info arch;
+        struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
 #define RUNSTATE_running  0
 #define RUNSTATE_runnable 1
 #define RUNSTATE_blocked  2
 #define RUNSTATE_offline  3
 
+static void evtchn_handler(struct ex_regs *regs)
+{
+	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
+	vi->evtchn_upcall_pending = 0;
+
+	GUEST_SYNC(0x20);
+}
+
 static void guest_code(void)
 {
 	struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
 
+	__asm__ __volatile__(
+		"sti\n"
+		"nop\n"
+	);
+
+	/* Trigger an interrupt injection */
+	GUEST_SYNC(0);
+
 	/* Test having the host set runstates manually */
 	GUEST_SYNC(RUNSTATE_runnable);
 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
@@ -153,7 +186,7 @@ int main(int argc, char *argv[])
 
 	struct kvm_xen_vcpu_attr vi = {
 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
-		.u.gpa = SHINFO_REGION_GPA + 0x40,
+		.u.gpa = VCPU_INFO_ADDR,
 	};
 	vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi);
 
@@ -163,6 +196,16 @@ int main(int argc, char *argv[])
 	};
 	vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock);
 
+	struct kvm_xen_hvm_attr vec = {
+		.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+		.u.vector = EVTCHN_VECTOR,
+	};
+	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vm, VCPU_ID);
+	vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler);
+
 	if (do_runstate_tests) {
 		struct kvm_xen_vcpu_attr st = {
 			.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
@@ -171,9 +214,14 @@ int main(int argc, char *argv[])
 		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
 	}
 
+	struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
+	vinfo->evtchn_upcall_pending = 0;
+
 	struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
 	rs->state = 0x5a;
 
+	bool evtchn_irq_expected = false;
+
 	for (;;) {
 		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 		struct ucall uc;
@@ -193,16 +241,21 @@ int main(int argc, char *argv[])
 			struct kvm_xen_vcpu_attr rst;
 			long rundelay;
 
-			/* If no runstate support, bail out early */
-			if (!do_runstate_tests)
-				goto done;
-
-			TEST_ASSERT(rs->state_entry_time == rs->time[0] +
-				    rs->time[1] + rs->time[2] + rs->time[3],
-				    "runstate times don't add up");
+			if (do_runstate_tests)
+				TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+					    rs->time[1] + rs->time[2] + rs->time[3],
+					    "runstate times don't add up");
 
 			switch (uc.args[1]) {
-			case RUNSTATE_running...RUNSTATE_offline:
+			case 0:
+				evtchn_irq_expected = true;
+				vinfo->evtchn_upcall_pending = 1;
+				break;
+
+			case RUNSTATE_runnable...RUNSTATE_offline:
+				TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
+				if (!do_runstate_tests)
+					goto done;
 				rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
 				rst.u.runstate.state = uc.args[1];
 				vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst);
@@ -236,6 +289,10 @@ int main(int argc, char *argv[])
 					sched_yield();
 				} while (get_run_delay() < rundelay);
 				break;
+			case 0x20:
+				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
+				evtchn_irq_expected = false;
+				break;
 			}
 			break;
 		}
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 04/11] KVM: x86/xen: Use sizeof_field() instead of open-coding it
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 02/11] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 03/11] KVM: selftests: Add event channel upcall support to xen_shinfo_test David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 05/11] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12 David Woodhouse
                                                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/xen.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 6dd3d687cf04..565da9c3853b 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -127,9 +127,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	state_entry_time = vx->runstate_entry_time;
 	state_entry_time |= XEN_RUNSTATE_UPDATE;
 
-	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
+	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
 		     sizeof(state_entry_time));
-	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
+	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
 		     sizeof(state_entry_time));
 
 	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -144,9 +144,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	 */
 	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
 		     offsetof(struct compat_vcpu_runstate_info, state));
-	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
 		     sizeof(vx->current_runstate));
-	BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
 		     sizeof(vx->current_runstate));
 
 	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -163,9 +163,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 		     offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
 	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
 		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
-	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
-		     sizeof(((struct compat_vcpu_runstate_info *)0)->time));
-	BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
+		     sizeof_field(struct compat_vcpu_runstate_info, time));
+	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
 		     sizeof(vx->runstate_times));
 
 	if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
@@ -205,9 +205,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
 		     offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
 	BUILD_BUG_ON(sizeof(rc) !=
-		     sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+		     sizeof_field(struct vcpu_info, evtchn_upcall_pending));
 	BUILD_BUG_ON(sizeof(rc) !=
-		     sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
 	/*
 	 * For efficiency, this mirrors the checks for using the valid
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 05/11] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (2 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 04/11] KVM: x86/xen: Use sizeof_field() instead of open-coding it David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 06/11] KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check David Woodhouse
                                                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

Using kvm_vcpu_map() for reading from the guest is entirely gratuitous,
when all we do is a single memcpy and unmap it again. Fix it up to use
kvm_read_guest()... but in fact I couldn't bring myself to do that
without also making it use a gfn_to_hva_cache for both that *and* the
copy in the other direction.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/vmx/nested.c | 24 +++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |  5 +++++
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b213ca966d41..7e2a99f435b6 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -670,33 +670,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
 				       struct vmcs12 *vmcs12)
 {
-	struct kvm_host_map map;
-	struct vmcs12 *shadow;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
 
 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 	    vmcs12->vmcs_link_pointer == INVALID_GPA)
 		return;
 
-	shadow = get_shadow_vmcs12(vcpu);
-
-	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
+	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
 		return;
 
-	memcpy(shadow, map.hva, VMCS12_SIZE);
-	kvm_vcpu_unmap(vcpu, &map, false);
+	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+			      VMCS12_SIZE);
 }
 
 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
 					      struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
 
 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
 	    vmcs12->vmcs_link_pointer == INVALID_GPA)
 		return;
 
-	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
-			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
+		return;
+
+	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
+			       VMCS12_SIZE);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a4ead6023133..cdadbd5dc0ca 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -141,6 +141,11 @@ struct nested_vmx {
 	 */
 	struct vmcs12 *cached_shadow_vmcs12;
 
+	/*
+	 * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer
+	 */
+	struct gfn_to_hva_cache shadow_vmcs12_cache;
+
 	/*
 	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
 	 * with the data held by struct vmcs12.
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 06/11] KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (3 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 05/11] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12 David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 07/11] KVM: nVMX: Use a gfn_to_hva_cache for vmptrld David Woodhouse
                                                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

Kill another mostly gratuitous kvm_vcpu_map() which could just use the
userspace HVA for it.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/vmx/nested.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 7e2a99f435b6..070bf9558b2a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2916,9 +2916,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
 					  struct vmcs12 *vmcs12)
 {
-	int r = 0;
-	struct vmcs12 *shadow;
-	struct kvm_host_map map;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;
+	struct vmcs_hdr hdr;
 
 	if (vmcs12->vmcs_link_pointer == INVALID_GPA)
 		return 0;
@@ -2926,17 +2926,21 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
 	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
 		return -EINVAL;
 
-	if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
-		return -EINVAL;
+	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
+	    CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
+					 vmcs12->vmcs_link_pointer, VMCS12_SIZE)))
+                return -EINVAL;
 
-	shadow = map.hva;
+	if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+					    offsetof(struct vmcs12, hdr),
+					    sizeof(hdr))))
+		return -EINVAL;
 
-	if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
-	    CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
-		r = -EINVAL;
+	if (CC(hdr.revision_id != VMCS12_REVISION) ||
+	    CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
+		return -EINVAL;
 
-	kvm_vcpu_unmap(vcpu, &map, false);
-	return r;
+	return 0;
 }
 
 /*
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 07/11] KVM: nVMX: Use a gfn_to_hva_cache for vmptrld
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (4 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 06/11] KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache David Woodhouse
                                                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

And thus another call to kvm_vcpu_map() can die.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/vmx/nested.c | 26 +++++++++++++++++---------
 arch/x86/kvm/vmx/vmx.h    |  5 +++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 070bf9558b2a..280f34ea02c3 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5274,10 +5274,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		return 1;
 
 	if (vmx->nested.current_vmptr != vmptr) {
-		struct kvm_host_map map;
-		struct vmcs12 *new_vmcs12;
+		struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
+		struct vmcs_hdr hdr;
 
-		if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
+		if (ghc->gpa != vmptr &&
+		    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
 			/*
 			 * Reads from an unbacked page return all 1s,
 			 * which means that the 32 bits located at the
@@ -5288,12 +5289,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
 		}
 
-		new_vmcs12 = map.hva;
+		if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr,
+						 offsetof(struct vmcs12, hdr),
+						 sizeof(hdr))) {
+			return nested_vmx_fail(vcpu,
+				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+		}
 
-		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
-		    (new_vmcs12->hdr.shadow_vmcs &&
+		if (hdr.revision_id != VMCS12_REVISION ||
+		    (hdr.shadow_vmcs &&
 		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
-			kvm_vcpu_unmap(vcpu, &map, false);
 			return nested_vmx_fail(vcpu,
 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
 		}
@@ -5304,8 +5309,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		 * Load VMCS12 from guest memory since it is not already
 		 * cached.
 		 */
-		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
-		kvm_vcpu_unmap(vcpu, &map, false);
+		if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
+					  VMCS12_SIZE)) {
+			return nested_vmx_fail(vcpu,
+				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+		}
 
 		set_current_vmptr(vmx, vmptr);
 	}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index cdadbd5dc0ca..4df2ac24ffc1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -146,6 +146,11 @@ struct nested_vmx {
 	 */
 	struct gfn_to_hva_cache shadow_vmcs12_cache;
 
+	/*
+	 * GPA to HVA cache for VMCS12
+	 */
+	struct gfn_to_hva_cache vmcs12_cache;
+
 	/*
 	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
 	 * with the data held by struct vmcs12.
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (5 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 07/11] KVM: nVMX: Use a gfn_to_hva_cache for vmptrld David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-16 10:21                                   ` Paolo Bonzini
  2021-11-15 16:50                                 ` [PATCH 09/11] KVM: Reinstate gfn_to_pfn_cache with invalidation support David Woodhouse
                                                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time /
preempted status") I removed the only user of these functions because
it was basically impossible to use them safely.

There are two stages to the GFN → PFN mapping; first through the KVM
memslots to a userspace HVA and then through the page tables to
translate that HVA to an underlying PFN. Invalidations of the former
were being handled correctly, but no attempt was made to use the MMU
notifiers to invalidate the cache when the HVA→GFN mapping changed.

As a prelude to reinventing the gfn_to_pfn_cache with more usable
semantics, rip it out entirely and untangle the implementation of
the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it.

All current users of kvm_vcpu_map() also look broken right now, and
will be dealt with separately. They broadly fall into two classes:

 • Those which map, access the data and immediately unmap. This is
   mostly gratuitous and could just as well use the existing user
   HVA, and could probably benefit from a gfn_to_hva_cache as they
   do so (see the sketch just after this list).

 • Those which keep the mapping around for a longer time, perhaps
   even using the PFN directly from the guest. These will need to
   be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map()
   can be removed too.
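
For illustration, a minimal sketch of what a conversion in the first class
might look like, using only the existing gfn_to_hva_cache API. The names
foo_state/foo_read/foo_data are placeholders, not anything in the tree:

	struct foo_state {
		struct gfn_to_hva_cache cache;	/* replaces kvm_host_map */
	};

	static int foo_read(struct kvm *kvm, struct foo_state *s,
			    gpa_t gpa, struct foo_data *out)
	{
		/* (Re)initialise the cache only when the GPA changes;
		 * kvm_read_guest_cached() copes with memslot generation
		 * changes by itself. */
		if (s->cache.gpa != gpa &&
		    kvm_gfn_to_hva_cache_init(kvm, &s->cache, gpa, sizeof(*out)))
			return -EFAULT;

		/* Copy via the cached userspace HVA: no map/unmap, no pinning */
		return kvm_read_guest_cached(kvm, &s->cache, out, sizeof(*out));
	}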

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 include/linux/kvm_host.h  |   6 +--
 include/linux/kvm_types.h |   7 ---
 virt/kvm/kvm_main.c       | 100 +++++---------------------------------
 3 files changed, 12 insertions(+), 101 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9e0667e3723e..c310648cc8f1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -874,7 +874,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_dirty(kvm_pfn_t pfn);
 void kvm_set_pfn_accessed(kvm_pfn_t pfn);
 
-void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
@@ -950,12 +950,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
-		struct gfn_to_pfn_cache *cache, bool atomic);
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
-		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2237abb93ccd..234eab059839 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -53,13 +53,6 @@ struct gfn_to_hva_cache {
 	struct kvm_memory_slot *memslot;
 };
 
-struct gfn_to_pfn_cache {
-	u64 generation;
-	gfn_t gfn;
-	kvm_pfn_t pfn;
-	bool dirty;
-};
-
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d31724500501..9646bb9112c1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2548,72 +2548,36 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
-void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
 {
 	if (pfn == 0)
 		return;
 
-	if (cache)
-		cache->pfn = cache->gfn = 0;
-
 	if (dirty)
 		kvm_release_pfn_dirty(pfn);
 	else
 		kvm_release_pfn_clean(pfn);
 }
 
-static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
-				 struct gfn_to_pfn_cache *cache, u64 gen)
-{
-	kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
-	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
-	cache->gfn = gfn;
-	cache->dirty = false;
-	cache->generation = gen;
-}
-
-static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
-			 struct kvm_host_map *map,
-			 struct gfn_to_pfn_cache *cache,
-			 bool atomic)
+int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 {
 	kvm_pfn_t pfn;
 	void *hva = NULL;
 	struct page *page = KVM_UNMAPPED_PAGE;
-	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
-	u64 gen = slots->generation;
 
 	if (!map)
 		return -EINVAL;
 
-	if (cache) {
-		if (!cache->pfn || cache->gfn != gfn ||
-			cache->generation != gen) {
-			if (atomic)
-				return -EAGAIN;
-			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
-		}
-		pfn = cache->pfn;
-	} else {
-		if (atomic)
-			return -EAGAIN;
-		pfn = gfn_to_pfn_memslot(slot, gfn);
-	}
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	if (is_error_noslot_pfn(pfn))
 		return -EINVAL;
 
 	if (pfn_valid(pfn)) {
 		page = pfn_to_page(pfn);
-		if (atomic)
-			hva = kmap_atomic(page);
-		else
-			hva = kmap(page);
+		hva = kmap(page);
 #ifdef CONFIG_HAS_IOMEM
-	} else if (!atomic) {
-		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
 	} else {
-		return -EINVAL;
+		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
 #endif
 	}
 
@@ -2627,27 +2591,9 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
 
 	return 0;
 }
-
-int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
-		struct gfn_to_pfn_cache *cache, bool atomic)
-{
-	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
-			cache, atomic);
-}
-EXPORT_SYMBOL_GPL(kvm_map_gfn);
-
-int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
-{
-	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
-		NULL, false);
-}
 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
 
-static void __kvm_unmap_gfn(struct kvm *kvm,
-			struct kvm_memory_slot *memslot,
-			struct kvm_host_map *map,
-			struct gfn_to_pfn_cache *cache,
-			bool dirty, bool atomic)
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 {
 	if (!map)
 		return;
@@ -2655,45 +2601,21 @@ static void __kvm_unmap_gfn(struct kvm *kvm,
 	if (!map->hva)
 		return;
 
-	if (map->page != KVM_UNMAPPED_PAGE) {
-		if (atomic)
-			kunmap_atomic(map->hva);
-		else
-			kunmap(map->page);
-	}
+	if (map->page != KVM_UNMAPPED_PAGE)
+		kunmap(map->page);
 #ifdef CONFIG_HAS_IOMEM
-	else if (!atomic)
-		memunmap(map->hva);
 	else
-		WARN_ONCE(1, "Unexpected unmapping in atomic context");
+		memunmap(map->hva);
 #endif
 
 	if (dirty)
-		mark_page_dirty_in_slot(kvm, memslot, map->gfn);
+		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
-	if (cache)
-		cache->dirty |= dirty;
-	else
-		kvm_release_pfn(map->pfn, dirty, NULL);
+	kvm_release_pfn(map->pfn, dirty);
 
 	map->hva = NULL;
 	map->page = NULL;
 }
-
-int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 
-		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
-{
-	__kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
-			cache, dirty, atomic);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
-
-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
-{
-	__kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
-			map, NULL, dirty, false);
-}
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 09/11] KVM: Reinstate gfn_to_pfn_cache with invalidation support
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (6 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 10/11] KVM: x86/xen: Maintain valid mapping of Xen shared_info page David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery David Woodhouse
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

This supports the rwlock-based usage mode only for now; to support the
guest_uses_pa case we need to work a few more details out. Perhaps add
a new KVM_REQ_ type for it, maybe arch-specific or not, and work out
how we can do the invalidation from invalidate_range_start and ensure
that it doesn't get reinstated before invalidate_range actually happens.

This much is good enough to start testing for Xen event channels though.
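
For reference, the intended read-side pattern looks roughly like this
(a sketch; do_something_with() is a placeholder, and the caller is assumed
to hold kvm->srcu so that kvm_memslots() is safe):

	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* May sleep, so this is for the slow path only; atomic
		 * users just bail out when the check fails. */
		if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE, true))
			return -EFAULT;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* gpc->khva and gpc->pfn are stable until the lock is dropped */
	do_something_with(gpc->khva);

	read_unlock_irqrestore(&gpc->lock, flags);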

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 include/linux/kvm_host.h  |  14 +++
 include/linux/kvm_types.h |  18 +++
 virt/kvm/kvm_main.c       | 250 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 282 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c310648cc8f1..762bf2586feb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -559,6 +559,10 @@ struct kvm {
 	unsigned long mn_active_invalidate_count;
 	struct rcuwait mn_memslots_update_rcuwait;
 
+	/* For invalidation of gfn_to_pfn_caches */
+	struct list_head gpc_list;
+	spinlock_t gpc_lock;
+
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
 	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
@@ -966,6 +970,16 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+			      struct kvm_vcpu *vcpu, bool guest_uses_pa,
+			      bool kernel_map, gpa_t gpa, unsigned long len,
+			      bool write);
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				 gpa_t gpa, unsigned long len, bool write);
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				gpa_t gpa, unsigned long len);
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 234eab059839..896bd78a30e3 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -19,6 +19,7 @@ struct kvm_memslots;
 enum kvm_mr_change;
 
 #include <linux/types.h>
+#include <linux/spinlock_types.h>
 
 #include <asm/kvm_types.h>
 
@@ -53,6 +54,23 @@ struct gfn_to_hva_cache {
 	struct kvm_memory_slot *memslot;
 };
 
+struct gfn_to_pfn_cache {
+	u64 generation;
+	gpa_t gpa;
+	unsigned long uhva;
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	struct list_head list;
+	rwlock_t lock;
+	void *khva;
+	kvm_pfn_t pfn;
+	bool active;
+	bool valid;
+	bool dirty;
+	bool guest_uses_pa;
+	bool kernel_map;
+};
+
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9646bb9112c1..7382aa45d5e8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -458,6 +458,9 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 	return container_of(mn, struct kvm, mmu_notifier);
 }
 
+static void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+					unsigned long end, bool may_block);
+
 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 					      struct mm_struct *mm,
 					      unsigned long start, unsigned long end)
@@ -465,6 +468,8 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int idx;
 
+	gfn_to_pfn_cache_invalidate(kvm, start, end, false);
+
 	idx = srcu_read_lock(&kvm->srcu);
 	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -1051,6 +1056,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 
+	INIT_LIST_HEAD(&kvm->gpc_list);
+	spin_lock_init(&kvm->gpc_lock);
+
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -2618,6 +2626,248 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
+static void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+					unsigned long end, bool may_block)
+{
+	bool wake_vcpus = false, wake_all_vcpus = false;
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+	struct gfn_to_pfn_cache *gpc;
+	bool called = false;
+
+	spin_lock(&kvm->gpc_lock);
+	list_for_each_entry(gpc, &kvm->gpc_list, list) {
+		write_lock_irq(&gpc->lock);
+
+		/* Only a single page so no need to care about length */
+		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+		    gpc->uhva >= start && gpc->uhva < end) {
+			gpc->valid = false;
+
+			if (gpc->dirty) {
+				int idx = srcu_read_lock(&kvm->srcu);
+				mark_page_dirty(kvm, gpa_to_gfn(gpc->gpa));
+				srcu_read_unlock(&kvm->srcu, idx);
+
+				kvm_set_pfn_dirty(gpc->pfn);
+				gpc->dirty = false;
+			}
+
+			/*
+			 * If a guest vCPU could be using the physical address,
+			 * it needs to be woken.
+			 */
+			if (gpc->guest_uses_pa) {
+				if (wake_all_vcpus) {
+					/* Nothing to do */
+				} else if (gpc->vcpu) {
+					/* Only need to wake one vCPU for this */
+					if (!wake_vcpus) {
+						wake_vcpus = true;
+						bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+					}
+					__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
+				} else
+					wake_all_vcpus = true;
+			}
+		}
+		write_unlock_irq(&gpc->lock);
+	}
+	spin_unlock(&kvm->gpc_lock);
+
+#if 0
+	unsigned int req = KVM_REQ_GPC_INVALIDATE;
+
+	/*
+	 * If the OOM reaper is active, then all vCPUs should have been stopped
+	 * already, so perform the request without KVM_REQUEST_WAIT and be sad
+	 * if anything needed to be woken.
+	 */
+	if (!may_block)
+		req &= ~KVM_REQUEST_WAIT;
+
+	if (wake_all_vcpus) {
+		called = kvm_make_all_cpus_request(kvm, req);
+	} else if (wake_vcpus) {
+		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+	}
+#endif
+	WARN_ON_ONCE(called && !may_block);
+}
+
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				gpa_t gpa, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+
+	if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+		return false;
+
+	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+	    kvm_is_error_hva(gpc->uhva))
+		return false;
+
+	if (!gpc->valid)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
+
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				 gpa_t gpa, unsigned long len, bool write)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	unsigned long page_offset = gpa & ~PAGE_MASK;
+	kvm_pfn_t old_pfn, new_pfn;
+	unsigned long old_uhva;
+	gpa_t old_gpa;
+	void *old_khva;
+	bool old_valid, old_dirty;
+	int ret = 0;
+
+	/*
+	 * It must fit within a single page. The 'len' argument is
+	 * only to enforce that.
+	 */
+	if (page_offset + len > PAGE_SIZE)
+		return -EINVAL;
+
+	write_lock_irq(&gpc->lock);
+
+	old_gpa = gpc->gpa;
+	old_pfn = gpc->pfn;
+	old_khva = gpc->khva;
+	old_uhva = gpc->uhva;
+	old_valid = gpc->valid;
+	old_dirty = gpc->dirty;
+
+	/* If the userspace HVA is invalid, refresh that first */
+	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+	    kvm_is_error_hva(gpc->uhva)) {
+		gfn_t gfn = gpa_to_gfn(gpa);
+
+		gpc->dirty = false;
+		gpc->gpa = gpa;
+		gpc->generation = slots->generation;
+		gpc->memslot = __gfn_to_memslot(slots, gfn);
+		gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+		if (kvm_is_error_hva(gpc->uhva)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		gpc->uhva += page_offset;
+	}
+
+	/*
+	 * If the userspace HVA changed or the PFN was already invalid,
+	 * drop the lock and do the HVA to PFN lookup again.
+	 */
+	if (!old_valid || old_uhva != gpc->uhva) {
+		unsigned long uhva = gpc->uhva;
+		void *new_khva = NULL;
+
+		/* Placeholders for "hva is valid but not yet mapped" */
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->khva = NULL;
+		gpc->valid = true;
+
+		write_unlock_irq(&gpc->lock);
+
+		new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
+		if (is_error_noslot_pfn(new_pfn))
+			ret = -EFAULT;
+		else if (gpc->kernel_map) {
+			if (new_pfn == old_pfn) {
+				new_khva = (void *)((unsigned long)old_khva - page_offset);
+				old_pfn = KVM_PFN_ERR_FAULT;
+				old_khva = NULL;
+			} else if (pfn_valid(new_pfn)) {
+				new_khva = kmap(pfn_to_page(new_pfn));
+#ifdef CONFIG_HAS_IOMEM
+			} else {
+				new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
+#endif
+			}
+			if (!new_khva)
+				ret = -EFAULT;
+		}
+
+		write_lock_irq(&gpc->lock);
+		if (ret) {
+			gpc->valid = false;
+			gpc->pfn = KVM_PFN_ERR_FAULT;
+			gpc->khva = NULL;
+		} else {
+			/* At this point, gpc->valid may already have been cleared */
+			gpc->pfn = new_pfn;
+			gpc->khva = new_khva + page_offset;
+		}
+	}
+
+ out:
+	if (ret)
+		gpc->dirty = false;
+	else
+		gpc->dirty = write;
+
+	write_unlock_irq(&gpc->lock);
+
+	/* Unmap the old page if it was mapped before */
+	if (!is_error_noslot_pfn(old_pfn)) {
+		if (pfn_valid(old_pfn)) {
+			kunmap(pfn_to_page(old_pfn));
+#ifdef CONFIG_HAS_IOMEM
+		} else {
+			memunmap(old_khva);
+#endif
+		}
+		kvm_release_pfn(old_pfn, old_dirty);
+		if (old_dirty)
+			mark_page_dirty(kvm, old_gpa);
+	}
+
+	return ret;
+}
+
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+			      struct kvm_vcpu *vcpu, bool guest_uses_pa,
+			      bool kernel_map, gpa_t gpa, unsigned long len,
+			      bool write)
+{
+	if (!gpc->active) {
+		rwlock_init(&gpc->lock);
+
+		gpc->khva = NULL;
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->uhva = KVM_HVA_ERR_BAD;
+		gpc->vcpu = vcpu;
+		gpc->guest_uses_pa = guest_uses_pa;
+		gpc->kernel_map = kernel_map;
+		gpc->valid = false;
+		gpc->active = true;
+
+		spin_lock(&kvm->gpc_lock);
+		list_add(&gpc->list, &kvm->gpc_list);
+		spin_unlock(&kvm->gpc_lock);
+	}
+	return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len, write);
+}
+
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+{
+	if (gpc->active) {
+		spin_lock(&kvm->gpc_lock);
+		list_del(&gpc->list);
+		spin_unlock(&kvm->gpc_lock);
+
+		/* Failing this refresh tears down any existing mapping */
+		(void)kvm_gfn_to_pfn_cache_refresh(kvm, gpc, GPA_INVALID, 0, false);
+		gpc->active = false;
+	}
+}
+
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	kvm_pfn_t pfn;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 10/11] KVM: x86/xen: Maintain valid mapping of Xen shared_info page
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (7 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 09/11] KVM: Reinstate gfn_to_pfn_cache with invalidation support David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 16:50                                 ` [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery David Woodhouse
  9 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

In order to allow for event channel delivery, we would like to have a
kernel mapping of the shared_info page which can be accessed in atomic
context in the common case.
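
Roughly what that buys us, as a sketch (not code from this patch): with
only shinfo_gfn, every access had to go through the memslots with e.g.
kvm_read_guest()/kvm_write_guest(), which may fault and sleep. With the
cache, once kvm_gfn_to_pfn_cache_check() succeeds under read_lock of
gpc->lock, the kernel mapping can be dereferenced directly, even with
interrupts disabled:

	struct shared_info *shinfo = kvm->arch.xen.shinfo_cache.khva;

	/* 'port' is illustrative; the next patch does the real bitmap work */
	set_bit(port, (unsigned long *)&shinfo->evtchn_pending);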

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/xen.c              | 25 ++++++++++++++-----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e5d8700319cc..ea53740f6fc7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1013,7 +1013,7 @@ struct msr_bitmap_range {
 struct kvm_xen {
 	bool long_mode;
 	u8 upcall_vector;
-	gfn_t shinfo_gfn;
+	struct gfn_to_pfn_cache shinfo_cache;
 };
 
 enum kvm_irqchip_mode {
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 565da9c3853b..dcd88f1092d2 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -23,16 +23,21 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
 static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
 {
+	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	gpa_t gpa = gfn_to_gpa(gfn);
 	int wc_ofs, sec_hi_ofs;
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
-		ret = -EFAULT;
+	if (gfn == GPA_INVALID) {
+		kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
 		goto out;
 	}
-	kvm->arch.xen.shinfo_gfn = gfn;
+
+	ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, gpa,
+					PAGE_SIZE, true);
+	if (ret)
+		goto out;
 
 	/* Paranoia checks on the 32-bit struct layout */
 	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -260,15 +265,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		if (data->u.shared_info.gfn == GPA_INVALID) {
-			kvm->arch.xen.shinfo_gfn = GPA_INVALID;
-			r = 0;
-			break;
-		}
 		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
 		break;
 
-
 	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
 		if (data->u.vector && data->u.vector < 0x10)
 			r = -EINVAL;
@@ -299,7 +298,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn;
+		if (kvm->arch.xen.shinfo_cache.active)
+			data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+		else
+			data->u.shared_info.gfn = GPA_INVALID;
 		r = 0;
 		break;
 
@@ -661,11 +663,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 
 void kvm_xen_init_vm(struct kvm *kvm)
 {
-	kvm->arch.xen.shinfo_gfn = GPA_INVALID;
 }
 
 void kvm_xen_destroy_vm(struct kvm *kvm)
 {
+	kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
 	if (kvm->arch.xen_hvm_config.msr)
 		static_branch_slow_dec_deferred(&kvm_xen_enabled);
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
                                                   ` (8 preceding siblings ...)
  2021-11-15 16:50                                 ` [PATCH 10/11] KVM: x86/xen: Maintain valid mapping of Xen shared_info page David Woodhouse
@ 2021-11-15 16:50                                 ` David Woodhouse
  2021-11-15 17:02                                   ` David Woodhouse
  2021-11-15 18:49                                   ` Paolo Bonzini
  9 siblings, 2 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 16:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

From: David Woodhouse <dwmw@amazon.co.uk>

This adds basic support for delivering 2 level event channels to a guest.

Initially, it only supports delivery via the IRQ routing table, triggered
by an eventfd. In order to do so, it has a kvm_xen_set_evtchn_fast()
function which will use the pre-mapped shared_info page if it already
exists and is still valid, while the slow path through the irqfd_inject
workqueue will remap the shared_info page if necessary.

It sets the bits in the shared_info page but not the vcpu_info; that is
deferred to __kvm_xen_has_interrupt() which raises the vector to the
appropriate vCPU.
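
From the VMM side, programming such a route and binding an eventfd to it
looks roughly like this (a userspace sketch mirroring the selftest below;
vm_fd, evt_fd and the port/GSI numbers are illustrative, and it needs
<linux/kvm.h>, <sys/ioctl.h> and <sys/eventfd.h>):

	struct {
		struct kvm_irq_routing info;
		struct kvm_irq_routing_entry entries[1];
	} routes = {
		.info.nr = 1,
		.entries[0] = {
			.gsi = 32,
			.type = KVM_IRQ_ROUTING_XEN_EVTCHN,
			.u.xen_evtchn = {
				.port = 15,
				.vcpu = 0,
				.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
			},
		},
	};
	ioctl(vm_fd, KVM_SET_GSI_ROUTING, &routes);

	struct kvm_irqfd ifd = { .fd = evt_fd, .gsi = 32 };
	ioctl(vm_fd, KVM_IRQFD, &ifd);

	/* Raising the event channel is now just a write to the eventfd */
	eventfd_write(evt_fd, 1);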

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 Documentation/virt/kvm/api.rst                |  21 ++
 arch/x86/include/asm/kvm_host.h               |   1 +
 arch/x86/kvm/irq_comm.c                       |  12 +
 arch/x86/kvm/x86.c                            |   3 +-
 arch/x86/kvm/xen.c                            | 262 +++++++++++++++++-
 arch/x86/kvm/xen.h                            |   9 +
 include/linux/kvm_host.h                      |   7 +
 include/uapi/linux/kvm.h                      |  11 +
 .../selftests/kvm/x86_64/xen_shinfo_test.c    | 112 +++++++-
 9 files changed, 431 insertions(+), 7 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index aeeb071c7688..2a24098f9f95 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1796,6 +1796,7 @@ No flags are specified so far, the corresponding field must be set to zero.
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
   };
@@ -1805,6 +1806,7 @@ No flags are specified so far, the corresponding field must be set to zero.
   #define KVM_IRQ_ROUTING_MSI 2
   #define KVM_IRQ_ROUTING_S390_ADAPTER 3
   #define KVM_IRQ_ROUTING_HV_SINT 4
+  #define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 flags:
 
@@ -1856,6 +1858,20 @@ address_hi must be zero.
 	__u32 sint;
   };
 
+  struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+  };
+
+
+When KVM_CAP_XEN_HVM includes the KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL bit
+in its indication of supported features, routing to Xen event channels
+is supported. Although the priority field is present, only the value
+KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL is supported, which means delivery by
+2 level event channels. FIFO event channel support may be added in
+the future.
+
 
 4.55 KVM_SET_TSC_KHZ
 --------------------
@@ -7401,6 +7417,7 @@ PVHVM guests. Valid flags are::
   #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
   #define KVM_XEN_HVM_CONFIG_SHARED_INFO	(1 << 2)
   #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 2)
+  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
 
 The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
 ioctl is available, for the guest to set its hypercall page.
@@ -7420,6 +7437,10 @@ The KVM_XEN_HVM_CONFIG_RUNSTATE flag indicates that the runstate-related
 features KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR/_CURRENT/_DATA/_ADJUST are
 supported by the KVM_XEN_VCPU_SET_ATTR/KVM_XEN_VCPU_GET_ATTR ioctls.
 
+The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
+of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
+field set to indicate 2 level event channel delivery.
+
 8.31 KVM_CAP_PPC_MULTITCE
 -------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea53740f6fc7..465455334c0c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -603,6 +603,7 @@ struct kvm_vcpu_xen {
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
+	unsigned long evtchn_pending_sel;
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..afd2de84be60 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
 
 #include "hyperv.h"
 #include "x86.h"
+#include "xen.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 			return r;
 		break;
 
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		if (!level)
+			return -1;
+
+		return kvm_xen_set_evtchn_fast(e, kvm);
+#endif
 	default:
 		break;
 	}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
 		e->hv_sint.sint = ue->u.hv_sint.sint;
 		break;
+#ifdef CONFIG_KVM_XEN
+	case KVM_IRQ_ROUTING_XEN_EVTCHN:
+		return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54452269a4ff..0a689bb62e9e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4147,7 +4147,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_XEN_HVM:
 		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
-		    KVM_XEN_HVM_CONFIG_SHARED_INFO;
+		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
+		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
 		if (sched_info_on())
 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
 		break;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index dcd88f1092d2..6d44e3cf7a7a 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -16,6 +16,7 @@
 #include <trace/events/kvm.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/event_channel.h>
 
 #include "trace.h"
 
@@ -195,6 +196,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
+	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+	bool atomic = in_atomic() || !task_is_running(current);
 	int err;
 	u8 rc = 0;
 
@@ -204,6 +207,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 */
 	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
 	struct kvm_memslots *slots = kvm_memslots(v->kvm);
+	bool ghc_valid = slots->generation == ghc->generation &&
+		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
 	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
@@ -219,8 +225,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * cache in kvm_read_guest_offset_cached(), but just uses
 	 * __get_user() instead. And falls back to the slow path.
 	 */
-	if (likely(slots->generation == ghc->generation &&
-		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+	if (!evtchn_pending_sel && ghc_valid) {
 		/* Fast path */
 		pagefault_disable();
 		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -239,11 +244,82 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	 * and we'll end up getting called again from a context where we *can*
 	 * fault in the page and wait for it.
 	 */
-	if (in_atomic() || !task_is_running(current))
+	if (atomic)
 		return 1;
 
-	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
-				     sizeof(rc));
+	if (!ghc_valid) {
+		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+		if (err || !ghc->memslot) {
+			/*
+			 * If this failed, userspace has screwed up the
+			 * vcpu_info mapping. No interrupts for you.
+			 */
+			return 0;
+		}
+	}
+
+	/*
+	 * Now we have a valid (protected by srcu) userspace HVA in
+	 * ghc->hva which points to the struct vcpu_info. If there
+	 * are any bits in the in-kernel evtchn_pending_sel then
+	 * we need to write those to the guest vcpu_info and set
+	 * its evtchn_upcall_pending flag. If there aren't any bits
+	 * to add, we only want to *check* evtchn_upcall_pending.
+	 */
+	if (evtchn_pending_sel) {
+		bool long_mode = v->kvm->arch.xen.long_mode;
+
+		if (!user_access_begin((void *)ghc->hva, sizeof(struct vcpu_info)))
+			return 0;
+
+		if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
+			struct vcpu_info __user *vi = (void *)ghc->hva;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+				     "\tnotq %0\n"
+				     "\t" LOCK_PREFIX "andq %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel));
+		} else {
+			struct compat_vcpu_info __user *vi = (void *)ghc->hva;
+			u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+			/* Attempt to set the evtchn_pending_sel bits in the
+			 * guest, and if that succeeds then clear the same
+			 * bits in the in-kernel version. */
+			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+				     "\tnotl %0\n"
+				     "\t" LOCK_PREFIX "andl %0, %2\n"
+				     "2:\n"
+				     "\t.section .fixup,\"ax\"\n"
+				     "3:\tjmp\t2b\n"
+				     "\t.previous\n"
+				     _ASM_EXTABLE_UA(1b, 3b)
+				     : "=r" (evtchn_pending_sel32)
+				     : "m" (vi->evtchn_pending_sel),
+				       "m" (v->arch.xen.evtchn_pending_sel),
+				       "0" (evtchn_pending_sel32));
+		}
+		rc = 1;
+		unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
+
+	err:
+		user_access_end();
+
+		mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+	} else {
+		__get_user(rc, (u8 __user *)ghc->hva + offset);
+	}
 
 	return rc;
 }
@@ -740,3 +816,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
 
 	return 0;
 }
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+		return EVTCHN_2L_NR_CHANNELS;
+	else
+		return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+/*
+ * This follows the kvm_set_irq() API, so it returns:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+			    struct kvm *kvm)
+{
+	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+	struct kvm_vcpu *vcpu;
+	unsigned long *pending_bits, *mask_bits;
+	unsigned long flags;
+	int port_word_bit;
+	bool kick_vcpu = false;
+	int idx;
+	int rc;
+
+	vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+	if (!vcpu)
+		return -1;
+
+	if (!vcpu->arch.xen.vcpu_info_set)
+		return -1;
+
+	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+		return -1;
+
+	rc = -EWOULDBLOCK;
+	read_lock_irqsave(&gpc->lock, flags);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+		goto out_rcu;
+
+	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+		struct shared_info *shinfo = gpc->khva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 64;
+	} else {
+		struct compat_shared_info *shinfo = gpc->khva;
+		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+		mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+		port_word_bit = e->xen_evtchn.port / 32;
+	}
+
+	/*
+	 * If this port wasn't already set, and if it isn't masked, then
+	 * we try to set the corresponding bit in the in-kernel shadow of
+	 * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+	 * already set, then we kick the vCPU in question to write to the
+	 * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+	 */
+	if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+		rc = 0; /* It was already raised */
+	} else if (test_bit(e->xen_evtchn.port, mask_bits)) {
+		rc = -1; /* Masked */
+	} else {
+		rc = 1; /* Delivered. But was the vCPU waking already? */
+		if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+			kick_vcpu = true;
+	}
+
+ out_rcu:
+	srcu_read_unlock(&kvm->srcu, idx);
+	read_unlock_irqrestore(&gpc->lock, flags);
+
+	if (kick_vcpu) {
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+			 int irq_source_id, int level, bool line_status)
+{
+	bool mm_borrowed = false;
+	int rc;
+
+	if (!level)
+		return -1;
+
+	rc = kvm_xen_set_evtchn_fast(e, kvm);
+	if (rc != -EWOULDBLOCK)
+		return rc;
+
+	if (current->mm != kvm->mm) {
+		/*
+		 * If not on a thread which already belongs to this KVM,
+		 * we'd better be in the irqfd workqueue.
+		 */
+		if (WARN_ON_ONCE(current->mm))
+			return -EINVAL;
+
+		kthread_use_mm(kvm->mm);
+		mm_borrowed = true;
+	}
+
+	/*
+	 * For the irqfd workqueue, using the main kvm->lock mutex is
+	 * fine since this function is invoked from kvm_set_irq() with
+	 * no other lock held, no srcu. In future, if it is called
+	 * directly from a vCPU thread (e.g. on hypercall for an IPI)
+	 * then it may need to switch to using a leaf-node mutex for
+	 * serializing the shared_info mapping.
+	 */
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * It is theoretically possible for the page to be unmapped
+	 * and the MMU notifier to invalidate the shared_info before
+	 * we even get to use it. In that case, this looks like an
+	 * infinite loop. It was tempting to do it via the userspace
+	 * HVA instead... but that just *hides* the fact that it's
+	 * an infinite loop, because if a fault occurs and it waits
+	 * for the page to come back, it can *still* immediately
+	 * fault and have to wait again, repeatedly.
+	 *
+	 * Conversely, the page could also have been reinstated by
+	 * another thread before we even obtain the mutex above, so
+	 * check again *first* before remapping it.
+	 */
+	do {
+		struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+		int idx;
+
+		rc = kvm_xen_set_evtchn_fast(e, kvm);
+		if (rc != -EWOULDBLOCK)
+			break;
+
+		idx = srcu_read_lock(&kvm->srcu);
+		rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
+						  PAGE_SIZE, true);
+		srcu_read_unlock(&kvm->srcu, idx);
+	} while(!rc);
+
+	mutex_unlock(&kvm->lock);
+
+	if (mm_borrowed)
+		kthread_unuse_mm(kvm->mm);
+
+	return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue)
+
+{
+	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+		return -EINVAL;
+
+	/* We only support 2 level event channels for now */
+	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+		return -EINVAL;
+
+	e->xen_evtchn.port = ue->u.xen_evtchn.port;
+	e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+	e->set = evtchn_set_fn;
+
+	return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..adbcc9ed59db 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
 void kvm_xen_init_vm(struct kvm *kvm);
 void kvm_xen_destroy_vm(struct kvm *kvm);
 
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+			    struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+			 struct kvm_kernel_irq_routing_entry *e,
+			 const struct kvm_irq_routing_entry *ue);
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -134,6 +140,9 @@ struct compat_shared_info {
 	struct compat_arch_shared_info arch;
 };
 
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 *				\
+				      sizeof_field(struct compat_shared_info, \
+						   evtchn_pending))
 struct compat_vcpu_runstate_info {
     int state;
     uint64_t state_entry_time;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 762bf2586feb..d1187b051203 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -469,6 +469,12 @@ struct kvm_hv_sint {
 	u32 sint;
 };
 
+struct kvm_xen_evtchn {
+	u32 port;
+	u32 vcpu;
+	u32 priority;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -489,6 +495,7 @@ struct kvm_kernel_irq_routing_entry {
 		} msi;
 		struct kvm_s390_adapter_int adapter;
 		struct kvm_hv_sint hv_sint;
+		struct kvm_xen_evtchn xen_evtchn;
 	};
 	struct hlist_node link;
 };
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 1daa45268de2..12421e76adcb 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1162,11 +1162,20 @@ struct kvm_irq_routing_hv_sint {
 	__u32 sint;
 };
 
+struct kvm_irq_routing_xen_evtchn {
+	__u32 port;
+	__u32 vcpu;
+	__u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
 #define KVM_IRQ_ROUTING_S390_ADAPTER 3
 #define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -1178,6 +1187,7 @@ struct kvm_irq_routing_entry {
 		struct kvm_irq_routing_msi msi;
 		struct kvm_irq_routing_s390_adapter adapter;
 		struct kvm_irq_routing_hv_sint hv_sint;
+		struct kvm_irq_routing_xen_evtchn xen_evtchn;
 		__u32 pad[8];
 	} u;
 };
@@ -1208,6 +1218,7 @@ struct kvm_x86_mce {
 #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
 #define KVM_XEN_HVM_CONFIG_SHARED_INFO		(1 << 2)
 #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
 
 struct kvm_xen_hvm_config {
 	__u32 flags;
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
index a0699f00b3d6..a865e60a042c 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -14,6 +14,9 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <signal.h>
+
+#include <sys/eventfd.h>
 
 #define VCPU_ID		5
 
@@ -22,10 +25,12 @@
 #define SHINFO_REGION_SLOT	10
 #define PAGE_SIZE		4096
 
+#define SHINFO_ADDR	(SHINFO_REGION_GPA)
 #define PVTIME_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE)
 #define RUNSTATE_ADDR	(SHINFO_REGION_GPA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_ADDR	(SHINFO_REGION_GPA + 0x40)
 
+#define SHINFO_VADDR	(SHINFO_REGION_GVA)
 #define RUNSTATE_VADDR	(SHINFO_REGION_GVA + PAGE_SIZE + 0x20)
 #define VCPU_INFO_VADDR	(SHINFO_REGION_GVA + 0x40)
 
@@ -73,15 +78,30 @@ struct vcpu_info {
         struct pvclock_vcpu_time_info time;
 }; /* 64 bytes (x86) */
 
+struct shared_info {
+	struct vcpu_info vcpu_info[32];
+	unsigned long evtchn_pending[64];
+	unsigned long evtchn_mask[64];
+	struct pvclock_wall_clock wc;
+	uint32_t wc_sec_hi;
+	/* arch_shared_info here */
+};
+
 #define RUNSTATE_running  0
 #define RUNSTATE_runnable 1
 #define RUNSTATE_blocked  2
 #define RUNSTATE_offline  3
 
+struct {
+	struct kvm_irq_routing info;
+	struct kvm_irq_routing_entry entries[2];
+} irq_routes;
+
 static void evtchn_handler(struct ex_regs *regs)
 {
 	struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
 	vi->evtchn_upcall_pending = 0;
+	vi->evtchn_pending_sel = 0;
 
 	GUEST_SYNC(0x20);
 }
@@ -127,7 +147,19 @@ static void guest_code(void)
 	GUEST_SYNC(6);
 	GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
 
-	GUEST_DONE();
+	/* Attempt to deliver a *masked* interrupt */
+	GUEST_SYNC(7);
+
+	/* Wait until we see the bit set */
+	struct shared_info *si = (void *)SHINFO_VADDR;
+	while (!si->evtchn_pending[0])
+		__asm__ __volatile__ ("rep nop" : : : "memory");
+
+	/* Now deliver an *unmasked* interrupt */
+	GUEST_SYNC(8);
+
+	for (;;)
+		__asm__ __volatile__ ("rep nop" : : : "memory");
 }
 
 static int cmp_timespec(struct timespec *a, struct timespec *b)
@@ -144,6 +176,11 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
 		return 0;
 }
 
+static void handle_alrm(int sig)
+{
+	TEST_FAIL("IRQ delivery timed out");
+}
+
 int main(int argc, char *argv[])
 {
 	struct timespec min_ts, max_ts, vm_ts;
@@ -155,6 +192,7 @@ int main(int argc, char *argv[])
 	}
 
 	bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+	bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
 
 	clock_gettime(CLOCK_REALTIME, &min_ts);
 
@@ -166,6 +204,11 @@ int main(int argc, char *argv[])
 				    SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 2, 0);
 	virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 2);
 
+	struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+
+	int zero_fd = open("/dev/zero", O_RDONLY);
+	TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
+
 	struct kvm_xen_hvm_config hvmc = {
 		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
 		.msr = XEN_HYPERCALL_MSR,
@@ -184,6 +227,16 @@ int main(int argc, char *argv[])
 	};
 	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
 
+	/*
+	 * Test what happens when the HVA of the shinfo page is remapped after
+	 * the kernel has a reference to it. But make sure we copy the clock
+	 * info over since that's only set at setup time, and we test it later.
+	 */
+	struct pvclock_wall_clock wc_copy = shinfo->wc;
+	void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+	TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
+	shinfo->wc = wc_copy;
+
 	struct kvm_xen_vcpu_attr vi = {
 		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
 		.u.gpa = VCPU_INFO_ADDR,
@@ -214,6 +267,49 @@ int main(int argc, char *argv[])
 		vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st);
 	}
 
+	int irq_fd[2] = { -1, -1 };
+
+	if (do_eventfd_tests) {
+		irq_fd[0] = eventfd(0, 0);
+		irq_fd[1] = eventfd(0, 0);
+
+		/* Unexpected, but not a KVM failure */
+		if (irq_fd[0] == -1 || irq_fd[1] == -1)
+			do_eventfd_tests = false;
+	}
+
+	if (do_eventfd_tests) {
+		irq_routes.info.nr = 2;
+
+		irq_routes.entries[0].gsi = 32;
+		irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[0].u.xen_evtchn.port = 15;
+		irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		irq_routes.entries[1].gsi = 33;
+		irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+		irq_routes.entries[1].u.xen_evtchn.port = 66;
+		irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
+		irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes);
+
+		struct kvm_irqfd ifd = { };
+
+		ifd.fd = irq_fd[0];
+		ifd.gsi = 32;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		ifd.fd = irq_fd[1];
+		ifd.gsi = 33;
+		vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+		struct sigaction sa = { };
+		sa.sa_handler = handle_alrm;
+		sigaction(SIGALRM, &sa, NULL);
+	}
+
 	struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
 	vinfo->evtchn_upcall_pending = 0;
 
@@ -289,9 +385,23 @@ int main(int argc, char *argv[])
 					sched_yield();
 				} while (get_run_delay() < rundelay);
 				break;
+			case 7:
+				if (!do_eventfd_tests)
+					goto done;
+				shinfo->evtchn_mask[0] = 0x8000;
+				eventfd_write(irq_fd[0], 1UL);
+				alarm(1);
+				break;
+			case 8:
+				eventfd_write(irq_fd[1], 1UL);
+				evtchn_irq_expected = true;
+				break;
+
 			case 0x20:
 				TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
 				evtchn_irq_expected = false;
+				if (shinfo->evtchn_pending[1])
+					goto done;
 				break;
 			}
 			break;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
  2021-11-15 16:50                                 ` [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery David Woodhouse
@ 2021-11-15 17:02                                   ` David Woodhouse
  2021-11-15 18:49                                   ` Paolo Bonzini
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 17:02 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

[-- Attachment #1: Type: text/plain, Size: 3837 bytes --]

On Mon, 2021-11-15 at 16:50 +0000, David Woodhouse wrote:
> +       /*
> +        * Now we have a valid (protected by srcu) userspace HVA in
> +        * ghc->hva which points to the struct vcpu_info. If there
> +        * are any bits in the in-kernel evtchn_pending_sel then
> +        * we need to write those to the guest vcpu_info and set
> +        * its evtchn_upcall_pending flag. If there aren't any bits
> +        * to add, we only want to *check* evtchn_upcall_pending.
> +        */
> +       if (evtchn_pending_sel) {
> +               bool long_mode = v->kvm->arch.xen.long_mode;
> +
> +               if (!user_access_begin((void *)ghc->hva, sizeof(struct vcpu_info)))
> +                       return 0;
> +
> +               if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
> +                       struct vcpu_info __user *vi = (void *)ghc->hva;
> +
> +                       /* Attempt to set the evtchn_pending_sel bits in the
> +                        * guest, and if that succeeds then clear the same
> +                        * bits in the in-kernel version. */
> +                       asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
> +                                    "\tnotq %0\n"
> +                                    "\t" LOCK_PREFIX "andq %0, %2\n"
> +                                    "2:\n"
> +                                    "\t.section .fixup,\"ax\"\n"
> +                                    "3:\tjmp\t2b\n"
> +                                    "\t.previous\n"
> +                                    _ASM_EXTABLE_UA(1b, 3b)
> +                                    : "=r" (evtchn_pending_sel)
> +                                    : "m" (vi->evtchn_pending_sel),
> +                                      "m" (v->arch.xen.evtchn_pending_sel),
> +                                      "0" (evtchn_pending_sel));
> +               } else {
> +                       struct compat_vcpu_info __user *vi = (void *)ghc->hva;
> +                       u32 evtchn_pending_sel32 = evtchn_pending_sel;
> +
> +                       /* Attempt to set the evtchn_pending_sel bits in the
> +                        * guest, and if that succeeds then clear the same
> +                        * bits in the in-kernel version. */
> +                       asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
> +                                    "\tnotl %0\n"
> +                                    "\t" LOCK_PREFIX "andl %0, %2\n"
> +                                    "2:\n"
> +                                    "\t.section .fixup,\"ax\"\n"
> +                                    "3:\tjmp\t2b\n"
> +                                    "\t.previous\n"
> +                                    _ASM_EXTABLE_UA(1b, 3b)
> +                                    : "=r" (evtchn_pending_sel32)
> +                                    : "m" (vi->evtchn_pending_sel),
> +                                      "m" (v->arch.xen.evtchn_pending_sel),
> +                                      "0" (evtchn_pending_sel32));
> +               }
> +               rc = 1;
> +               unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
> +
> +       err:
> +               user_access_end();
> +
> +               mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
> +       } else {
> +               __get_user(rc, (u8 __user *)ghc->hva + offset);
> +       }

I will concede that my assertion that "it already has a perfectly
serviceable userspace HVA and it's just a matter of writing a trivial
bit of inline asm" is probably stretching the definition of the word
"trivial" a little bit.

I can convert this bit to use a gfn_to_pfn_cache for the vcpu_info too,
once the dust settles on the implementation of that.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
  2021-11-15 16:50                                 ` [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery David Woodhouse
  2021-11-15 17:02                                   ` David Woodhouse
@ 2021-11-15 18:49                                   ` Paolo Bonzini
  2021-11-15 18:55                                     ` David Woodhouse
  1 sibling, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-15 18:49 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson@google.com,
	wanpengli@tencent.com, seanjc@google.com,
	vkuznets@redhat.com, mtosatti@redhat.com,
	joro@8bytes.org, karahmed

On 11/15/21 17:50, David Woodhouse wrote:
> +			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
> +				     "\tnotq %0\n"
> +				     "\t" LOCK_PREFIX "andq %0, %2\n"
> +				     "2:\n"
> +				     "\t.section .fixup,\"ax\"\n"
> +				     "3:\tjmp\t2b\n"
> +				     "\t.previous\n"
> +				     _ASM_EXTABLE_UA(1b, 3b)
> +				     : "=r" (evtchn_pending_sel)
> +				     : "m" (vi->evtchn_pending_sel),
> +				       "m" (v->arch.xen.evtchn_pending_sel),

These need to be "+m", I think?

And same for st->preempted actually.
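
Concretely, for the 64-bit case that would look something like this (just
a sketch of the constraint change, with the .fixup/extable plumbing
elided):

	asm volatile(LOCK_PREFIX "orq %0, %1\n\t"
		     "notq %0\n\t"
		     LOCK_PREFIX "andq %0, %2"
		     : "+r" (evtchn_pending_sel),
		       "+m" (vi->evtchn_pending_sel),
		       "+m" (v->arch.xen.evtchn_pending_sel));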

Paolo

> +				       "0" (evtchn_pending_sel));


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 16:47                             ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
  2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
@ 2021-11-15 18:50                               ` Paolo Bonzini
  2021-11-15 19:11                                 ` David Woodhouse
  2021-11-15 21:38                                 ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
  1 sibling, 2 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-15 18:50 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/15/21 17:47, David Woodhouse wrote:
> So... a user of this must check the validity after setting its mode to
> IN_GUEST_MODE, and the invalidation must make a request and wake any
> vCPU(s) which might be using it.

Yes, though the check is implicit in the existing call to 
kvm_vcpu_exit_request(vcpu).
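
Roughly, the relevant part of vcpu_enter_guest() is (heavily simplified,
from memory):

	vcpu->mode = IN_GUEST_MODE;
	smp_mb__after_srcu_read_unlock();

	/*
	 * A racing invalidator either made its request before this point,
	 * in which case it is seen here, or it sees IN_GUEST_MODE and
	 * kicks the vCPU out.
	 */
	if (kvm_vcpu_exit_request(vcpu)) {
		vcpu->mode = OUTSIDE_GUEST_MODE;
		/* bail out, handle requests, re-enter */
	}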

> I moved the invalidation to the invalidate_range MMU notifier, as
> discussed. But that's where the plan falls down a little bit because
> IIUC, that one can't sleep at all.

Which is a problem in the existing code, too.  It hasn't broken yet 
because invalidate_range() is _usually_ called with no spinlocks taken 
(the only caller that does call with a spinlock taken seems to be 
hugetlb_cow).

Once the dust settles, we need to add non_block_start/end around calls 
to ops->invalidate_range.

> I need to move it *back*  to
> invalidate_range_start() where I had it before, if I want to let it
> wait for vCPUs to exit. Which means... that the cache 'refresh' call
> must wait until the mmu_notifier_count reaches zero? Am I allowed to do
> that, and make the "There can be only one waiter" comment in
> kvm_mmu_notifier_invalidate_range_end() no longer true?

You can also update the cache while taking the mmu_lock for read, and 
retry if mmu_notifier_retry_hva tells you to do so.  Looking at the 
scenario from commit e649b3f0188 you would have:

       (Invalidator) kvm_mmu_notifier_invalidate_range_start()
       (Invalidator) write_lock(mmu_lock)
       (Invalidator) increment mmu_notifier_count
       (Invalidator) write_unlock(mmu_lock)
       (Invalidator) request KVM_REQ_APIC_PAGE_RELOAD
       (KVM VCPU) vcpu_enter_guest()
       (KVM VCPU) kvm_vcpu_reload_apic_access_page()
    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)
       (Invalidator) actually unmap page
    +  (Invalidator) kvm_mmu_notifier_invalidate_range_end()
    +  (Invalidator) write_lock(mmu_lock)
    +  (Invalidator) decrement mmu_notifier_count
    +  (Invalidator) write_unlock(mmu_lock)
    +  (KVM VCPU) vcpu_enter_guest()
    +  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
    +  (KVM VCPU) mmu_notifier_retry_hva()

Changing mn_memslots_update_rcuwait to a waitq (and renaming it to 
mn_invalidate_waitq) is of course also a possibility.
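
In code, the retry loop for the cache refresh would be roughly (sketch
only, reusing the existing hva_to_pfn() and mmu_notifier_retry_hva()
helpers):

	do {
		mmu_seq = kvm->mmu_notifier_seq;
		smp_rmb();

		new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
		if (is_error_noslot_pfn(new_pfn))
			return -EFAULT;

		read_lock(&kvm->mmu_lock);
		retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva);
		read_unlock(&kvm->mmu_lock);

		if (retry)
			kvm_release_pfn_clean(new_pfn);
	} while (retry);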

Also, for the small requests: since you are at it, can you add the code 
in a new file under virt/kvm/?

Paolo

> I was also pondering whether to introduce a new arch-independent
> KVM_REQ_GPC_INVALIDATE, or let it be arch-dependent and make it a field
> of the cache, so that users can raise whatever requests they like?


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
  2021-11-15 18:49                                   ` Paolo Bonzini
@ 2021-11-15 18:55                                     ` David Woodhouse
  0 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 18:55 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

[-- Attachment #1: Type: text/plain, Size: 773 bytes --]

On Mon, 2021-11-15 at 19:49 +0100, Paolo Bonzini wrote:
> On 11/15/21 17:50, David Woodhouse wrote:
> > +			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
> > +				     "\tnotq %0\n"
> > +				     "\t" LOCK_PREFIX "andq %0, %2\n"
> > +				     "2:\n"
> > +				     "\t.section .fixup,\"ax\"\n"
> > +				     "3:\tjmp\t2b\n"
> > +				     "\t.previous\n"
> > +				     _ASM_EXTABLE_UA(1b, 3b)
> > +				     : "=r" (evtchn_pending_sel)
> > +				     : "m" (vi->evtchn_pending_sel),
> > +				       "m" (v->arch.xen.evtchn_pending_sel),
> 
> These need to be "+m", I think?
> 
> And same for st->preempted actually

Ack. Although as noted, I think I'll switch to a gfn_to_pfn_cache for
the vcpu_info too, once the dust has settled on the implementation.
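
Concretely, that would make the constraint list something like this
(untested sketch; only the constraints change):

	asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
		     "\tnotq %0\n"
		     "\t" LOCK_PREFIX "andq %0, %2\n"
		     "2:\n"
		     "\t.section .fixup,\"ax\"\n"
		     "3:\tjmp\t2b\n"
		     "\t.previous\n"
		     _ASM_EXTABLE_UA(1b, 3b)
		     : "=r" (evtchn_pending_sel),
		       "+m" (vi->evtchn_pending_sel),
		       "+m" (v->arch.xen.evtchn_pending_sel)
		     : "0" (evtchn_pending_sel));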


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 18:50                               ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache Paolo Bonzini
@ 2021-11-15 19:11                                 ` David Woodhouse
  2021-11-15 19:26                                   ` Paolo Bonzini
  2021-11-15 21:38                                 ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
  1 sibling, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 19:11 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 4619 bytes --]

On Mon, 2021-11-15 at 19:50 +0100, Paolo Bonzini wrote:
> On 11/15/21 17:47, David Woodhouse wrote:
> > So... a user of this must check the validity after setting its mode to
> > IN_GUEST_MODE, and the invalidation must make a request and wake any
> > vCPU(s) which might be using it.
> 
> Yes, though the check is implicit in the existing call to 
> kvm_vcpu_exit_request(vcpu).

Right, though *handling* the request isn't (and I'm still not sure
whether to use a single new KVM_REQ_GPC_INVALIDATE or let the user of
the cache specify the req to use).

I don't really want generic code refreshing these caches even when they
aren't going to be used (e.g. vmcs02 for a vCPU that isn't even in L2
guest mode right now).

> > I moved the invalidation to the invalidate_range MMU notifier, as
> > discussed. But that's where the plan falls down a little bit because
> > IIUC, that one can't sleep at all.
> 
> Which is a problem in the existing code, too.  It hasn't broken yet 
> because invalidate_range() is _usually_ called with no spinlocks taken 
> (the only caller that does call with a spinlock taken seems to be 
> hugetlb_cow).
> 
> Once the dust settles, we need to add non_block_start/end around calls 
> to ops->invalidate_range.
> 
> > I need to move it *back*  to
> > invalidate_range_start() where I had it before, if I want to let it
> > wait for vCPUs to exit. Which means... that the cache 'refresh' call
> > must wait until the mmu_notifier_count reaches zero? Am I allowed to do
> > that, and make the "There can be only one waiter" comment in
> > kvm_mmu_notifier_invalidate_range_end() no longer true?
> 
> You can also update the cache while taking the mmu_lock for read, and 
> retry if mmu_notifier_retry_hva tells you to do so.  Looking at the 
> scenario from commit e649b3f0188 you would have:
> 
>        (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>        (Invalidator) write_lock(mmu_lock)
>        (Invalidator) increment mmu_notifier_count
>        (Invalidator) write_unlock(mmu_lock)
>        (Invalidator) request KVM_REQ_APIC_PAGE_RELOAD
>        (KVM VCPU) vcpu_enter_guest()
>        (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>     +  (KVM VCPU) read_lock(mmu_lock)
>     +  (KVM VCPU) mmu_notifier_retry_hva()
>     +  (KVM VCPU) read_unlock(mmu_lock)
>     +  (KVM VCPU) retry! (mmu_notifier_count>1)


But unless we do start using a waitq, it can just spin and spin and
spin here, can't it?

    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)

    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)

    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)

>        (Invalidator) actually unmap page

    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)

    +  (KVM VCPU) read_lock(mmu_lock)
    +  (KVM VCPU) mmu_notifier_retry_hva()
    +  (KVM VCPU) read_unlock(mmu_lock)
    +  (KVM VCPU) retry! (mmu_notifier_count>1)

>     +  (Invalidator) kvm_mmu_notifier_invalidate_range_end()
>     +  (Invalidator) write_lock(mmu_lock)
>     +  (Invalidator) decrement mmu_notifier_count
>     +  (Invalidator) write_unlock(mmu_lock)
>     +  (KVM VCPU) vcpu_enter_guest()
>     +  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>     +  (KVM VCPU) mmu_notifier_retry_hva()
> 
> Changing mn_memslots_update_rcuwait to a waitq (and renaming it to 
> mn_invalidate_waitq) is of course also a possibility.

I suspect that's the answer.

I think the actual *invalidation* of the cache still lives in the
invalidate_range() callback where I have it at the moment. But making
the req to the affected vCPUs can live in invalidate_range_start(). And
then the code which *handles* that req can wait for the
mmu_notifier_count to reach zero before it proceeds. Atomic users of
the cache (like the Xen event channel code) don't have to get involved
with that.
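
Something like this in the request handler, perhaps (sketch only;
mn_invalidate_waitq is the rename suggested above and doesn't exist yet):

	/* Wait for any in-flight invalidation to finish... */
	wait_event(kvm->mn_invalidate_waitq,
		   READ_ONCE(kvm->mn_active_invalidate_count) == 0);

	/* ...then revalidate/refresh the gfn_to_pfn_cache as usual. */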

> Also, for the small requests: since you are at it, can you add the code 
> in a new file under virt/kvm/?

Hm... only if I can make hva_to_pfn() and probably a handful of other
things non-static?



[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 19:11                                 ` David Woodhouse
@ 2021-11-15 19:26                                   ` Paolo Bonzini
  2021-11-15 22:59                                     ` Sean Christopherson
  2021-11-16 11:50                                     ` [PATCH 0/7] KVM: Add Makefile.kvm for common files David Woodhouse
  0 siblings, 2 replies; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-15 19:26 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

On 11/15/21 20:11, David Woodhouse wrote:
>> Changing mn_memslots_update_rcuwait to a waitq (and renaming it to
>> mn_invalidate_waitq) is of course also a possibility.
> I suspect that's the answer.
> 
> I think the actual *invalidation* of the cache still lives in the
> invalidate_range() callback where I have it at the moment. But making
> the req to the affected vCPUs can live in invalidate_range_start(). And
> then the code which *handles* that req can wait for the
> mmu_notifier_count to reach zero before it proceeds. Atomic users of
> the cache (like the Xen event channel code) don't have to get involved
> with that.
> 
>> Also, for the small requests: since you are at it, can you add the code
>> in a new file under virt/kvm/?
>
> Hm... only if I can make hva_to_pfn() and probably a handful of other
> things non-static?

Yes, I think sooner or later we also want all pfn stuff in one file 
(together with MMU notifiers) and all hva stuff in another; so for now 
you can create virt/kvm/hva_to_pfn.h, or virt/kvm/mm.h, or whatever 
color of the bikeshed you prefer.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 18:50                               ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache Paolo Bonzini
  2021-11-15 19:11                                 ` David Woodhouse
@ 2021-11-15 21:38                                 ` David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 21:38 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 5460 bytes --]

On Mon, 2021-11-15 at 19:50 +0100, Paolo Bonzini wrote:
> On 11/15/21 17:47, David Woodhouse wrote:
> > I need to move it *back*  to
> > invalidate_range_start() where I had it before, if I want to let it
> > wait for vCPUs to exit. Which means... that the cache 'refresh' call
> > must wait until the mmu_notifier_count reaches zero? Am I allowed to do
> > that, and make the "There can be only one waiter" comment in
> > kvm_mmu_notifier_invalidate_range_end() no longer true?
> 
> You can also update the cache while taking the mmu_lock for read, and 
> retry if mmu_notifier_retry_hva tells you to do so.  Looking at the 
> scenario from commit e649b3f0188 you would have:
> 
>        (Invalidator) kvm_mmu_notifier_invalidate_range_start()
>        (Invalidator) write_lock(mmu_lock)
>        (Invalidator) increment mmu_notifier_count
>        (Invalidator) write_unlock(mmu_lock)
>        (Invalidator) request KVM_REQ_APIC_PAGE_RELOAD
>        (KVM VCPU) vcpu_enter_guest()
>        (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>     +  (KVM VCPU) read_lock(mmu_lock)
>     +  (KVM VCPU) mmu_notifier_retry_hva()
>     +  (KVM VCPU) read_unlock(mmu_lock)
>     +  (KVM VCPU) retry! (mmu_notifier_count>1)
>        (Invalidator) actually unmap page
>     +  (Invalidator) kvm_mmu_notifier_invalidate_range_end()
>     +  (Invalidator) write_lock(mmu_lock)
>     +  (Invalidator) decrement mmu_notifier_count
>     +  (Invalidator) write_unlock(mmu_lock)
>     +  (KVM VCPU) vcpu_enter_guest()
>     +  (KVM VCPU) kvm_vcpu_reload_apic_access_page()
>     +  (KVM VCPU) mmu_notifier_retry_hva()
> 
> Changing mn_memslots_update_rcuwait to a waitq (and renaming it to 
> mn_invalidate_waitq) is of course also a possibility.

I do think I'll go for a waitq but let's start *really* simple to make
sure I've got the basics right.... does this look vaguely sensible?

It returns -EAGAIN and lets the caller retry; I started with a 'goto'
but didn't have a sane exit condition. In fact, I *still* don't have a
sane exit condition for callers like evtchn_set_fn().

I'm actually tempted to split the caches into two lists
(kvm->guest_gpc_list, kvm->atomic_gpc_list) and invalidate only the
*former* from invalidate_range_start(), with these -EAGAIN semantics.
The atomic ones can stay precisely as they were in the series I already
sent since there's no need for them ever to have to spin/wait as long
as they're invalidated in the invalidate_range() MMU notifier.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d1187b051203..2d76c09e460c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -151,6 +151,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
 #define KVM_REQ_VM_DEAD           (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GPC_INVALIDATE    (5 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7382aa45d5e8..9bc3162ba650 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -468,8 +468,6 @@ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
 	int idx;
 
-	gfn_to_pfn_cache_invalidate(kvm, start, end, false);
-
 	idx = srcu_read_lock(&kvm->srcu);
 	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
 	srcu_read_unlock(&kvm->srcu, idx);
@@ -689,6 +687,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mn_active_invalidate_count++;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end, hva_range.may_block);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -2674,7 +2674,6 @@ static void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
 	}
 	spin_unlock(&kvm->gpc_lock);
 
-#if 0
 	unsigned int req = KVM_REQ_GPC_INVALIDATE;
 
 	/*
@@ -2690,7 +2689,6 @@ static void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
 	} else if (wake_vcpus) {
 		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
 	}
-#endif
 	WARN_ON_ONCE(called && !may_block);
 }
 
@@ -2767,6 +2765,8 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 	if (!old_valid || old_uhva != gpc->uhva) {
 		unsigned long uhva = gpc->uhva;
 		void *new_khva = NULL;
+		unsigned long mmu_seq;
+		int retry;
 
 		/* Placeholders for "hva is valid but not yet mapped" */
 		gpc->pfn = KVM_PFN_ERR_FAULT;
@@ -2775,10 +2775,20 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
 
 		write_unlock_irq(&gpc->lock);
 
+		mmu_seq = kvm->mmu_notifier_seq;
+		smp_rmb();
+
 		new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
 		if (is_error_noslot_pfn(new_pfn))
 			ret = -EFAULT;
-		else if (gpc->kernel_map) {
+		else {
+			read_lock(&kvm->mmu_lock);
+			retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva);
+			read_unlock(&kvm->mmu_lock);
+			if (retry)
+				ret = -EAGAIN; // or goto the mmu_seq setting bit to retry?
+		}
+		if (!ret && gpc->kernel_map) {
 			if (new_pfn == old_pfn) {
 				new_khva = (void *)((unsigned long)old_khva - page_offset);
 				old_pfn = KVM_PFN_ERR_FAULT;

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 19:26                                   ` Paolo Bonzini
@ 2021-11-15 22:59                                     ` Sean Christopherson
  2021-11-15 23:22                                       ` David Woodhouse
  2021-11-15 23:24                                       ` David Woodhouse
  2021-11-16 11:50                                     ` [PATCH 0/7] KVM: Add Makefile.kvm for common files David Woodhouse
  1 sibling, 2 replies; 70+ messages in thread
From: Sean Christopherson @ 2021-11-15 22:59 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: David Woodhouse, kvm, Boris Ostrovsky, Joao Martins, jmattson,
	wanpengli, vkuznets, mtosatti, joro, karahmed

On Mon, Nov 15, 2021, Paolo Bonzini wrote:
> On 11/15/21 20:11, David Woodhouse wrote:
> > > Changing mn_memslots_update_rcuwait to a waitq (and renaming it to
> > > mn_invalidate_waitq) is of course also a possibility.
> > I suspect that's the answer.
> > 
> > I think the actual *invalidation* of the cache still lives in the
> > invalidate_range() callback where I have it at the moment.

Oooh!  [finally had a lightbulb moment about ->invalidate_range() after years of
befuddlement].

Two things:

  1. Using _only_ ->invalidate_range() is not correct.  ->invalidate_range() is
     required if and only if the old PFN needs to be _unmapped_.  Specifically,
     if the protections are being downgraded without changing the PFN, it doesn't
     need to be called.  E.g. from hugetlb_change_protection():

	/*
	 * No need to call mmu_notifier_invalidate_range() we are downgrading
	 * page table protection not changing it to point to a new page.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */

     x86's kvm_arch_mmu_notifier_invalidate_range() is a special snowflake because
     the APIC access page's VMA is controlled by KVM, i.e. is never downgraded, the
     only thing KVM cares about is if the PFN is changed, because that's the only
     thing that can change.

     In this case, if an HVA is downgraded from RW=>R, KVM may not invalidate the
     cache and end up writing to memory that is supposed to be read-only.

     I believe we could use ->invalidate_range() to handle the unmap case if KVM's
     ->invalidate_range_start() hook is enhanced to handle the RW=>R case.  The
     "struct mmu_notifier_range" provides the event type, IIUC we could have the
     _start() variant handle MMU_NOTIFY_PROTECTION_{VMA,PAGE} (and maybe
     MMU_NOTIFY_SOFT_DIRTY?), and let the more precise unmap-only variant handle
     everything else.

  2. If we do split the logic across the two hooks, we should (a) do it in a separate
     series and (b) make the logic common to the gfn_to_pfn cache and to the standard
     kvm_unmap_gfn_range().  That would in theory shave a bit of time off walking
     gfn ranges (maybe even moreso with the scalable memslots implementation?), and
     if we're lucky, would resurrect the mostly-dead .change_pte() hook (see commit
     c13fda237f08 ("KVM: Assert that notifier count is elevated in .change_pte()")).

> > But making the req to the affected vCPUs can live in
> > invalidate_range_start(). And then the code which*handles*  that req can
> > wait for the mmu_notifier_count to reach zero before it proceeds. Atomic
> > users of the cache (like the Xen event channel code) don't have to get
> > involved with that.
> > 
> > > Also, for the small requests: since you are at it, can you add the code
> > > in a new file under virt/kvm/?
> > 
> > Hm... only if I can make hva_to_pfn() and probably a handful of other
> > things non-static?
> 
> Yes, I think sooner or later we also want all pfn stuff in one file
> (together with MMU notifiers) and all hva stuff in another; so for now you
> can create virt/kvm/hva_to_pfn.h, or virt/kvm/mm.h, or whatever color of the
> bikeshed you prefer.

Preemptive bikeshed strike... the MMU notifiers aren't strictly "pfn stuff", as
they operate on HVAs.  I don't know exactly what Paolo has in mind, but kvm/mm.h
or kvm/kvm_mm.h seems like it's less likely to become stale in the future.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 22:59                                     ` Sean Christopherson
@ 2021-11-15 23:22                                       ` David Woodhouse
  2021-11-16 13:17                                         ` David Woodhouse
  2021-11-15 23:24                                       ` David Woodhouse
  1 sibling, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 23:22 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 3667 bytes --]

On Mon, 2021-11-15 at 22:59 +0000, Sean Christopherson wrote:
> On Mon, Nov 15, 2021, Paolo Bonzini wrote:
> > On 11/15/21 20:11, David Woodhouse wrote:
> > > > Changing mn_memslots_update_rcuwait to a waitq (and renaming it to
> > > > mn_invalidate_waitq) is of course also a possibility.
> > > I suspect that's the answer.
> > > 
> > > I think the actual *invalidation* of the cache still lives in the
> > > invalidate_range() callback where I have it at the moment.
> 
> Oooh!  [finally had a lightbulb moment about ->invalidate_range() after years of
> befuddlement].
> 
> Two things:
> 
>   1. Using _only_ ->invalidate_range() is not correct.  ->invalidate_range() is
>      required if and only if the old PFN needs to be _unmapped_.  Specifically,
>      if the protections are being downgraded without changing the PFN, it doesn't
>      need to be called.  E.g. from hugetlb_change_protection():

OK, that's kind of important to realise. Thanks.

So, I had just split the atomic and guest-mode invalidations apart:
https://git.infradead.org/users/dwmw2/linux.git/commitdiff/6cf5fe318fd
but will go back to doing it all in invalidate_range_start from a
single list.

And just deal with the fact that the atomic users now have to
loop/retry/wait for there *not* to be an MMU notification in progress.

>      I believe we could use ->invalidate_range() to handle the unmap case if KVM's
>      ->invalidate_range_start() hook is enhanced to handle the RW=>R case.  The
>      "struct mmu_notifier_range" provides the event type, IIUC we could have the
>      _start() variant handle MMU_NOTIFY_PROTECTION_{VMA,PAGE} (and maybe
>      MMU_NOTIFY_SOFT_DIRTY?), and let the more precise unmap-only variant handle
>      everything else.

Not sure that helps us much. It was the termination condition on the
"when should we keep retrying, and when should we give up?" that was
painful, and a mixed mode doesn't make that problem go away.

I'll go back and have another look in the morning, with something much
closer to what I showed in
https://lore.kernel.org/kvm/040d61dad066eb2517c108232efb975bc1cda780.camel@infradead.org/

>   2. If we do split the logic across the two hooks, we should (a) do it in a separate
>      series and (b) make the logic common to the gfn_to_pfn cache and to the standard
>      kvm_unmap_gfn_range(). 
> > 
> > Yes, I think sooner or later we also want all pfn stuff in one file
> > (together with MMU notifiers) and all hva stuff in another; so for now you
> > can create virt/kvm/hva_to_pfn.h, or virt/kvm/mm.h, or whatever color of the
> > bikeshed you prefer.
> 
> Preemptive bikeshed strike... the MMU notifiers aren't strictly "pfn stuff", as
> they operate on HVAs.  I don't know exactly what Paolo has in mind, but kvm/mm.h
> or kvm/kvm_mm.h seems like it's less likely to become stale in the future.

I'd moved kvm/mmu_lock.h to kvm/kvm_mm.h and added to it.
https://git.infradead.org/users/dwmw2/linux.git/commitdiff/a247bc2d0d9
(which I'll make retrospective as I rework the series).

After frowning a little at all the different architectures' Makefiles
that all add the same(ish) list of $(KVM)/foobar.o, I ended up punting
that problem by only adding pfncache.o on x86 anyway.

If we're going to split other parts of kvm_main.c out into smaller
files, providing a Makefile snippet in virt/kvm/Makefile.kvm that gives
the *list* of those files would be a useful thing to do. But
arch/powerpc/kvm/Makefile makes my head hurt too much for me to be
shaving that particular yak tonight (why is $(KVM)/irqchip.o handled
differently to the rest...?)



[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 22:59                                     ` Sean Christopherson
  2021-11-15 23:22                                       ` David Woodhouse
@ 2021-11-15 23:24                                       ` David Woodhouse
  1 sibling, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-15 23:24 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 329 bytes --]

On Mon, 2021-11-15 at 22:59 +0000, Sean Christopherson wrote:
> > That would in theory shave a bit of time off walking
> > gfn ranges (maybe even moreso with the scalable memslots implementation?), 

(Sorry, missed that bit)

I don't care about memslots anyway for this case as I can just compare
against the cached hva.

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache
  2021-11-15 16:50                                 ` [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache David Woodhouse
@ 2021-11-16 10:21                                   ` Paolo Bonzini
  2021-11-17 17:18                                     ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 10:21 UTC (permalink / raw)
  To: David Woodhouse, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

On 11/15/21 17:50, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time /
> preempted status") I removed the only user of these functions because
> it was basically impossible to use them safely.
> 
> There are two stages to the GFN → PFN mapping; first through the KVM
> memslots to a userspace HVA and then through the page tables to
> translate that HVA to an underlying PFN. Invalidations of the former
> were being handled correctly, but no attempt was made to use the MMU
> notifiers to invalidate the cache when the HVA→GFN mapping changed.
> 
> As a prelude to reinventing the gfn_to_pfn_cache with more usable
> semantics, rip it out entirely and untangle the implementation of
> the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it.
> 
> All current users of kvm_vcpu_map() also look broken right now, and
> will be dealt with separately. They broadly fall into two classes:
> 
>   • Those which map, access the data and immediately unmap. This is
>     mostly gratuitous and could just as well use the existing user
>     HVA, and could probably benefit from a gfn_to_hva_cache as they
>     do so.
> 
>   • Those which keep the mapping around for a longer time, perhaps
>     even using the PFN directly from the guest. These will need to
>     be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map()
>     can be removed too.
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> ---
>   include/linux/kvm_host.h  |   6 +--
>   include/linux/kvm_types.h |   7 ---
>   virt/kvm/kvm_main.c       | 100 +++++---------------------------------
>   3 files changed, 12 insertions(+), 101 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 9e0667e3723e..c310648cc8f1 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -874,7 +874,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn);
>   void kvm_set_pfn_dirty(kvm_pfn_t pfn);
>   void kvm_set_pfn_accessed(kvm_pfn_t pfn);
>   
> -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache);
> +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty);
>   int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
>   			int len);
>   int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
> @@ -950,12 +950,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
>   kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
>   kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>   int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map);
> -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
> -		struct gfn_to_pfn_cache *cache, bool atomic);
>   struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
>   void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty);
> -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
> -		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic);
>   unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
>   unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
>   int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 2237abb93ccd..234eab059839 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -53,13 +53,6 @@ struct gfn_to_hva_cache {
>   	struct kvm_memory_slot *memslot;
>   };
>   
> -struct gfn_to_pfn_cache {
> -	u64 generation;
> -	gfn_t gfn;
> -	kvm_pfn_t pfn;
> -	bool dirty;
> -};
> -
>   #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
>   /*
>    * Memory caches are used to preallocate memory ahead of various MMU flows,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d31724500501..9646bb9112c1 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2548,72 +2548,36 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
>   }
>   EXPORT_SYMBOL_GPL(gfn_to_page);
>   
> -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
> +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
>   {
>   	if (pfn == 0)
>   		return;
>   
> -	if (cache)
> -		cache->pfn = cache->gfn = 0;
> -
>   	if (dirty)
>   		kvm_release_pfn_dirty(pfn);
>   	else
>   		kvm_release_pfn_clean(pfn);
>   }
>   
> -static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> -				 struct gfn_to_pfn_cache *cache, u64 gen)
> -{
> -	kvm_release_pfn(cache->pfn, cache->dirty, cache);
> -
> -	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
> -	cache->gfn = gfn;
> -	cache->dirty = false;
> -	cache->generation = gen;
> -}
> -
> -static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
> -			 struct kvm_host_map *map,
> -			 struct gfn_to_pfn_cache *cache,
> -			 bool atomic)
> +int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
>   {
>   	kvm_pfn_t pfn;
>   	void *hva = NULL;
>   	struct page *page = KVM_UNMAPPED_PAGE;
> -	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
> -	u64 gen = slots->generation;
>   
>   	if (!map)
>   		return -EINVAL;
>   
> -	if (cache) {
> -		if (!cache->pfn || cache->gfn != gfn ||
> -			cache->generation != gen) {
> -			if (atomic)
> -				return -EAGAIN;
> -			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
> -		}
> -		pfn = cache->pfn;
> -	} else {
> -		if (atomic)
> -			return -EAGAIN;
> -		pfn = gfn_to_pfn_memslot(slot, gfn);
> -	}
> +	pfn = gfn_to_pfn(vcpu->kvm, gfn);
>   	if (is_error_noslot_pfn(pfn))
>   		return -EINVAL;
>   
>   	if (pfn_valid(pfn)) {
>   		page = pfn_to_page(pfn);
> -		if (atomic)
> -			hva = kmap_atomic(page);
> -		else
> -			hva = kmap(page);
> +		hva = kmap(page);
>   #ifdef CONFIG_HAS_IOMEM
> -	} else if (!atomic) {
> -		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
>   	} else {
> -		return -EINVAL;
> +		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
>   #endif
>   	}
>   
> @@ -2627,27 +2591,9 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
>   
>   	return 0;
>   }
> -
> -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
> -		struct gfn_to_pfn_cache *cache, bool atomic)
> -{
> -	return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
> -			cache, atomic);
> -}
> -EXPORT_SYMBOL_GPL(kvm_map_gfn);
> -
> -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
> -{
> -	return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
> -		NULL, false);
> -}
>   EXPORT_SYMBOL_GPL(kvm_vcpu_map);
>   
> -static void __kvm_unmap_gfn(struct kvm *kvm,
> -			struct kvm_memory_slot *memslot,
> -			struct kvm_host_map *map,
> -			struct gfn_to_pfn_cache *cache,
> -			bool dirty, bool atomic)
> +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
>   {
>   	if (!map)
>   		return;
> @@ -2655,45 +2601,21 @@ static void __kvm_unmap_gfn(struct kvm *kvm,
>   	if (!map->hva)
>   		return;
>   
> -	if (map->page != KVM_UNMAPPED_PAGE) {
> -		if (atomic)
> -			kunmap_atomic(map->hva);
> -		else
> -			kunmap(map->page);
> -	}
> +	if (map->page != KVM_UNMAPPED_PAGE)
> +		kunmap(map->page);
>   #ifdef CONFIG_HAS_IOMEM
> -	else if (!atomic)
> -		memunmap(map->hva);
>   	else
> -		WARN_ONCE(1, "Unexpected unmapping in atomic context");
> +		memunmap(map->hva);
>   #endif
>   
>   	if (dirty)
> -		mark_page_dirty_in_slot(kvm, memslot, map->gfn);
> +		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
>   
> -	if (cache)
> -		cache->dirty |= dirty;
> -	else
> -		kvm_release_pfn(map->pfn, dirty, NULL);
> +	kvm_release_pfn(map->pfn, dirty);
>   
>   	map->hva = NULL;
>   	map->page = NULL;
>   }
> -
> -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
> -		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
> -{
> -	__kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
> -			cache, dirty, atomic);
> -	return 0;
> -}
> -EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
> -
> -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
> -{
> -	__kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
> -			map, NULL, dirty, false);
> -}
>   EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
>   
>   struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
> 

Queued patches 2-8 as well.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* [PATCH 0/7] KVM: Add Makefile.kvm for common files
  2021-11-15 19:26                                   ` Paolo Bonzini
  2021-11-15 22:59                                     ` Sean Christopherson
@ 2021-11-16 11:50                                     ` David Woodhouse
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
  1 sibling, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson, wanpengli, seanjc,
	vkuznets, mtosatti, joro, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

[-- Attachment #1: Type: text/plain, Size: 1521 bytes --]

On Mon, 2021-11-15 at 20:26 +0100, Paolo Bonzini wrote:
> > > Also, for the small requests: since you are at it, can you add the code
> > > in a new file under virt/kvm/?
> >
> > Hm... only if I can make hva_to_pfn() and probably a handful of other
> > things non-static?
> 
> Yes, I think sooner or later we also want all pfn stuff in one file 
> (together with MMU notifiers) and all hva stuff in another; so for now 
> you can create virt/kvm/hva_to_pfn.h, or virt/kvm/mm.h, or whatever 
> color of the bikeshed you prefer.


OK... let's start with this.

David Woodhouse (7):
      KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING
      KVM: Add Makefile.kvm for common files, use it for x86
      KVM: s390: Use Makefile.kvm for common files
      KVM: mips: Use Makefile.kvm for common files
      KVM: RISC-V: Use Makefile.kvm for common files
      KVM: powerpc: Use Makefile.kvm for common files
      KVM: arm64: Use Makefile.kvm for common files

 arch/arm64/kvm/Makefile        |  6 ++----
 arch/mips/kvm/Makefile         |  3 ++-
 arch/powerpc/kvm/Makefile      |  6 +-----
 arch/riscv/kvm/Makefile        |  6 +-----
 arch/s390/kvm/Makefile         |  6 ++----
 arch/x86/kvm/Kconfig           |  1 +
 arch/x86/kvm/Makefile          |  7 +------
 include/linux/kvm_dirty_ring.h |  8 ++++----
 virt/kvm/Kconfig               |  3 +++
 virt/kvm/Makefile.kvm          | 13 +++++++++++++
 virt/kvm/kvm_main.c            |  4 ++--
 11 files changed, 32 insertions(+), 31 deletions(-)

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING
  2021-11-16 11:50                                     ` [PATCH 0/7] KVM: Add Makefile.kvm for common files David Woodhouse
@ 2021-11-16 11:50                                       ` David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 2/7] KVM: Add Makefile.kvm for common files, use it for x86 David Woodhouse
                                                           ` (5 more replies)
  0 siblings, 6 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

I'd like to make the build include dirty_ring.c based on whether the
arch wants it or not. That's a whole lot simpler if there's a config
symbol instead of doing it implicitly on KVM_DIRTY_LOG_PAGE_OFFSET
being set to something non-zero.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/Kconfig           | 1 +
 include/linux/kvm_dirty_ring.h | 8 ++++----
 virt/kvm/Kconfig               | 3 +++
 virt/kvm/kvm_main.c            | 4 ++--
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 619186138176..d7fa0a42ac25 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
 	select MMU_NOTIFIER
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
+	select HAVE_KVM_DIRTY_RING
 	select IRQ_BYPASS_MANAGER
 	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index 120e5e90fa1d..4da8d4a4140b 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -27,9 +27,9 @@ struct kvm_dirty_ring {
 	int index;
 };
 
-#if (KVM_DIRTY_LOG_PAGE_OFFSET == 0)
+#ifndef CONFIG_HAVE_KVM_DIRTY_RING
 /*
- * If KVM_DIRTY_LOG_PAGE_OFFSET not defined, kvm_dirty_ring.o should
+ * If CONFIG_HAVE_KVM_DIRTY_RING not defined, kvm_dirty_ring.o should
  * not be included as well, so define these nop functions for the arch.
  */
 static inline u32 kvm_dirty_ring_get_rsvd_entries(void)
@@ -74,7 +74,7 @@ static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
 	return true;
 }
 
-#else /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */
+#else /* CONFIG_HAVE_KVM_DIRTY_RING */
 
 u32 kvm_dirty_ring_get_rsvd_entries(void);
 int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size);
@@ -98,6 +98,6 @@ struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset);
 void kvm_dirty_ring_free(struct kvm_dirty_ring *ring);
 bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring);
 
-#endif /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */
+#endif /* CONFIG_HAVE_KVM_DIRTY_RING */
 
 #endif	/* KVM_DIRTY_RING_H */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 62b39149b8c8..97cf5413ac25 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -13,6 +13,9 @@ config HAVE_KVM_IRQFD
 config HAVE_KVM_IRQ_ROUTING
        bool
 
+config HAVE_KVM_DIRTY_RING
+       bool
+
 config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9646bb9112c1..356d636e037d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3411,7 +3411,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
 {
-#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING
 	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
 	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
 	     kvm->dirty_ring_size / PAGE_SIZE);
@@ -4114,7 +4114,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_NR_MEMSLOTS:
 		return KVM_USER_MEM_SLOTS;
 	case KVM_CAP_DIRTY_LOG_RING:
-#if KVM_DIRTY_LOG_PAGE_OFFSET > 0
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING
 		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
 #else
 		return 0;
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 2/7] KVM: Add Makefile.kvm for common files, use it for x86
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files David Woodhouse
                                                           ` (4 subsequent siblings)
  5 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

Splitting kvm_main.c out into smaller and better-organized files is
slightly non-trivial when it involves editing a bunch of per-arch
KVM makefiles. Provide virt/kvm/Makefile.kvm for them to include.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/Makefile |  7 +------
 virt/kvm/Makefile.kvm | 13 +++++++++++++
 2 files changed, 14 insertions(+), 6 deletions(-)
 create mode 100644 virt/kvm/Makefile.kvm

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 75dfd27b6e8a..30f244b64523 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,12 +7,7 @@ ifeq ($(CONFIG_FRAME_POINTER),y)
 OBJECT_FILES_NON_STANDARD_vmenter.o := y
 endif
 
-KVM := ../../../virt/kvm
-
-kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
-				$(KVM)/dirty_ring.o $(KVM)/binary_stats.o
-kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
+include $(srctree)/virt/kvm/Makefile.kvm
 
 kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
new file mode 100644
index 000000000000..ee9c310f3601
--- /dev/null
+++ b/virt/kvm/Makefile.kvm
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+KVM ?= ../../../virt/kvm
+
+kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
+kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+kvm-$(CONFIG_HAVE_KVM_IRQCHIP) += $(KVM)/irqchip.o
+kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 2/7] KVM: Add Makefile.kvm for common files, use it for x86 David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  2021-11-17  7:29                                           ` Christian Borntraeger
  2021-11-16 11:50                                         ` [PATCH 4/7] KVM: mips: " David Woodhouse
                                                           ` (3 subsequent siblings)
  5 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/s390/kvm/Makefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index b3aaadc60ead..e4f50453cf7f 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -3,13 +3,11 @@
 #
 # Copyright IBM Corp. 2008
 
-KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
-	      $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
+include $(srctree)/virt/kvm/Makefile.kvm
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
-kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
+kvm-objs := kvm-s390.o intercept.o interrupt.o priv.o sigp.o
 kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o
 
 obj-$(CONFIG_KVM) += kvm.o
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 4/7] KVM: mips: Use Makefile.kvm for common files
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 2/7] KVM: Add Makefile.kvm for common files, use it for x86 David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 5/7] KVM: RISC-V: " David Woodhouse
                                                           ` (2 subsequent siblings)
  5 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/mips/kvm/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index d3710959da55..21ff75bcdbc4 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -2,9 +2,10 @@
 # Makefile for KVM support for MIPS
 #
 
+include $(srctree)/virt/kvm/Makefile.kvm
+
 ccflags-y += -Ivirt/kvm -Iarch/mips/kvm
 
-kvm-y := $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
 kvm-$(CONFIG_CPU_HAS_MSA) += msa.o
 
 kvm-y +=    mips.o emulate.o entry.o \
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 5/7] KVM: RISC-V: Use Makefile.kvm for common files
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
                                                           ` (2 preceding siblings ...)
  2021-11-16 11:50                                         ` [PATCH 4/7] KVM: mips: " David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 6/7] KVM: powerpc: " David Woodhouse
  2021-11-16 11:50                                         ` [PATCH 7/7] KVM: arm64: " David Woodhouse
  5 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/riscv/kvm/Makefile | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 30cdd1df0098..300590225348 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -5,14 +5,10 @@
 
 ccflags-y += -I $(srctree)/$(src)
 
-KVM := ../../../virt/kvm
+include $(srctree)/virt/kvm/Makefile.kvm
 
 obj-$(CONFIG_KVM) += kvm.o
 
-kvm-y += $(KVM)/kvm_main.o
-kvm-y += $(KVM)/coalesced_mmio.o
-kvm-y += $(KVM)/binary_stats.o
-kvm-y += $(KVM)/eventfd.o
 kvm-y += main.o
 kvm-y += vm.o
 kvm-y += vmid.o
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 6/7] KVM: powerpc: Use Makefile.kvm for common files
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
                                                           ` (3 preceding siblings ...)
  2021-11-16 11:50                                         ` [PATCH 5/7] KVM: RISC-V: " David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  2021-11-16 18:43                                           ` Sean Christopherson
  2021-11-16 11:50                                         ` [PATCH 7/7] KVM: arm64: " David Woodhouse
  5 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

It's all fairly baroque but in the end, I don't think there's any reason
for $(KVM)/irqchip.o to have been handled differently, as they all end
up in $(kvm-y) in the end anyway, regardless of whether they get there
via $(common-objs-y) and the CPU-specific object lists.

The generic Makefile.kvm uses HAVE_KVM_IRQCHIP for irqchip.o instead of
HAVE_KVM_IRQ_ROUTING. That change is fine (and arguably correct) because
they are both set together for KVM_MPIC, or neither is set.
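
For reference, the relevant part of arch/powerpc/kvm/Kconfig is roughly
(quoted from memory, unrelated lines elided):

	config KVM_MPIC
		bool "KVM in-kernel MPIC emulation"
		...
		select HAVE_KVM_IRQCHIP
		select HAVE_KVM_IRQ_ROUTING
		...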

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/powerpc/kvm/Makefile | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 583c14ef596e..245f59118413 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -4,11 +4,8 @@
 #
 
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
-KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
-common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
-common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
+include $(srctree)/virt/kvm/Makefile.kvm
 
 common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
@@ -125,7 +122,6 @@ kvm-book3s_32-objs := \
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
 kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
-kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* [PATCH 7/7] KVM: arm64: Use Makefile.kvm for common files
  2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
                                                           ` (4 preceding siblings ...)
  2021-11-16 11:50                                         ` [PATCH 6/7] KVM: powerpc: " David Woodhouse
@ 2021-11-16 11:50                                         ` David Woodhouse
  5 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 11:50 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/arm64/kvm/Makefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 989bb5dad2c8..04a53f71a6b6 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -5,14 +5,12 @@
 
 ccflags-y += -I $(srctree)/$(src)
 
-KVM=../../../virt/kvm
+include $(srctree)/virt/kvm/Makefile.kvm
 
 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM) += hyp/
 
-kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
-	 $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
-	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
+kvm-y += arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
 	 inject_fault.o va_layout.o handle_exit.o \
 	 guest.o debug.o reset.o sys_regs.o \
 	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-15 23:22                                       ` David Woodhouse
@ 2021-11-16 13:17                                         ` David Woodhouse
  2021-11-16 14:11                                           ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 13:17 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 19207 bytes --]

On Mon, 2021-11-15 at 23:22 +0000, David Woodhouse wrote:
> On Mon, 2021-11-15 at 22:59 +0000, Sean Christopherson wrote:
> > On Mon, Nov 15, 2021, Paolo Bonzini wrote:
> > > On 11/15/21 20:11, David Woodhouse wrote:
> > > > > Changing mn_memslots_update_rcuwait to a waitq (and renaming it to
> > > > > mn_invalidate_waitq) is of course also a possibility.
> > > > I suspect that's the answer.
> > > > 
> > > > I think the actual*invalidation*  of the cache still lives in the
> > > > invalidate_range() callback where I have it at the moment.
> > 
> > Oooh!  [finally had a lightbulb moment about ->invalidate_range() after years of
> > befuddlement].
> > 
> > Two things:
> > 
> >   1. Using _only_ ->invalidate_range() is not correct.  ->invalidate_range() is
> >      required if and only if the old PFN needs to be _unmapped_.  Specifically,
> >      if the protections are being downgraded without changing the PFN, it doesn't
> >      need to be called.  E.g. from hugetlb_change_protection():
> 
> OK, that's kind of important to realise. Thanks.
> 
> So, I had just split the atomic and guest-mode invalidations apart:
> https://git.infradead.org/users/dwmw2/linux.git/commitdiff/6cf5fe318fd
> but will go back to doing it all in invalidate_range_start from a
> single list.
> 
> And just deal with the fact that the atomic users now have to
> loop/retry/wait for there *not* to be an MMU notification in progress.
> 
> >      I believe we could use ->invalidate_range() to handle the unmap case if KVM's
> >      ->invalidate_range_start() hook is enhanced to handle the RW=>R case.  The
> >      "struct mmu_notifier_range" provides the event type, IIUC we could have the
> >      _start() variant handle MMU_NOTIFY_PROTECTION_{VMA,PAGE} (and maybe
> >      MMU_NOTIFY_SOFT_DIRTY?), and let the more precise unmap-only variant handle
> >      everything else.
> 
> Not sure that helps us much. It was the termination condition on the
> "when should we keep retrying, and when should we give up?" that was
> painful, and a mixed mode doesn't make that problem go away.
> 
> I'll go back and have another look in the morning, with something much
> closer to what I showed in
> https://lore.kernel.org/kvm/040d61dad066eb2517c108232efb975bc1cda780.camel@infradead.org/
> 


Looks a bit like this, and it seems to be working for the Xen event
channel self-test. I'll port it into our actual Xen hosting environment
and give it some more serious testing.

I'm not sure I'm ready to sign up to immediately fix everything that's
hosed in nesting and kill off all users of the unsafe kvm_vcpu_map(),
but I'll at least convert one vCPU user to demonstrate that the new
gfn_to_pfn_cache is working sanely for that use case.



From: David Woodhouse <dwmw@amazon.co.uk>
Subject: [PATCH 08/10] KVM: Reinstate gfn_to_pfn_cache with invalidation support

This can be used in two modes. There is an atomic mode where the cached
mapping is accessed while holding the rwlock, and a mode where the
physical address is used by a vCPU in guest mode.
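
For the former, the intended access pattern is roughly the following
(a sketch only, against the check/refresh API added below; error
handling and the caller's real gpa/len are elided):

	read_lock(&gpc->lock);
	while (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpa, PAGE_SIZE)) {
		read_unlock(&gpc->lock);
		if (kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, PAGE_SIZE, true))
			return;	/* sketch: bail out however the caller needs */
		read_lock(&gpc->lock);
	}
	/* ... access the mapping via gpc->khva while the lock is held ... */
	read_unlock(&gpc->lock);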

For the latter case, an invalidation will wake the vCPU with the new
KVM_REQ_GPC_INVALIDATE, and the architecture will need to refresh any
caches it still needs to access before entering guest mode again.

Only one vCPU can be targeted by the wake requests; it's simple enough
to make it wake all vCPUs or even a mask but I don't see a use case for
that additional complexity right now.

Invalidation happens from the invalidate_range_start MMU notifier, which
needs to be able to sleep in order to wake the vCPU and wait for it.

This means that revalidation potentially needs to "wait" for the MMU
operation to complete and the invalidate_range_end notifier to be
invoked. Like the vCPU when it takes a page fault in that period, we
just spin — fixing that in a future patch by implementing an actual
*wait* may be another part of shaving this particularly hirsute yak.

As noted in the comments in the function itself, the only case where
the invalidate_range_start notifier is expected to be called *without*
being able to sleep is when the OOM reaper is killing the process. In
that case, we expect the vCPU threads already to have exited, and thus
there will be nothing to wake, and no reason to wait. So we clear the
KVM_REQUEST_WAIT bit and send the request anyway, then complain loudly
if there actually *was* anything to wake up.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/Kconfig              |   1 +
 include/linux/kvm_host.h          |  14 ++
 include/linux/kvm_types.h         |  17 ++
 virt/kvm/Kconfig                  |   3 +
 virt/kvm/Makefile.kvm             |   1 +
 virt/kvm/dirty_ring.c             |   2 +-
 virt/kvm/kvm_main.c               |  13 +-
 virt/kvm/{mmu_lock.h => kvm_mm.h} |  23 ++-
 virt/kvm/pfncache.c               | 275 ++++++++++++++++++++++++++++++
 9 files changed, 342 insertions(+), 7 deletions(-)
 rename virt/kvm/{mmu_lock.h => kvm_mm.h} (55%)
 create mode 100644 virt/kvm/pfncache.c

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index d7fa0a42ac25..af351107d47f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -26,6 +26,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select MMU_NOTIFIER
 	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_PFNCACHE
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_DIRTY_RING
 	select IRQ_BYPASS_MANAGER
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c310648cc8f1..52e17e4b7694 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -151,6 +151,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_UNBLOCK           2
 #define KVM_REQ_UNHALT            3
 #define KVM_REQ_VM_DEAD           (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GPC_INVALIDATE    (5 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
@@ -559,6 +560,10 @@ struct kvm {
 	unsigned long mn_active_invalidate_count;
 	struct rcuwait mn_memslots_update_rcuwait;
 
+	/* For management / invalidation of gfn_to_pfn_caches */
+	spinlock_t gpc_lock;
+	struct list_head gpc_list;
+
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
 	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
@@ -966,6 +971,15 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 			 unsigned long len);
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+			      struct kvm_vcpu *vcpu, bool kernel_map,
+			      gpa_t gpa, unsigned long len, bool write);
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				 gpa_t gpa, unsigned long len, bool write);
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				gpa_t gpa, unsigned long len);
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc);
+
 void kvm_sigset_activate(struct kvm_vcpu *vcpu);
 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 234eab059839..e454d2c003d6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -19,6 +19,7 @@ struct kvm_memslots;
 enum kvm_mr_change;
 
 #include <linux/types.h>
+#include <linux/spinlock_types.h>
 
 #include <asm/kvm_types.h>
 
@@ -53,6 +54,22 @@ struct gfn_to_hva_cache {
 	struct kvm_memory_slot *memslot;
 };
 
+struct gfn_to_pfn_cache {
+	u64 generation;
+	gpa_t gpa;
+	unsigned long uhva;
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	struct list_head list;
+	rwlock_t lock;
+	void *khva;
+	kvm_pfn_t pfn;
+	bool active;
+	bool valid;
+	bool dirty;
+	bool kernel_map;
+};
+
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 97cf5413ac25..f4834c20e4a6 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -4,6 +4,9 @@
 config HAVE_KVM
        bool
 
+config HAVE_KVM_PFNCACHE
+       bool
+
 config HAVE_KVM_IRQCHIP
        bool
 
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index ee9c310f3601..ca499a216d0f 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -11,3 +11,4 @@ kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
 kvm-$(CONFIG_HAVE_KVM_IRQCHIP) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
+kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index 88f4683198ea..2b4474387895 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -9,7 +9,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kvm_dirty_ring.h>
 #include <trace/events/kvm.h>
-#include "mmu_lock.h"
+#include "kvm_mm.h"
 
 int __weak kvm_cpu_dirty_log_size(void)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 356d636e037d..85506e4bd145 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -59,7 +59,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
-#include "mmu_lock.h"
+#include "kvm_mm.h"
 #include "vfio.h"
 
 #define CREATE_TRACE_POINTS
@@ -684,6 +684,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	kvm->mn_active_invalidate_count++;
 	spin_unlock(&kvm->mn_invalidate_lock);
 
+	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end,
+					  hva_range.may_block);
+
 	__kvm_handle_hva_range(kvm, &hva_range);
 
 	return 0;
@@ -1051,6 +1054,9 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	spin_lock_init(&kvm->mn_invalidate_lock);
 	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
 
+	INIT_LIST_HEAD(&kvm->gpc_list);
+	spin_lock_init(&kvm->gpc_lock);
+
 	INIT_LIST_HEAD(&kvm->devices);
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -2390,8 +2396,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  * 2): @write_fault = false && @writable, @writable will tell the caller
  *     whether the mapping is writable.
  */
-static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-			bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+		     bool write_fault, bool *writable)
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn = 0;
diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/kvm_mm.h
similarity index 55%
rename from virt/kvm/mmu_lock.h
rename to virt/kvm/kvm_mm.h
index 9e1308f9734c..b976e4b07e88 100644
--- a/virt/kvm/mmu_lock.h
+++ b/virt/kvm/kvm_mm.h
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#ifndef KVM_MMU_LOCK_H
-#define KVM_MMU_LOCK_H 1
+#ifndef __KVM_MM_H__
+#define __KVM_MM_H__ 1
 
 /*
  * Architectures can choose whether to use an rwlock or spinlock
@@ -20,4 +20,21 @@
 #define KVM_MMU_UNLOCK(kvm)    spin_unlock(&(kvm)->mmu_lock)
 #endif /* KVM_HAVE_MMU_RWLOCK */
 
-#endif
+kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+		     bool write_fault, bool *writable);
+
+#ifdef CONFIG_HAVE_KVM_PFNCACHE
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+				       unsigned long start,
+				       unsigned long end,
+				       bool may_block);
+#else
+static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+						     unsigned long start,
+						     unsigned long end,
+						     bool may_block)
+{
+}
+#endif /* HAVE_KVM_PFNCACHE */
+
+#endif /* __KVM_MM_H__ */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
new file mode 100644
index 000000000000..f2efc52039a8
--- /dev/null
+++ b/virt/kvm/pfncache.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables kernel and guest-mode vCPU access to guest physical
+ * memory with suitable invalidation mechanisms.
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Authors:
+ *   David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+
+#include "kvm_mm.h"
+
+/*
+ * MMU notifier 'invalidate_range_start' hook.
+ */
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
+				       unsigned long end, bool may_block)
+{
+	DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
+	struct gfn_to_pfn_cache *gpc;
+	bool wake_vcpus = false;
+
+	spin_lock(&kvm->gpc_lock);
+	list_for_each_entry(gpc, &kvm->gpc_list, list) {
+		write_lock_irq(&gpc->lock);
+
+		/* Only a single page so no need to care about length */
+		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+		    gpc->uhva >= start && gpc->uhva < end) {
+			gpc->valid = false;
+
+			if (gpc->dirty) {
+				int idx = srcu_read_lock(&kvm->srcu);
+				mark_page_dirty(kvm, gpa_to_gfn(gpc->gpa));
+				srcu_read_unlock(&kvm->srcu, idx);
+
+				kvm_set_pfn_dirty(gpc->pfn);
+				gpc->dirty = false;
+			}
+
+			/*
+			 * If a guest vCPU could be using the physical address,
+			 * it needs to be woken.
+			 */
+			if (gpc->vcpu) {
+				if (!wake_vcpus) {
+					wake_vcpus = true;
+					bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
+				}
+				__set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap);
+			}
+		}
+		write_unlock_irq(&gpc->lock);
+	}
+	spin_unlock(&kvm->gpc_lock);
+
+	if (wake_vcpus) {
+		unsigned int req = KVM_REQ_GPC_INVALIDATE;
+		bool called;
+
+		/*
+		 * If the OOM reaper is active, then all vCPUs should have
+		 * been stopped already, so perform the request without
+		 * KVM_REQUEST_WAIT and be sad if any needed to be woken.
+		 */
+		if (!may_block)
+			req &= ~KVM_REQUEST_WAIT;
+
+		called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap);
+
+		WARN_ON_ONCE(called && !may_block);
+	}
+}
+
+bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				gpa_t gpa, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+
+	if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE)
+		return false;
+
+	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+	    kvm_is_error_hva(gpc->uhva))
+		return false;
+
+	if (!gpc->valid)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check);
+
+int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+				 gpa_t gpa, unsigned long len, bool write)
+{
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	unsigned long page_offset = gpa & ~PAGE_MASK;
+	kvm_pfn_t old_pfn, new_pfn;
+	unsigned long old_uhva;
+	gpa_t old_gpa;
+	void *old_khva;
+	bool old_valid, old_dirty;
+	int ret = 0;
+
+	/*
+	 * It must fit within a single page. The 'len' argument is
+	 * only to enforce that.
+	 */
+	if (page_offset + len > PAGE_SIZE)
+		return -EINVAL;
+
+	write_lock_irq(&gpc->lock);
+
+	old_gpa = gpc->gpa;
+	old_pfn = gpc->pfn;
+	old_khva = gpc->khva;
+	old_uhva = gpc->uhva;
+	old_valid = gpc->valid;
+	old_dirty = gpc->dirty;
+
+	/* If the userspace HVA is invalid, refresh that first */
+	if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+	    kvm_is_error_hva(gpc->uhva)) {
+		gfn_t gfn = gpa_to_gfn(gpa);
+
+		gpc->dirty = false;
+		gpc->gpa = gpa;
+		gpc->generation = slots->generation;
+		gpc->memslot = __gfn_to_memslot(slots, gfn);
+		gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+		if (kvm_is_error_hva(gpc->uhva)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		gpc->uhva += page_offset;
+	}
+
+	/*
+	 * If the userspace HVA changed or the PFN was already invalid,
+	 * drop the lock and do the HVA to PFN lookup again.
+	 */
+	if (!old_valid || old_uhva != gpc->uhva) {
+		unsigned long uhva = gpc->uhva;
+		void *new_khva = NULL;
+		unsigned long mmu_seq;
+		int retry;
+
+		/* Placeholders for "hva is valid but not yet mapped" */
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->khva = NULL;
+		gpc->valid = true;
+
+		write_unlock_irq(&gpc->lock);
+
+	retry_map:
+		mmu_seq = kvm->mmu_notifier_seq;
+		smp_rmb();
+
+		new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL);
+		if (is_error_noslot_pfn(new_pfn)) {
+			ret = -EFAULT;
+			goto map_done;
+		}
+
+		read_lock(&kvm->mmu_lock);
+		retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva);
+		read_unlock(&kvm->mmu_lock);
+		if (retry) {
+			cond_resched();
+			goto retry_map;
+		}
+
+		if (gpc->kernel_map) {
+			if (new_pfn == old_pfn) {
+				new_khva = (void *)((unsigned long)old_khva - page_offset);
+				old_pfn = KVM_PFN_ERR_FAULT;
+				old_khva = NULL;
+			} else if (pfn_valid(new_pfn)) {
+				new_khva = kmap(pfn_to_page(new_pfn));
+#ifdef CONFIG_HAS_IOMEM
+			} else {
+				new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB);
+#endif
+			}
+			if (!new_khva)
+				ret = -EFAULT;
+		}
+
+	map_done:
+		write_lock_irq(&gpc->lock);
+		if (ret) {
+			gpc->valid = false;
+			gpc->pfn = KVM_PFN_ERR_FAULT;
+			gpc->khva = NULL;
+		} else {
+			/* At this point, gpc->valid may already have been cleared */
+			gpc->pfn = new_pfn;
+			gpc->khva = new_khva + page_offset;
+		}
+	}
+
+ out:
+	if (ret)
+		gpc->dirty = false;
+	else
+		gpc->dirty = write;
+
+	write_unlock_irq(&gpc->lock);
+
+	/* Unmap the old page if it was mapped before */
+	if (!is_error_noslot_pfn(old_pfn)) {
+		if (pfn_valid(old_pfn)) {
+			kunmap(pfn_to_page(old_pfn));
+#ifdef CONFIG_HAS_IOMEM
+		} else {
+			memunmap(old_khva);
+#endif
+		}
+		kvm_release_pfn(old_pfn, old_dirty);
+		if (old_dirty)
+			mark_page_dirty(kvm, old_gpa);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_refresh);
+
+int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
+			      struct kvm_vcpu *vcpu, bool kernel_map,
+			      gpa_t gpa, unsigned long len, bool write)
+{
+	if (!gpc->active) {
+		rwlock_init(&gpc->lock);
+
+		gpc->khva = NULL;
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->uhva = KVM_HVA_ERR_BAD;
+		gpc->vcpu = vcpu;
+		gpc->kernel_map = kernel_map;
+		gpc->valid = false;
+		gpc->active = true;
+
+		spin_lock(&kvm->gpc_lock);
+		list_add(&gpc->list, &kvm->gpc_list);
+		spin_unlock(&kvm->gpc_lock);
+	}
+	return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len, write);
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init);
+
+void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc)
+{
+	if (gpc->active) {
+		spin_lock(&kvm->gpc_lock);
+		list_del(&gpc->list);
+		spin_unlock(&kvm->gpc_lock);
+
+		/* The refresh will fail, but will still tear down any existing mapping */
+		(void)kvm_gfn_to_pfn_cache_refresh(kvm, gpc, GPA_INVALID, 0, false);
+		gpc->active = false;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy);
-- 
2.31.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 13:17                                         ` David Woodhouse
@ 2021-11-16 14:11                                           ` Paolo Bonzini
  2021-11-16 14:25                                             ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 14:11 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

On 11/16/21 14:17, David Woodhouse wrote:
> I'm not sure I'm ready to sign up to immediately fix everything that's
> hosed in nesting and kill off all users of the unsafe kvm_vcpu_map(),
> but I'll at least convert one vCPU user to demonstrate that the new
> gfn_to_pfn_cache is working sanely for that use case.

I even have old patches that tried to do that, so I can try.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 14:11                                           ` Paolo Bonzini
@ 2021-11-16 14:25                                             ` David Woodhouse
  2021-11-16 14:57                                               ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 14:25 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 2062 bytes --]

On Tue, 2021-11-16 at 15:11 +0100, Paolo Bonzini wrote:
> On 11/16/21 14:17, David Woodhouse wrote:
> > I'm not sure I'm ready to sign up to immediately fix everything that's
> > hosed in nesting and kill off all users of the unsafe kvm_vcpu_map(),
> > but I'll at least convert one vCPU user to demonstrate that the new
> > gfn_to_pfn_cache is working sanely for that use case.
> 
> I even have old patches that tried to do that, so I can try.

Thanks. I think it starts with this on top of my current tree at 
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn


--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9735,6 +9735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
                if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
                        static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+               if (kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))
+                       ; /* Nothing to do. It just wanted to wake us */
        }
 
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -9781,6 +9783,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        local_irq_disable();
        vcpu->mode = IN_GUEST_MODE;
 
+       /*
+        * If the guest requires direct access to mapped L1 pages, check
+        * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
+        * to go and revalidate them, if necessary.
+        */
+       if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
+               kvm_x86_ops.nested_ops->check_guest_maps();
+
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        /*

That check_guest_maps() function can validate the caches which the L2
guest is actually using in the VMCS02, and if they need to be refreshed
then raising a req will immediately break out of vcpu_enter_guest() to
allow that to happen.

I *think* we can just use KVM_REQ_GET_NESTED_STATE_PAGES for that and
don't need to invent a new one? 


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 14:25                                             ` David Woodhouse
@ 2021-11-16 14:57                                               ` Paolo Bonzini
  2021-11-16 15:09                                                 ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 14:57 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

On 11/16/21 15:25, David Woodhouse wrote:
> +       /*
> +        * If the guest requires direct access to mapped L1 pages, check
> +        * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
> +        * to go and revalidate them, if necessary.
> +        */
> +       if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
> +               kvm_x86_ops.nested_ops->check_guest_maps();
> +

This should not be needed, should it?  As long as the gfn-to-pfn
cache's vcpu field is handled properly, the request will just cause
the vCPU not to enter.  It would have to take the gpc->lock around
changes to gpc->vcpu though (meaning: it's probably best to add a
function gfn_to_pfn_cache_set_vcpu).

Doing it lockless would be harder; I cannot think of any well-known
pattern that is good for this scenario.
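
For the locked variant, the helper might be as simple as this (a
sketch, using the gpc fields from the patch posted earlier in the
thread):

static void gfn_to_pfn_cache_set_vcpu(struct gfn_to_pfn_cache *gpc,
				      struct kvm_vcpu *vcpu)
{
	/* Serialise against the invalidation path, which reads gpc->vcpu */
	write_lock_irq(&gpc->lock);
	gpc->vcpu = vcpu;
	write_unlock_irq(&gpc->lock);
}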

> That check_guest_maps() function can validate the caches which the L2
> guest is actually using in the VMCS02, and if they need to be refreshed
> then raising a req will immediately break out of vcpu_enter_guest() to
> allow that to happen.
> 
> I*think*  we can just use KVM_REQ_GET_NESTED_STATE_PAGES for that and
> don't need to invent a new one?

Yes, maybe even do it unconditionally?

-                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+                if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu) ||
		     kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))

if the gfn-to-pfn cache's vcpu field is set/reset properly across nested
VM entry and exit.

Paolo


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 14:57                                               ` Paolo Bonzini
@ 2021-11-16 15:09                                                 ` David Woodhouse
  2021-11-16 15:49                                                   ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 15:09 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 5083 bytes --]

On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
> On 11/16/21 15:25, David Woodhouse wrote:
> > +       /*
> > +        * If the guest requires direct access to mapped L1 pages, check
> > +        * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
> > +        * to go and revalidate them, if necessary.
> > +        */
> > +       if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
> > +               kvm_x86_ops.nested_ops->check_guest_maps();
> > +
> 
> This should not be needed, should it?  As long as the gfn-to-pfn
> cache's vcpu field is handled properly, the request will just cause
> the vCPU not to enter. 

If the MMU mappings never change, the request never happens. But the
memslots *can* change, so it does need to be revalidated each time
through I think?

>  It would have to take the gpc->lock around
> changes to gpc->vcpu though (meaning: it's probably best to add a
> function gfn_to_pfn_cache_set_vcpu).

Hm, in my head that was never going to *change* for a given gpc; it
*belongs* to that vCPU for ever (and was even part of vmx->nested. for
that vCPU, to replace e.g. vmx->nested.pi_desc_map).

If I flesh out what I had in my last email a bit more, perhaps my
vision is a little bit clearer...?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 465455334c0c..9f279d08e570 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1510,6 +1510,7 @@ struct kvm_x86_nested_ops {
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+	void (*check_guest_maps)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 280f34ea02c3..71d2d8171a1c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3242,6 +3242,31 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_pfn_cache *gpc;
+
+	bool valid;
+
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		gpc = &vmx->nested.pi_desc_cache;
+
+		read_lock(&gpc->lock);
+		valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
+						   vmcs12->posted_intr_desc_addr,
+						   PAGE_SIZE);
+		read_unlock(&gpc->lock);
+		if (!valid) {
+			/* XX: This isn't idempotent. Make it so, or use a different
+			 * req for the 'refresh'. */
+			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+			return;
+		}
+	}
+}
+
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
 	struct vmcs12 *vmcs12;
@@ -6744,4 +6769,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.check_guest_maps = nested_vmx_check_guest_maps,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a689bb62e9e..a879e4d08758 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9735,6 +9735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
 			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+		if (kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))
+			; /* Nothing to do. It just wanted to wake us */
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -9781,6 +9783,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	local_irq_disable();
 	vcpu->mode = IN_GUEST_MODE;
 
+	/*
+	 * If the guest requires direct access to mapped L1 pages, check
+	 * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
+	 * to go and revalidate them, if necessary.
+	 */
+	if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
+		kvm_x86_ops.nested_ops->check_guest_maps(vcpu);
+
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	/*


> Doing it lockless would be harder; I cannot think of any well-known
> pattern that is good for this scenario.
> 
> > That check_guest_maps() function can validate the caches which the L2
> > guest is actually using in the VMCS02, and if they need to be refreshed
> > then raising a req will immediately break out of vcpu_enter_guest() to
> > allow that to happen.
> > 
> > I*think*  we can just use KVM_REQ_GET_NESTED_STATE_PAGES for that and
> > don't need to invent a new one?
> 
> Yes, maybe even do it unconditionally?
> 

So nested_get_vmcs12_pages() certainly isn't idempotent right now
because of all the kvm_vcpu_map() calls, which would just end up
leaking — but I suppose the point is to kill all those, and then maybe
it will be?

I quite liked the idea of *not* refreshing the caches immediately,
because we can wait until the vCPU is in L2 mode again and actually
*needs* them.
 

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 15:09                                                 ` David Woodhouse
@ 2021-11-16 15:49                                                   ` Paolo Bonzini
  2021-11-16 16:06                                                     ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 15:49 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

On 11/16/21 16:09, David Woodhouse wrote:
> On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
>> This should not be needed, should it?  As long as the gfn-to-pfn
>> cache's vcpu field is handled properly, the request will just cause
>> the vCPU not to enter.
> 
> If the MMU mappings never change, the request never happens. But the
> memslots *can* change, so it does need to be revalidated each time
> through I think?

That needs to be done on KVM_SET_USER_MEMORY_REGION, using the same 
request (or even the same list walking code) as the MMU notifiers.

>> It would have to take the gpc->lock around
>> changes to gpc->vcpu though (meaning: it's probably best to add a
>> function gfn_to_pfn_cache_set_vcpu).
> 
> Hm, in my head that was never going to *change* for a given gpc; it
> *belongs* to that vCPU for ever (and was even part of vmx->nested. for
> that vCPU, to replace e.g. vmx->nested.pi_desc_map).

Ah okay, I thought it would be set in nested vmentry and cleared in 
nested vmexit.

> +static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct gfn_to_pfn_cache *gpc;
> +
> +	bool valid;
> +
> +	if (nested_cpu_has_posted_intr(vmcs12)) {
> +		gpc = &vmx->nested.pi_desc_cache;
> +
> +		read_lock(&gpc->lock);
> +		valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
> +						   vmcs12->posted_intr_desc_addr,
> +						   PAGE_SIZE);
> +		read_unlock(&gpc->lock);
> +		if (!valid) {
> +			/* XX: This isn't idempotent. Make it so, or use a different
> +			 * req for the 'refresh'. */
> +			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> +			return;
> +		}
> +	}
> +}

That's really slow to do on every vmentry.

> So nested_get_vmcs12_pages() certainly isn't idempotent right now
> because of all the kvm_vcpu_map() calls, which would just end up
> leaking — but I suppose the point is to kill all those, and then maybe
> it will be?

Yes, exactly.  That might be a larger than normal patch, but it should 
not be one too hard to review.  Once there's something that works, we 
can think of how to split (if it's worth it).

Paolo

> I quite liked the idea of *not* refreshing the caches immediately,m
> because we can wait until the vCPU is in L2 mode again and actually
> *needs* them.
>   
> 


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 15:49                                                   ` Paolo Bonzini
@ 2021-11-16 16:06                                                     ` David Woodhouse
  2021-11-16 17:42                                                       ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 16:06 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 10678 bytes --]

On Tue, 2021-11-16 at 16:49 +0100, Paolo Bonzini wrote:
> On 11/16/21 16:09, David Woodhouse wrote:
> > On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
> > > This should not be needed, should it?  As long as the gfn-to-pfn
> > > cache's vcpu field is handled properly, the request will just cause
> > > the vCPU not to enter.
> > 
> > If the MMU mappings never change, the request never happens. But the
> > memslots *can* change, so it does need to be revalidated each time
> > through I think?
> 
> That needs to be done on KVM_SET_USER_MEMORY_REGION, using the same 
> request (or even the same list walking code) as the MMU notifiers.

Hm....  kvm_arch_memslots_updated() is already kicking every vCPU after
the update, and although that was asynchronous it was actually OK
because unlike in the MMU notifier case, that page wasn't actually
going away — and if that HVA *did* subsequently go away, our HVA-based
notifier check would still catch that and kill it synchronously.

But yes, we *could* come up with a wakeup mechanism which does it that
way.


> > > It would have to take the gpc->lock around
> > > changes to gpc->vcpu though (meaning: it's probably best to add a
> > > function gfn_to_pfn_cache_set_vcpu).
> > 
> > Hm, in my head that was never going to *change* for a given gpc; it
> > *belongs* to that vCPU for ever (and was even part of vmx->nested. for
> > that vCPU, to replace e.g. vmx->nested.pi_desc_map).
> 
> Ah okay, I thought it would be set in nested vmentry and cleared in 
> nested vmexit.

I don't think it needs to be proactively cleared; we just don't
*refresh* it until we need it again.

> > +static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
> > +{
> > +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +	struct gfn_to_pfn_cache *gpc;
> > +
> > +	bool valid;
> > +
> > +	if (nested_cpu_has_posted_intr(vmcs12)) {
> > +		gpc = &vmx->nested.pi_desc_cache;
> > +
> > +		read_lock(&gpc->lock);
> > +		valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
> > +						   vmcs12->posted_intr_desc_addr,
> > +						   PAGE_SIZE);
> > +		read_unlock(&gpc->lock);
> > +		if (!valid) {
> > +			/* XX: This isn't idempotent. Make it so, or use a different
> > +			 * req for the 'refresh'. */
> > +			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> > +			return;
> > +		}
> > +	}
> > +}
> 
> That's really slow to do on every vmentry.

It probably doesn't have to be.

Ultimately, all it *has* to check is that kvm->memslots->generation
hasn't changed since the caches were valid. That can surely be made
fast enough?

If we *know* the GPA and size haven't changed, and if we know that
gpc->valid becoming false would have been handled differently, then we
could optimise that whole thing away quite effectively to a single
check on ->generations?
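
i.e. the check_guest_maps() hook could eventually shrink to something
like this (a sketch only; it assumes the virtual_apic_cache field from
the patch below and elides the vmcs12 checks):

static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
{
	struct gfn_to_pfn_cache *gpc = &to_vmx(vcpu)->nested.virtual_apic_cache;

	/* Single cheap check: has the memslot generation moved on? */
	if (gpc->generation != kvm_memslots(vcpu->kvm)->generation)
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}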

> > So nested_get_vmcs12_pages() certainly isn't idempotent right now
> > because of all the kvm_vcpu_map() calls, which would just end up
> > leaking — but I suppose the point is to kill all those, and then maybe
> > it will be?
> 
> Yes, exactly.  That might be a larger than normal patch, but it should 
> not be one too hard to review.  Once there's something that works, we 
> can think of how to split (if it's worth it).

This one actually compiles. Not sure we have any test cases that will
actually exercise it though, do we?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 465455334c0c..9f279d08e570 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1510,6 +1510,7 @@ struct kvm_x86_nested_ops {
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+	void (*check_guest_maps)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 280f34ea02c3..f67751112633 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -309,7 +309,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
 		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vmx->nested.virtual_apic_cache);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
@@ -3170,10 +3170,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		map = &vmx->nested.virtual_apic_map;
+		struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+		if (!kvm_gfn_to_pfn_cache_init(vcpu->kvm, gpc, vcpu, true,
+					       vmcs12->virtual_apic_page_addr,
+					       PAGE_SIZE, true)) {
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(gpc->pfn));
 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
 		           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3198,6 +3200,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		map = &vmx->nested.pi_desc_map;
 
+		if (kvm_vcpu_mapped(map))
+			kvm_vcpu_unmap(vcpu, map, true);
+
 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
 			vmx->nested.pi_desc =
 				(struct pi_desc *)(((void *)map->hva) +
@@ -3242,6 +3247,29 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_pfn_cache *gpc;
+
+	int valid;
+
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		gpc = &vmx->nested.virtual_apic_cache;
+
+		read_lock(&gpc->lock);
+		valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
+						   vmcs12->virtual_apic_page_addr,
+						   PAGE_SIZE);
+		read_unlock(&gpc->lock);
+		if (!valid) {
+			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+			return;
+		}
+	}
+}
+
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
 	struct vmcs12 *vmcs12;
@@ -3737,9 +3765,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 
 	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
 	if (max_irr != 256) {
-		vapic_page = vmx->nested.virtual_apic_map.hva;
-		if (!vapic_page)
+		struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
+
+		read_lock(&gpc->lock);
+		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
+			read_unlock(&gpc->lock);
 			goto mmio_needed;
+		}
+
+		vapic_page = gpc->khva;
 
 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
 			vapic_page, &max_irr);
@@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 			status |= (u8)max_irr;
 			vmcs_write16(GUEST_INTR_STATUS, status);
 		}
+		read_unlock(&gpc->lock);
 	}
 
 	nested_mark_vmcs12_pages_dirty(vcpu);
@@ -4569,7 +4604,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vmx->nested.virtual_apic_cache);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
@@ -6744,4 +6779,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.check_guest_maps = nested_vmx_check_guest_maps,
 };
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..6c61faef86d3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3839,19 +3839,23 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic_page;
+	struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
 	u32 vppr;
 	int rvi;
 
 	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
 		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
+		WARN_ON_ONCE(gpc->gpa == GPA_INVALID))
 		return false;
 
 	rvi = vmx_get_rvi();
 
-	vapic_page = vmx->nested.virtual_apic_map.hva;
-	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+	read_lock(&gpc->lock);
+	if (kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE))
+		vppr = *((u32 *)(gpc->khva + APIC_PROCPRI));
+	else
+		vppr = 0xff;
+	read_unlock(&gpc->lock);
 
 	return ((rvi & 0xf0) > (vppr & 0xf0));
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4df2ac24ffc1..8364e7fc92a0 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -195,7 +195,7 @@ struct nested_vmx {
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
-	struct kvm_host_map virtual_apic_map;
+	struct gfn_to_pfn_cache virtual_apic_cache;
 	struct kvm_host_map pi_desc_map;
 
 	struct kvm_host_map msr_bitmap_map;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a689bb62e9e..a879e4d08758 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9735,6 +9735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
 			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+		if (kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))
+			; /* Nothing to do. It just wanted to wake us */
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -9781,6 +9783,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	local_irq_disable();
 	vcpu->mode = IN_GUEST_MODE;
 
+	/*
+	 * If the guest requires direct access to mapped L1 pages, check
+	 * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
+	 * to go and revalidate them, if necessary.
+	 */
+	if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
+		kvm_x86_ops.nested_ops->check_guest_maps(vcpu);
+
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	/*

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply related	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 16:06                                                     ` David Woodhouse
@ 2021-11-16 17:42                                                       ` Paolo Bonzini
  2021-11-16 17:57                                                         ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 17:42 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

On 11/16/21 17:06, David Woodhouse wrote:
> On Tue, 2021-11-16 at 16:49 +0100, Paolo Bonzini wrote:
>> On 11/16/21 16:09, David Woodhouse wrote:
>>> On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
>>>> This should not be needed, should it?  As long as the gfn-to-pfn
>>>> cache's vcpu field is handled properly, the request will just cause
>>>> the vCPU not to enter.
>>>
>>> If the MMU mappings never change, the request never happens. But the
>>> memslots *can* change, so it does need to be revalidated each time
>>> through I think?
>>
>> That needs to be done on KVM_SET_USER_MEMORY_REGION, using the same
>> request (or even the same list walking code) as the MMU notifiers.
> 
> Hm....  kvm_arch_memslots_updated() is already kicking every vCPU after
> the update, and although that was asynchronous it was actually OK
> because unlike in the MMU notifier case, that page wasn't actually
> going away — and if that HVA *did* subsequently go away, our HVA-based
> notifier check would still catch that and kill it synchronously.

Right, so it only needs to change the kvm_vcpu_kick into a 
kvm_make_all_cpus_request without KVM_WAIT.
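
i.e. roughly this, wherever the memslot-update notification ends up
living (a sketch only):

	/* Nudge vCPUs to revalidate their caches, but don't wait for them */
	kvm_make_all_cpus_request(kvm, KVM_REQ_GPC_INVALIDATE & ~KVM_REQUEST_WAIT);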

>>> Hm, in my head that was never going to *change* for a given gpc; it
>>> *belongs* to that vCPU for ever (and was even part of vmx->nested. for
>>> that vCPU, to replace e.g. vmx->nested.pi_desc_map).
>>
>> Ah okay, I thought it would be set in nested vmentry and cleared in
>> nested vmexit.
> 
> I don't think it needs to be proactively cleared; we just don't
> *refresh* it until we need it again.

True, but if it's cleared the vCPU won't be kicked, which is nice.

> If we *know* the GPA and size haven't changed, and if we know that
> gpc->valid becoming false would have been handled differently, then we
> could optimise that whole thing away quite effectively to a single
> check on ->generations?

I wonder if we need a per-gpc memslot generation...  Can it be global?

> This one actually compiles. Not sure we have any test cases that will
> actually exercise it though, do we?

I'll try to spend some time writing testcases.

> +		read_lock(&gpc->lock);
> +		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
> +			read_unlock(&gpc->lock);
>   			goto mmio_needed;
> +		}
> +
> +		vapic_page = gpc->khva;

If we know this gpc is of the synchronous kind, I think we can skip the 
read_lock/read_unlock here?!?

>   		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
>   			vapic_page, &max_irr);
> @@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>   			status |= (u8)max_irr;
>   			vmcs_write16(GUEST_INTR_STATUS, status);
>   		}
> +		read_unlock(&gpc->lock);
>   	}
>   
>   	nested_mark_vmcs12_pages_dirty(vcpu);
> @@ -4569,7 +4604,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>   		kvm_release_page_clean(vmx->nested.apic_access_page);
>   		vmx->nested.apic_access_page = NULL;
>   	}
> -	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
> +	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vmx->nested.virtual_apic_cache);
>   	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
>   	vmx->nested.pi_desc = NULL;
>   
> @@ -6744,4 +6779,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
>   	.write_log_dirty = nested_vmx_write_pml_buffer,
>   	.enable_evmcs = nested_enable_evmcs,
>   	.get_evmcs_version = nested_get_evmcs_version,
> +	.check_guest_maps = nested_vmx_check_guest_maps,
>   };
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index ba66c171d951..6c61faef86d3 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -3839,19 +3839,23 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
>   static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	void *vapic_page;
> +	struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
>   	u32 vppr;
>   	int rvi;
>   
>   	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
>   		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
> -		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
> +		WARN_ON_ONCE(gpc->gpa == GPA_INVALID))
>   		return false;
>   
>   	rvi = vmx_get_rvi();
>   
> -	vapic_page = vmx->nested.virtual_apic_map.hva;
> -	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
> +	read_lock(&gpc->lock);
> +	if (kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE))
> +		vppr = *((u32 *)(gpc->khva + APIC_PROCPRI));
> +	else
> +		vppr = 0xff;
> +	read_unlock(&gpc->lock);
>   
>   	return ((rvi & 0xf0) > (vppr & 0xf0));
>   }
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 4df2ac24ffc1..8364e7fc92a0 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -195,7 +195,7 @@ struct nested_vmx {
>   	 * pointers, so we must keep them pinned while L2 runs.
>   	 */
>   	struct page *apic_access_page;
> -	struct kvm_host_map virtual_apic_map;
> +	struct gfn_to_pfn_cache virtual_apic_cache;
>   	struct kvm_host_map pi_desc_map;
>   
>   	struct kvm_host_map msr_bitmap_map;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0a689bb62e9e..a879e4d08758 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9735,6 +9735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   
>   		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
>   			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> +		if (kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))
> +			; /* Nothing to do. It just wanted to wake us */
>   	}
>   
>   	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
> @@ -9781,6 +9783,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>   	local_irq_disable();
>   	vcpu->mode = IN_GUEST_MODE;
>   
> +	/*
> +	 * If the guest requires direct access to mapped L1 pages, check
> +	 * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
> +	 * to go and revalidate them, if necessary.
> +	 */
> +	if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
> +		kvm_x86_ops.nested_ops->check_guest_maps(vcpu);
> +
>   	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
>   
>   	/*
> 


^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 17:42                                                       ` Paolo Bonzini
@ 2021-11-16 17:57                                                         ` David Woodhouse
  2021-11-16 18:46                                                           ` Paolo Bonzini
  0 siblings, 1 reply; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 17:57 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

[-- Attachment #1: Type: text/plain, Size: 4622 bytes --]

On Tue, 2021-11-16 at 18:42 +0100, Paolo Bonzini wrote:
> On 11/16/21 17:06, David Woodhouse wrote:
> > On Tue, 2021-11-16 at 16:49 +0100, Paolo Bonzini wrote:
> > > On 11/16/21 16:09, David Woodhouse wrote:
> > > > On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
> > > > > This should not be needed, should it?  As long as the gfn-to-pfn
> > > > > cache's vcpu field is handled properly, the request will just cause
> > > > > the vCPU not to enter.
> > > > 
> > > > If the MMU mappings never change, the request never happens. But the
> > > > memslots *can* change, so it does need to be revalidated each time
> > > > through I think?
> > > 
> > > That needs to be done on KVM_SET_USER_MEMORY_REGION, using the same
> > > request (or even the same list walking code) as the MMU notifiers.
> > 
> > Hm....  kvm_arch_memslots_updated() is already kicking every vCPU after
> > the update, and although that was asynchronous it was actually OK
> > because unlike in the MMU notifier case, that page wasn't actually
> > going away — and if that HVA *did* subsequently go away, our HVA-based
> > notifier check would still catch that and kill it synchronously.
> 
> Right, so it only needs to change the kvm_vcpu_kick into a 
> kvm_make_all_cpus_request without KVM_WAIT.

Yeah, I think that works.

> > > > Hm, in my head that was never going to *change* for a given gpc; it
> > > > *belongs* to that vCPU for ever (and was even part of vmx->nested. for
> > > > that vCPU, to replace e.g. vmx->nested.pi_desc_map).
> > > 
> > > Ah okay, I thought it would be set in nested vmentry and cleared in
> > > nested vmexit.
> > 
> > I don't think it needs to be proactively cleared; we just don't
> > *refresh* it until we need it again.
> 
> True, but if it's cleared the vCPU won't be kicked, which is nice.

The vCPU will only be kicked once when it becomes invalid anyway. It's
a trade-off. Either we leave it valid for the next time that L2 vCPU is
entered, or we actively clear it. Not sure I lose much sleep either
way?

> > If we *know* the GPA and size haven't changed, and if we know that
> > gpc->valid becoming false would have been handled differently, then we
> > could optimise that whole thing away quite effectively to a single
> > check on ->generations?
> 
> I wonder if we need a per-gpc memslot generation...  Can it be global?

Theoretically, maybe. It's kind of mathematically equivalent to the
highest value of each gpc's memslot generation. And any gpc which *isn't* at that
maximum is by definition invalid.

But I'm not sure I see how to implement it without actively going and
clearing the 'valid' bit on all GPCs when it gets bumped... which was
your previous suggestion of basically running the same code as in the
MMU notifier?
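
i.e. presumably just the same list walk as the MMU notifier, but
unconditional (a sketch, using the gpc_list from the earlier patch):

	spin_lock(&kvm->gpc_lock);
	list_for_each_entry(gpc, &kvm->gpc_list, list) {
		write_lock_irq(&gpc->lock);
		gpc->valid = false;
		write_unlock_irq(&gpc->lock);
	}
	spin_unlock(&kvm->gpc_lock);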




> > This one actually compiles. Not sure we have any test cases that will
> > actually exercise it though, do we?
> 
> I'll try to spend some time writing testcases.
> 
> > +		read_lock(&gpc->lock);
> > +		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
> > +			read_unlock(&gpc->lock);
> >   			goto mmio_needed;
> > +		}
> > +
> > +		vapic_page = gpc->khva;
> 
> If we know this gpc is of the synchronous kind, I think we can skip the 
> read_lock/read_unlock here?!?

Er... this one was OUTSIDE_GUEST_MODE and is the atomic kind, which
means it needs to hold the lock for the duration of the access in order
to prevent (preemption and) racing with the invalidate?

It's the IN_GUEST_MODE one (in my check_guest_maps()) where we might
get away without the lock, perhaps?
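
To spell out what I mean by holding it for the duration, the shape for
the atomic kind outside guest mode is basically this (same names as in
the hunk above; the refresh path is elided):

	read_lock(&gpc->lock);
	if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
		read_unlock(&gpc->lock);
		goto mmio_needed;	/* or refresh and retry, outside the lock */
	}

	vapic_page = gpc->khva;
	/* ... everything that touches vapic_page stays under the lock ... */

	read_unlock(&gpc->lock);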

> 
> >   		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
> >   			vapic_page, &max_irr);
> > @@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
> >   			status |= (u8)max_irr;
> >   			vmcs_write16(GUEST_INTR_STATUS, status);
> >   		}
> > +		read_unlock(&gpc->lock);
> >   	}
> >   

I just realised that the mark_page_dirty() on invalidation and when
the irqfd workqueue refreshes the gpc might fall foul of the same
dirty_ring problem that I belatedly just spotted with the Xen shinfo
clock write. I'll fix it up to *always* require a vcpu (to be
associated with the writes), and reinstate the guest_uses_pa flag since
that can no longer be implicit in (vcpu!=NULL).

I may leave the actual task of fixing nesting to you, if that's OK, as
long as we consider the new gfn_to_pfn_cache sufficient to address the
problem? I think it's mostly down to how we *use* it now, rather than
the fundamental design of the cache itself?
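
Concretely, I'm thinking of something like this for the init call (the
exact prototype is a sketch, not what will finally land):

/*
 * Sketch only: @vcpu becomes mandatory so the mark_page_dirty() calls
 * made on invalidation / refresh can be attributed to that vCPU's
 * dirty ring, and @guest_uses_pa is explicit again instead of being
 * inferred from (vcpu != NULL).
 */
int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc,
			      struct kvm_vcpu *vcpu, bool guest_uses_pa,
			      bool kernel_map, gpa_t gpa, unsigned long len,
			      bool dirty);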

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 6/7] KVM: powerpc: Use Makefile.kvm for common files
  2021-11-16 11:50                                         ` [PATCH 6/7] KVM: powerpc: " David Woodhouse
@ 2021-11-16 18:43                                           ` Sean Christopherson
  2021-11-16 19:13                                             ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Sean Christopherson @ 2021-11-16 18:43 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Paolo Bonzini, kvm, Boris Ostrovsky, Joao Martins,
	jmattson @ google . com, wanpengli @ tencent . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

On Tue, Nov 16, 2021, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> It's all fairly baroque, but in the end I don't think there's any
> reason for $(KVM)/irqchip.o to have been handled differently; they
> all land in $(kvm-y) anyway, regardless of whether they get there
> via $(common-objs-y) or the CPU-specific object lists.
> 
> The generic Makefile.kvm uses HAVE_KVM_IRQCHIP for irqchip.o instead of
> HAVE_KVM_IRQ_ROUTING. That change is fine (and arguably correct) because
> they are both set together for KVM_MPIC, or neither is set.

Nope.

  Symbol: HAVE_KVM_IRQCHIP [=y]
  Type  : bool
  Defined at virt/kvm/Kconfig:7
  Selected by [m]:
    - KVM_XICS [=y] && VIRTUALIZATION [=y] && KVM_BOOK3S_64 [=m] && !KVM_MPIC [=n]
  Selected by [n]:
    - KVM_MPIC [=n] && VIRTUALIZATION [=y] && KVM [=y] && E500 [=n]

leads to this and a whole pile of other errors

arch/powerpc/kvm/../../../virt/kvm/irqchip.c: In function ‘kvm_irq_map_gsi’:
arch/powerpc/kvm/../../../virt/kvm/irqchip.c:31:35: error: invalid use of undefined type ‘struct kvm_irq_routing_table’
   31 |         if (irq_rt && gsi < irq_rt->nr_rt_entries) {
      |                                   ^~


Side topic, please don't post a new version/series in-reply-to a different series.
b4 also gets confused in this case, e.g. it tried to grab the original patch.  b4
has also made me really lazy, heaven forbid I actually had to manually grab these
from mutt :-)

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 17:57                                                         ` David Woodhouse
@ 2021-11-16 18:46                                                           ` Paolo Bonzini
  2021-11-16 19:34                                                             ` David Woodhouse
  0 siblings, 1 reply; 70+ messages in thread
From: Paolo Bonzini @ 2021-11-16 18:46 UTC (permalink / raw)
  To: David Woodhouse, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed

On 11/16/21 18:57, David Woodhouse wrote:
>>> +		read_lock(&gpc->lock);
>>> +		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
>>> +			read_unlock(&gpc->lock);
>>>    			goto mmio_needed;
>>> +		}
>>> +
>>> +		vapic_page = gpc->khva;
>> If we know this gpc is of the synchronous kind, I think we can skip the
>> read_lock/read_unlock here?!?
> Er... this one was OUTSIDE_GUEST_MODE and is the atomic kind, which
> means it needs to hold the lock for the duration of the access in order
> to prevent (preemption and) racing with the invalidate?
> 
> It's the IN_GUEST_MODE one (in my check_guest_maps()) where we might
> get away without the lock, perhaps?

Ah, this is check_nested_events which is mostly IN_GUEST_MODE but not 
always (and that sucks for other reasons).  I'll think a bit more about 
it when I actually do the work.

>>>    		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
>>>    			vapic_page, &max_irr);
>>> @@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>>>    			status |= (u8)max_irr;
>>>    			vmcs_write16(GUEST_INTR_STATUS, status);
>>>    		}
>>> +		read_unlock(&gpc->lock);
>>>    	}
>>>    
> I just realised that the mark_page_dirty() on invalidation and when
> the irqfd workqueue refreshes the gpc might fall foul of the same
> dirty_ring problem that I belatedly just spotted with the Xen shinfo
> clock write. I'll fix it up to *always* require a vcpu (to be
> associated with the writes), and reinstate the guest_uses_pa flag since
> that can no longer be implicit in (vcpu!=NULL).

Okay.

> I may leave the actual task of fixing nesting to you, if that's OK, as
> long as we consider the new gfn_to_pfn_cache sufficient to address the
> problem? I think it's mostly down to how we *use* it now, rather than
> the fundamental design of the cache itself?

Yes, I agree.  Just stick whatever you have for nesting as an extra 
patch at the end, and I'll take it from there.

Paolo

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 6/7] KVM: powerpc: Use Makefile.kvm for common files
  2021-11-16 18:43                                           ` Sean Christopherson
@ 2021-11-16 19:13                                             ` David Woodhouse
  0 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 19:13 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, kvm, Boris Ostrovsky, Joao Martins,
	jmattson @ google . com, wanpengli @ tencent . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, Christian Borntraeger,
	kvmarm, linux-arm-kernel, linux-mips, linuxppc-dev, kvm-riscv,
	linux-s390

[-- Attachment #1: Type: text/plain, Size: 2476 bytes --]

On Tue, 2021-11-16 at 18:43 +0000, Sean Christopherson wrote:
> On Tue, Nov 16, 2021, David Woodhouse wrote:
> > From: David Woodhouse <dwmw@amazon.co.uk>
> > 
> > It's all fairly baroque, but in the end I don't think there's any
> > reason for $(KVM)/irqchip.o to have been handled differently; they
> > all land in $(kvm-y) anyway, regardless of whether they get there
> > via $(common-objs-y) or the CPU-specific object lists.
> > 
> > The generic Makefile.kvm uses HAVE_KVM_IRQCHIP for irqchip.o instead of
> > HAVE_KVM_IRQ_ROUTING. That change is fine (and arguably correct) because
> > they are both set together for KVM_MPIC, or neither is set.
> 
> Nope.
> 
>   Symbol: HAVE_KVM_IRQCHIP [=y]
>   Type  : bool
>   Defined at virt/kvm/Kconfig:7
>   Selected by [m]:
>     - KVM_XICS [=y] && VIRTUALIZATION [=y] && KVM_BOOK3S_64 [=m] && !KVM_MPIC [=n]
>   Selected by [n]:
>     - KVM_MPIC [=n] && VIRTUALIZATION [=y] && KVM [=y] && E500 [=n]
> 
> leads to this and a whole pile of other errors
> 
> arch/powerpc/kvm/../../../virt/kvm/irqchip.c: In function ‘kvm_irq_map_gsi’:
> arch/powerpc/kvm/../../../virt/kvm/irqchip.c:31:35: error: invalid use of undefined type ‘struct kvm_irq_routing_table’
>    31 |         if (irq_rt && gsi < irq_rt->nr_rt_entries) {
>       |                                   ^~
> 

Hm, right; struct kvm_irq_routing_table is only defined when
CONFIG_HAVE_KVM_IRQ_ROUTING is set, so perhaps it should have been like
this then (incremental):

+++ b/virt/kvm/Makefile.kvm
@@ -9,6 +9,6 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
 kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-$(CONFIG_HAVE_KVM_IRQCHIP) += $(KVM)/irqchip.o
+kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
 kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
 kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o


> Side topic, please don't post a new version/series in-reply-to a different series.
> b4 also gets confused in this case, e.g. it tried to grab the original patch.  b4
> has also made me really lazy, heaven forbid I actually had to manually grab these
> from mutt :-)

Sorry ;)

I think that one might even be a new series in reply to what was
already a second series on top of what I was *actually* trying to do
when I first started shaving this yak. Or maybe what I was originally
trying to implement has already been lost in the noise :)

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [RFC PATCH 0/11] Rework gfn_to_pfn_cache
  2021-11-16 18:46                                                           ` Paolo Bonzini
@ 2021-11-16 19:34                                                             ` David Woodhouse
  0 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-16 19:34 UTC (permalink / raw)
  To: Paolo Bonzini, Sean Christopherson
  Cc: kvm, Boris Ostrovsky, Joao Martins, jmattson, wanpengli,
	vkuznets, mtosatti, joro, karahmed



On 16 November 2021 18:46:03 GMT, Paolo Bonzini <bonzini@gnu.org> wrote:
>On 11/16/21 18:57, David Woodhouse wrote:
>>>> +		read_lock(&gpc->lock);
>>>> +		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
>>>> +			read_unlock(&gpc->lock);
>>>>    			goto mmio_needed;
>>>> +		}
>>>> +
>>>> +		vapic_page = gpc->khva;
>>> If we know this gpc is of the synchronous kind, I think we can skip the
>>> read_lock/read_unlock here?!?
>> Er... this one was OUTSIDE_GUEST_MODE and is the atomic kind, which
>> means it needs to hold the lock for the duration of the access in order
>> to prevent (preemption and) racing with the invalidate?
>> 
>> It's the IN_GUEST_MODE one (in my check_guest_maps()) where we might
>> get away without the lock, perhaps?
>
>Ah, this is check_nested_events which is mostly IN_GUEST_MODE but not 
>always (and that sucks for other reasons).  I'll think a bit more about 
>it when I actually do the work.
>
>>>>    		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
>>>>    			vapic_page, &max_irr);
>>>> @@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>>>>    			status |= (u8)max_irr;
>>>>    			vmcs_write16(GUEST_INTR_STATUS, status);
>>>>    		}
>>>> +		read_unlock(&gpc->lock);
>>>>    	}
>>>>    
>> I just realised that the mark_page_dirty() on invalidation and when
>> the irqfd workqueue refreshes the gpc might fall foul of the same
>> dirty_ring problem that I belatedly just spotted with the Xen shinfo
>> clock write. I'll fix it up to *always* require a vcpu (to be
>> associated with the writes), and reinstate the guest_uses_pa flag since
>> that can no longer be implicit in (vcpu!=NULL).
>
>Okay.
>
>> I may leave the actual task of fixing nesting to you, if that's OK, as
>> long as we consider the new gfn_to_pfn_cache sufficient to address the
>> problem? I think it's mostly down to how we *use* it now, rather than
>> the fundamental design of the cache itself?
>
>Yes, I agree.  Just stick whatever you have for nesting as an extra 
>patch at the end, and I'll take it from there.


Will do; thanks. I believe you said you'd already merged a batch
including killing the first three of the nesting kvm_vcpu_map() users,
so I'll wait for those to show up in a tree I can pull from and then
post a single series with the unified Makefile.kvm bits followed by
pfncache.c and the Xen event channel support using it. And as
requested, ending with the nested pfncache one I posted a couple of
messages ago.

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files
  2021-11-16 11:50                                         ` [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files David Woodhouse
@ 2021-11-17  7:29                                           ` Christian Borntraeger
  0 siblings, 0 replies; 70+ messages in thread
From: Christian Borntraeger @ 2021-11-17  7:29 UTC (permalink / raw)
  To: David Woodhouse, Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed, Marc Zyngier, James Morse,
	Alexandru Elisei, Suzuki K Poulose, Catalin Marinas, Will Deacon,
	Huacai Chen, Aleksandar Markovic, Michael Ellerman,
	Benjamin Herrenschmidt, Anup Patel, kvmarm, linux-arm-kernel,
	linux-mips, linuxppc-dev, kvm-riscv, linux-s390



Am 16.11.21 um 12:50 schrieb David Woodhouse:
> From: David Woodhouse <dwmw@amazon.co.uk>
> 
> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>

Looks good.
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>

> ---
>   arch/s390/kvm/Makefile | 6 ++----
>   1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> index b3aaadc60ead..e4f50453cf7f 100644
> --- a/arch/s390/kvm/Makefile
> +++ b/arch/s390/kvm/Makefile
> @@ -3,13 +3,11 @@
>   #
>   # Copyright IBM Corp. 2008
>   
> -KVM := ../../../virt/kvm
> -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o  $(KVM)/async_pf.o \
> -	      $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
> +include $(srctree)/virt/kvm/Makefile.kvm
>   
>   ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
>   
> -kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
> +kvm-objs := kvm-s390.o intercept.o interrupt.o priv.o sigp.o
>   kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o
>   
>   obj-$(CONFIG_KVM) += kvm.o
> 

^ permalink raw reply	[flat|nested] 70+ messages in thread

* Re: [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache
  2021-11-16 10:21                                   ` Paolo Bonzini
@ 2021-11-17 17:18                                     ` David Woodhouse
  0 siblings, 0 replies; 70+ messages in thread
From: David Woodhouse @ 2021-11-17 17:18 UTC (permalink / raw)
  To: Paolo Bonzini, kvm
  Cc: Boris Ostrovsky, Joao Martins, jmattson @ google . com,
	wanpengli @ tencent . com, seanjc @ google . com,
	vkuznets @ redhat . com, mtosatti @ redhat . com,
	joro @ 8bytes . org, karahmed

[-- Attachment #1: Type: text/plain, Size: 2044 bytes --]

On Tue, 2021-11-16 at 11:21 +0100, Paolo Bonzini wrote:
> 
> Queued patches 2-8 as well.

Thanks. These finally made it out to git.kernel.org and I see one of
them has been afflicted by mojibake.

commit d79f9da821fda5cbc5383a2d9b16a12c962cb772
Author: David Woodhouse <dwmw@amazon.co.uk>
Date:   Mon Nov 15 16:50:27 2021 +0000

    KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache
    
    In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time /
    preempted status") I removed the only user of these functions because
    it was basically impossible to use them safely.
    
    There are two stages to the GFN → PFN mapping; first through the KVM
    memslots to a userspace HVA and then through the page tables to
    translate that HVA to an underlying PFN. Invalidations of the former
    were being handled correctly, but no attempt was made to use the MMU
    notifiers to invalidate the cache when the HVA→GFN mapping changed.
    
    As a prelude to reinventing the gfn_to_pfn_cache with more usable
    semantics, rip it out entirely and untangle the implementation of
    the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it.
    
    All current users of kvm_vcpu_map() also look broken right now, and
    will be dealt with separately. They broadly fall into two classes:
    
     • Those which map, access the data and immediately unmap. This is
       mostly gratuitous and could just as well use the existing user
       HVA, and could probably benefit from a gfn_to_hva_cache as they
       do so.
    
     • Those which keep the mapping around for a longer time, perhaps
       even using the PFN directly from the guest. These will need to
       be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map()
       can be removed too.
    
    Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
    Message-Id: <20211115165030.7422-8-dwmw2@infradead.org>
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]

^ permalink raw reply	[flat|nested] 70+ messages in thread

end of thread, other threads:[~2021-11-17 17:19 UTC | newest]

Thread overview: 70+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-01 14:09 [PATCH] KVM: x86: Fix recording of guest steal time / preempted status David Woodhouse
2021-11-02 16:38 ` [PATCH v2] " David Woodhouse
2021-11-02 17:01   ` Paolo Bonzini
2021-11-02 17:11     ` David Woodhouse
2021-11-02 17:19       ` Paolo Bonzini
2021-11-02 17:26         ` David Woodhouse
2021-11-02 17:36         ` [PATCH v3] " David Woodhouse
2021-11-11 13:23           ` Paolo Bonzini
2021-11-12  8:28             ` David Woodhouse
2021-11-12  9:31               ` Paolo Bonzini
2021-11-12  9:54                 ` David Woodhouse
2021-11-12 10:49                   ` Paolo Bonzini
2021-11-12 11:29                     ` David Woodhouse
2021-11-12 12:27                       ` Paolo Bonzini
2021-11-12 13:28                         ` David Woodhouse
2021-11-12 14:56                           ` Paolo Bonzini
2021-11-12 15:27                             ` David Woodhouse
2021-11-15 16:47                             ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
2021-11-15 16:50                               ` [PATCH 01/11] KVM: x86: Fix steal time asm constraints in 32-bit mode David Woodhouse
2021-11-15 16:50                                 ` [PATCH 02/11] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO David Woodhouse
2021-11-15 16:50                                 ` [PATCH 03/11] KVM: selftests: Add event channel upcall support to xen_shinfo_test David Woodhouse
2021-11-15 16:50                                 ` [PATCH 04/11] KVM: x86/xen: Use sizeof_field() instead of open-coding it David Woodhouse
2021-11-15 16:50                                 ` [PATCH 05/11] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12 David Woodhouse
2021-11-15 16:50                                 ` [PATCH 06/11] KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check David Woodhouse
2021-11-15 16:50                                 ` [PATCH 07/11] KVM: nVMX: Use a gfn_to_hva_cache for vmptrld David Woodhouse
2021-11-15 16:50                                 ` [PATCH 08/11] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache David Woodhouse
2021-11-16 10:21                                   ` Paolo Bonzini
2021-11-17 17:18                                     ` David Woodhouse
2021-11-15 16:50                                 ` [PATCH 09/11] KVM: Reinstate gfn_to_pfn_cache with invalidation support David Woodhouse
2021-11-15 16:50                                 ` [PATCH 10/11] KVM: x86/xen: Maintain valid mapping of Xen shared_info page David Woodhouse
2021-11-15 16:50                                 ` [PATCH 11/11] KVM: x86/xen: Add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery David Woodhouse
2021-11-15 17:02                                   ` David Woodhouse
2021-11-15 18:49                                   ` Paolo Bonzini
2021-11-15 18:55                                     ` David Woodhouse
2021-11-15 18:50                               ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache Paolo Bonzini
2021-11-15 19:11                                 ` David Woodhouse
2021-11-15 19:26                                   ` Paolo Bonzini
2021-11-15 22:59                                     ` Sean Christopherson
2021-11-15 23:22                                       ` David Woodhouse
2021-11-16 13:17                                         ` David Woodhouse
2021-11-16 14:11                                           ` Paolo Bonzini
2021-11-16 14:25                                             ` David Woodhouse
2021-11-16 14:57                                               ` Paolo Bonzini
2021-11-16 15:09                                                 ` David Woodhouse
2021-11-16 15:49                                                   ` Paolo Bonzini
2021-11-16 16:06                                                     ` David Woodhouse
2021-11-16 17:42                                                       ` Paolo Bonzini
2021-11-16 17:57                                                         ` David Woodhouse
2021-11-16 18:46                                                           ` Paolo Bonzini
2021-11-16 19:34                                                             ` David Woodhouse
2021-11-15 23:24                                       ` David Woodhouse
2021-11-16 11:50                                     ` [PATCH 0/7] KVM: Add Makefile.kvm for common files David Woodhouse
2021-11-16 11:50                                       ` [PATCH 1/7] KVM: Introduce CONFIG_HAVE_KVM_DIRTY_RING David Woodhouse
2021-11-16 11:50                                         ` [PATCH 2/7] KVM: Add Makefile.kvm for common files, use it for x86 David Woodhouse
2021-11-16 11:50                                         ` [PATCH 3/7] KVM: s390: Use Makefile.kvm for common files David Woodhouse
2021-11-17  7:29                                           ` Christian Borntraeger
2021-11-16 11:50                                         ` [PATCH 4/7] KVM: mips: " David Woodhouse
2021-11-16 11:50                                         ` [PATCH 5/7] KVM: RISC-V: " David Woodhouse
2021-11-16 11:50                                         ` [PATCH 6/7] KVM: powerpc: " David Woodhouse
2021-11-16 18:43                                           ` Sean Christopherson
2021-11-16 19:13                                             ` David Woodhouse
2021-11-16 11:50                                         ` [PATCH 7/7] KVM: arm64: " David Woodhouse
2021-11-15 21:38                                 ` [RFC PATCH 0/11] Rework gfn_to_pfn_cache David Woodhouse
2021-11-12 19:44                 ` [PATCH v3] KVM: x86: Fix recording of guest steal time / preempted status David Woodhouse
2021-11-03  9:47         ` [PATCH v2] " David Woodhouse
2021-11-03 12:35           ` Paolo Bonzini
2021-11-03 12:56             ` David Woodhouse
2021-11-03 13:05               ` Paolo Bonzini
2021-11-03 13:23                 ` David Woodhouse
2021-11-03 13:34                 ` David Woodhouse
