Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU

From: David Woodhouse <dwmw2@infradead.org>
To: Paolo Bonzini <pbonzini@redhat.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"Raslan, KarimAllah" <karahmed@amazon.com>,
	Joao Martins <joao.m.martins@oracle.com>,
	Ankur Arora <ankur.a.arora@oracle.com>
Cc: "jmattson@google.com" <jmattson@google.com>,
	"wanpengli@tencent.com" <wanpengli@tencent.com>,
	"seanjc@google.com" <seanjc@google.com>,
	"vkuznets@redhat.com" <vkuznets@redhat.com>,
	"mtosatti@redhat.com" <mtosatti@redhat.com>,
	"joro@8bytes.org" <joro@8bytes.org>
Subject: Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU
Date: Mon, 01 Nov 2021 08:16:20 +0000	[thread overview]
Message-ID: <8ba4d8cf27f03eb13841ebb9039fc4ff15fa1b50.camel@infradead.org> (raw)
In-Reply-To: <fb6f1f4bc7d3c5c470587cb8fde5b59f1efd4b0f.camel@infradead.org>

[-- Attachment #1: Type: text/plain, Size: 9840 bytes --]

On Sun, 2021-10-31 at 08:31 +0000, David Woodhouse wrote:
> On Sun, 2021-10-31 at 07:52 +0100, Paolo Bonzini wrote:
> > On 30/10/21 09:58, David Woodhouse wrote:
> > > > Absolutely!  The fixed version of kvm_map_gfn should not do any
> > > > map/unmap, it should do it eagerly on MMU notifier operations.
> > 
> > > Staring at this some more... what*currently*  protects a
> > > gfn_to_pfn_cache when the page tables change — either because userspace
> > > either mmaps something else over the same HVA, or the underlying page
> > > is just swapped out and restored?
> > 
> > kvm_cache_gfn_to_pfn calls gfn_to_pfn_memslot, which pins the page.
> 
> Empirically, this breaks the test case in the series I sent out last
> night, because the kernel is looking at the *wrong* shared info page
> after userspace maps a new one at that HVA...

The fun part now looks like this; if this is deemed sane I'll work up
something that fixes the steal time thing in a similar way. And maybe
turn it into a generic 'gfn_to_kva_cache'?

The full series, with the final patch showing how it gets used, is at
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn

From a329c575aadc0b65bb7ebd97eaf9f9374508cea9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 30 Oct 2021 19:53:23 +0100
Subject: [PATCH 5/6] KVM: x86/xen: Maintain valid mapping of Xen shared_info
 page

In order to allow for event channel delivery, we would like to have a
kernel mapping of the shared_info page which can be accessed in atomic
context in the common case.

The gfn_to_pfn_cache only automatically handles invalidation when the
KVM memslots change; it doesn't handle a change in the userspace HVA
to host PFN mappings. So hook into the MMU notifiers to invalidate the
shared_info pointer on demand.

The shared_info can be accessed while holding the shinfo_lock, with a
slow path which takes the kvm->lock mutex to refresh the mapping.
I'd like to use RCU for the invalidation but I don't think we can
always sleep in the invalidate_range notifier. Having a true kernel
mapping of the page means that our access to it can be atomic anyway,
so holding a spinlock is OK.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/kvm_host.h |  4 ++
 arch/x86/kvm/mmu/mmu.c          | 23 ++++++++++++
 arch/x86/kvm/xen.c              | 65 +++++++++++++++++++++++++++------
 include/linux/kvm_host.h        | 26 -------------
 include/linux/kvm_types.h       | 27 ++++++++++++++
 5 files changed, 107 insertions(+), 38 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 13f64654dfff..bb4868c3c06b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1017,6 +1017,10 @@ struct kvm_xen {
 	bool long_mode;
 	u8 upcall_vector;
 	gfn_t shinfo_gfn;
+	rwlock_t shinfo_lock;
+	void *shared_info;
+	struct kvm_host_map shinfo_map;
+	struct gfn_to_pfn_cache shinfo_cache;
 };
 
 enum kvm_irqchip_mode {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0cc58901bf7a..429a4860d67a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
 #include "kvm_emulate.h"
 #include "cpuid.h"
 #include "spte.h"
+#include "xen.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -1588,6 +1589,28 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool flush = false;
 
+	if (static_branch_unlikely(&kvm_xen_enabled.key)) {
+		write_lock(&kvm->arch.xen.shinfo_lock);
+
+		if (kvm->arch.xen.shared_info &&
+		    kvm->arch.xen.shinfo_gfn >= range->start &&
+		    kvm->arch.xen.shinfo_cache.gfn < range->end) {
+			/*
+			 * If kvm_xen_shared_info_init() had *finished* mapping the
+			 * page and assigned the pointer for real, then mark the page
+			 * dirty now instead of via the eventual cache teardown.
+			 */
+			if (kvm->arch.xen.shared_info != KVM_UNMAPPED_PAGE) {
+				kvm_set_pfn_dirty(kvm->arch.xen.shinfo_cache.pfn);
+				kvm->arch.xen.shinfo_cache.dirty = false;
+			}
+
+			kvm->arch.xen.shared_info = NULL;
+		}
+
+		write_unlock(&kvm->arch.xen.shinfo_lock);
+	}
+
 	if (kvm_memslots_have_rmaps(kvm))
 		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 565da9c3853b..9d143bc7d769 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -21,18 +21,59 @@
 
 DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
 
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static void kvm_xen_shared_info_unmap(struct kvm *kvm)
+{
+	bool was_valid = false;
+
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	if (kvm->arch.xen.shared_info)
+		was_valid = true;
+	kvm->arch.xen.shared_info = NULL;
+	kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	if (kvm_vcpu_mapped(&kvm->arch.xen.shinfo_map)) {
+		kvm_unmap_gfn(kvm, &kvm->arch.xen.shinfo_map,
+			      &kvm->arch.xen.shinfo_cache, was_valid, false);
+
+		/* If the MMU notifier invalidated it, the gfn_to_pfn_cache
+		 * may be invalid. Force it to notice */
+		if (!was_valid)
+			kvm->arch.xen.shinfo_cache.generation = -1;
+	}
+}
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn, bool update_clock)
 {
 	gpa_t gpa = gfn_to_gpa(gfn);
 	int wc_ofs, sec_hi_ofs;
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
-		ret = -EFAULT;
+	kvm_xen_shared_info_unmap(kvm);
+
+	if (gfn == GPA_INVALID)
 		goto out;
-	}
+
+	/* Let the MMU notifier know that we are in the process of mapping it */
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	kvm->arch.xen.shared_info = KVM_UNMAPPED_PAGE;
 	kvm->arch.xen.shinfo_gfn = gfn;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	ret = kvm_map_gfn(kvm, gfn, &kvm->arch.xen.shinfo_map,
+			  &kvm->arch.xen.shinfo_cache, false);
+	if (ret)
+		goto out;
+
+	write_lock(&kvm->arch.xen.shinfo_lock);
+	/* Unless the MMU notifier already invalidated it */
+	if (kvm->arch.xen.shared_info == KVM_UNMAPPED_PAGE)
+		kvm->arch.xen.shared_info = kvm->arch.xen.shinfo_map.hva;
+	write_unlock(&kvm->arch.xen.shinfo_lock);
+
+	if (!update_clock)
+		goto out;
 
 	/* Paranoia checks on the 32-bit struct layout */
 	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -260,15 +301,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
 		break;
 
 	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
-		if (data->u.shared_info.gfn == GPA_INVALID) {
-			kvm->arch.xen.shinfo_gfn = GPA_INVALID;
-			r = 0;
-			break;
-		}
-		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+		r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn, true);
 		break;
 
-
 	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
 		if (data->u.vector && data->u.vector < 0x10)
 			r = -EINVAL;
@@ -661,11 +696,17 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
 
 void kvm_xen_init_vm(struct kvm *kvm)
 {
-	kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+	rwlock_init(&kvm->arch.xen.shinfo_lock);
 }
 
 void kvm_xen_destroy_vm(struct kvm *kvm)
 {
+	struct gfn_to_pfn_cache *cache = &kvm->arch.xen.shinfo_cache;
+
+	kvm_xen_shared_info_unmap(kvm);
+
+	kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
 	if (kvm->arch.xen_hvm_config.msr)
 		static_branch_slow_dec_deferred(&kvm_xen_enabled);
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 749cdc77fc4e..f0012d128aa5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -251,32 +251,6 @@ enum {
 	READING_SHADOW_PAGE_TABLES,
 };
 
-#define KVM_UNMAPPED_PAGE	((void *) 0x500 + POISON_POINTER_DELTA)
-
-struct kvm_host_map {
-	/*
-	 * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
-	 * a 'struct page' for it. When using mem= kernel parameter some memory
-	 * can be used as guest memory but they are not managed by host
-	 * kernel).
-	 * If 'pfn' is not managed by the host kernel, this field is
-	 * initialized to KVM_UNMAPPED_PAGE.
-	 */
-	struct page *page;
-	void *hva;
-	kvm_pfn_t pfn;
-	kvm_pfn_t gfn;
-};
-
-/*
- * Used to check if the mapping is valid or not. Never use 'kvm_host_map'
- * directly to check for that.
- */
-static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
-{
-	return !!map->hva;
-}
-
 static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
 {
 	return single_task_running() && !need_resched() && ktime_before(cur, stop);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2237abb93ccd..2092f4ca156b 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -60,6 +60,33 @@ struct gfn_to_pfn_cache {
 	bool dirty;
 };
 
+#define KVM_UNMAPPED_PAGE	((void *) 0x500 + POISON_POINTER_DELTA)
+
+struct kvm_host_map {
+	/*
+	 * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
+	 * a 'struct page' for it. When using mem= kernel parameter some memory
+	 * can be used as guest memory but they are not managed by host
+	 * kernel).
+	 * If 'pfn' is not managed by the host kernel, this field is
+	 * initialized to KVM_UNMAPPED_PAGE.
+	 */
+	struct page *page;
+	void *hva;
+	kvm_pfn_t pfn;
+	kvm_pfn_t gfn;
+};
+
+/*
+ * Used to check if the mapping is valid or not. Never use 'kvm_host_map'
+ * directly to check for that.
+ */
+static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
+{
+	return !!map->hva;
+}
+
+
 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
 /*
  * Memory caches are used to preallocate memory ahead of various MMU flows,
-- 
2.31.1


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5174 bytes --]